paperfit-cli 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/commands/adjust-length.md +21 -0
- package/.claude/commands/check-visual.md +27 -0
- package/.claude/commands/fix-layout.md +31 -0
- package/.claude/commands/migrate-template.md +23 -0
- package/.claude/commands/repair-table.md +21 -0
- package/.claude/commands/show-status.md +32 -0
- package/.claude-plugin/README.md +77 -0
- package/.claude-plugin/marketplace.json +41 -0
- package/.claude-plugin/plugin.json +39 -0
- package/CLAUDE.md +266 -0
- package/CONTRIBUTING.md +131 -0
- package/LICENSE +21 -0
- package/README.md +164 -0
- package/agents/code-surgeon-agent.md +214 -0
- package/agents/layout-detective-agent.md +229 -0
- package/agents/orchestrator-agent.md +254 -0
- package/agents/quality-gatekeeper-agent.md +270 -0
- package/agents/rule-engine-agent.md +224 -0
- package/agents/semantic-polish-agent.md +250 -0
- package/bin/paperfit.js +176 -0
- package/config/agent_roles.yaml +56 -0
- package/config/layout_rules.yaml +54 -0
- package/config/templates.yaml +241 -0
- package/config/vto_taxonomy.yaml +489 -0
- package/config/writing_rules.yaml +64 -0
- package/install.sh +30 -0
- package/package.json +52 -0
- package/requirements.txt +5 -0
- package/scripts/benchmark_runner.py +629 -0
- package/scripts/compile.sh +244 -0
- package/scripts/config_validator.py +339 -0
- package/scripts/cv_detector.py +600 -0
- package/scripts/evidence_collector.py +167 -0
- package/scripts/float_fixers.py +861 -0
- package/scripts/inject_defects.py +549 -0
- package/scripts/install-claude-global.js +148 -0
- package/scripts/install.js +66 -0
- package/scripts/install.sh +106 -0
- package/scripts/overflow_fixers.py +656 -0
- package/scripts/package-for-opensource.sh +138 -0
- package/scripts/parse_log.py +260 -0
- package/scripts/postinstall.js +38 -0
- package/scripts/pre_tool_use.py +265 -0
- package/scripts/render_pages.py +244 -0
- package/scripts/session_logger.py +329 -0
- package/scripts/space_util_fixers.py +773 -0
- package/scripts/state_manager.py +352 -0
- package/scripts/test_commands.py +187 -0
- package/scripts/test_cv_detector.py +214 -0
- package/scripts/test_integration.py +290 -0
- package/skills/consistency-polisher/SKILL.md +337 -0
- package/skills/float-optimizer/SKILL.md +284 -0
- package/skills/latex_fixers/__init__.py +82 -0
- package/skills/latex_fixers/float_fixers.py +392 -0
- package/skills/latex_fixers/fullwidth_fixers.py +375 -0
- package/skills/latex_fixers/overflow_fixers.py +250 -0
- package/skills/latex_fixers/semantic_micro_tuning.py +362 -0
- package/skills/latex_fixers/space_util_fixers.py +389 -0
- package/skills/latex_fixers/utils.py +55 -0
- package/skills/overflow-repair/SKILL.md +304 -0
- package/skills/space-util-fixer/SKILL.md +307 -0
- package/skills/taxonomy-vto/SKILL.md +486 -0
- package/skills/template-migrator/SKILL.md +251 -0
- package/skills/visual-inspector/SKILL.md +217 -0
- package/skills/writing-polish/SKILL.md +289 -0
|
@@ -0,0 +1,629 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
VTO Benchmark 评测运行器
|
|
4
|
+
|
|
5
|
+
批量评估 PaperFit 对不同缺陷类型的检测和修复能力,输出结构化评测报告。
|
|
6
|
+
|
|
7
|
+
功能:
|
|
8
|
+
1. 加载预定义的测试样本(包含已知缺陷)
|
|
9
|
+
2. 运行 PaperFix 修复流程
|
|
10
|
+
3. 收集修复前后的指标
|
|
11
|
+
4. 生成评测报告和准确率统计
|
|
12
|
+
|
|
13
|
+
用法:
|
|
14
|
+
python benchmark_runner.py [--samples-dir DIR] [--output-dir DIR] [--rounds N]
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import argparse
|
|
18
|
+
import json
|
|
19
|
+
import subprocess
|
|
20
|
+
import sys
|
|
21
|
+
import time
|
|
22
|
+
from dataclasses import dataclass, field
|
|
23
|
+
from datetime import datetime
|
|
24
|
+
from pathlib import Path
|
|
25
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# ============================================================
|
|
29
|
+
# 评测指标定义
|
|
30
|
+
# ============================================================
|
|
31
|
+
|
|
32
|
+
@dataclass
class DefectDetectionResult:
    """Detection outcome for a single defect (one confusion-matrix cell)."""
    defect_id: str
    expected: bool  # ground truth: this defect should be detected
    detected: bool  # observation: this defect actually was detected
    confidence: Optional[float] = None  # detector confidence, when available

    @property
    def is_true_positive(self) -> bool:
        """Expected and detected."""
        return self.detected and self.expected

    @property
    def is_false_positive(self) -> bool:
        """Detected although not expected."""
        return self.detected and not self.expected

    @property
    def is_false_negative(self) -> bool:
        """Expected but missed by the detector."""
        return not self.detected and self.expected

    @property
    def is_true_negative(self) -> bool:
        """Neither expected nor detected."""
        return not (self.expected or self.detected)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@dataclass
class DefectRepairResult:
    """Outcome of attempting to repair a single defect."""
    defect_id: str
    attempted: bool  # whether a repair was attempted for this defect
    successful: bool  # whether the repair succeeded
    method: str = ""  # name of the repair method used
    side_effects: List[str] = field(default_factory=list)  # observed side effects
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
@dataclass
class RoundMetrics:
    """Metrics collected for one evaluation round of one sample."""
    round_id: int
    sample_name: str
    initial_defects: List[Dict]  # ground-truth defect manifest for the sample
    detected_defects: List["DefectDetectionResult"]
    repair_results: List["DefectRepairResult"]
    compile_success: bool
    compile_time_sec: float
    total_time_sec: float
    page_count_before: int
    page_count_after: int

    @property
    def detection_precision(self) -> float:
        """Detection precision: TP / (TP + FP); 0.0 when undefined."""
        true_pos = sum(d.is_true_positive for d in self.detected_defects)
        false_pos = sum(d.is_false_positive for d in self.detected_defects)
        denom = true_pos + false_pos
        return true_pos / denom if denom else 0.0

    @property
    def detection_recall(self) -> float:
        """Detection recall: TP / (TP + FN); 0.0 when undefined."""
        true_pos = sum(d.is_true_positive for d in self.detected_defects)
        false_neg = sum(d.is_false_negative for d in self.detected_defects)
        denom = true_pos + false_neg
        return true_pos / denom if denom else 0.0

    @property
    def detection_f1(self) -> float:
        """Harmonic mean of precision and recall; 0.0 when both are zero."""
        precision = self.detection_precision
        recall = self.detection_recall
        if precision + recall == 0:
            return 0.0
        return 2 * precision * recall / (precision + recall)

    @property
    def repair_success_rate(self) -> float:
        """Share of attempted repairs that succeeded; 0.0 when none attempted."""
        attempted = sum(r.attempted for r in self.repair_results)
        succeeded = sum(r.successful for r in self.repair_results)
        return succeeded / attempted if attempted else 0.0
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
@dataclass
class BenchmarkSummary:
    """Aggregate results across all samples and rounds of a benchmark run."""
    benchmark_id: str  # run identifier (timestamp-based)
    timestamp: str  # ISO-format creation time
    total_samples: int
    total_rounds: int
    avg_detection_precision: float
    avg_detection_recall: float
    avg_detection_f1: float
    avg_repair_success_rate: float
    avg_compile_time_sec: float
    avg_total_time_sec: float
    # Per-category stats: category -> {"count", "repaired", "success_rate"}.
    category_breakdown: Dict[str, Dict[str, float]] = field(default_factory=dict)
    # Final-round result per sample (name, final F1, repair rate, round count).
    per_sample_results: List[Dict] = field(default_factory=list)
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
# ============================================================
|
|
136
|
+
# Benchmark 运行器
|
|
137
|
+
# ============================================================
|
|
138
|
+
|
|
139
|
+
class BenchmarkRunner:
    """Benchmark runner: evaluates samples and aggregates metrics."""

    def __init__(
        self,
        samples_dir: Path,
        output_dir: Path,
        paperfit_root: Optional[Path] = None,
    ):
        # Make sure the results directory exists before anything is written.
        output_dir.mkdir(parents=True, exist_ok=True)

        self.samples_dir = samples_dir
        self.output_dir = output_dir
        # Default to the repository root (this file lives in <root>/scripts/).
        self.paperfit_root = paperfit_root or Path(__file__).parent.parent

        # Per-round metrics accumulated across all evaluated samples.
        self.all_metrics: List["RoundMetrics"] = []
|
|
155
|
+
|
|
156
|
+
def run_benchmark(
|
|
157
|
+
self,
|
|
158
|
+
sample_names: Optional[List[str]] = None,
|
|
159
|
+
max_rounds_per_sample: int = 3,
|
|
160
|
+
) -> BenchmarkSummary:
|
|
161
|
+
"""
|
|
162
|
+
运行完整评测流程
|
|
163
|
+
|
|
164
|
+
Args:
|
|
165
|
+
sample_names: 要评测的样本名称列表,None 表示评测所有
|
|
166
|
+
max_rounds_per_sample: 每个样本的最大迭代轮数
|
|
167
|
+
"""
|
|
168
|
+
# 发现样本
|
|
169
|
+
if sample_names is None:
|
|
170
|
+
sample_names = self._discover_samples()
|
|
171
|
+
|
|
172
|
+
print(f"\n开始评测 {len(sample_names)} 个样本")
|
|
173
|
+
print("=" * 60)
|
|
174
|
+
|
|
175
|
+
for sample_name in sample_names:
|
|
176
|
+
self._run_sample(
|
|
177
|
+
sample_name=sample_name,
|
|
178
|
+
max_rounds=max_rounds_per_sample,
|
|
179
|
+
)
|
|
180
|
+
|
|
181
|
+
# 生成汇总报告
|
|
182
|
+
summary = self._generate_summary()
|
|
183
|
+
self._save_summary(summary)
|
|
184
|
+
|
|
185
|
+
return summary
|
|
186
|
+
|
|
187
|
+
def _discover_samples(self) -> List[str]:
|
|
188
|
+
"""发现所有可用的测试样本"""
|
|
189
|
+
sample_files = list(self.samples_dir.glob("*.tex"))
|
|
190
|
+
# 排除干净样本和基础样本
|
|
191
|
+
exclude_prefixes = ["clean", "_base"]
|
|
192
|
+
return [
|
|
193
|
+
f.stem for f in sample_files
|
|
194
|
+
if not any(f.stem.startswith(p) for p in exclude_prefixes)
|
|
195
|
+
]
|
|
196
|
+
|
|
197
|
+
    def _run_sample(
        self,
        sample_name: str,
        max_rounds: int,
    ) -> List[RoundMetrics]:
        """Evaluate one sample: run up to *max_rounds* fix iterations and
        record one RoundMetrics entry per round.

        Returns the per-round metrics for this sample (empty list when the
        sample file is missing).
        """
        print(f"\n[评测] {sample_name}")
        print("-" * 40)

        sample_path = self.samples_dir / f"{sample_name}.tex"
        defects_path = self.samples_dir / f"{sample_name}_defects.json"

        if not sample_path.exists():
            print(f" [跳过] 样本不存在:{sample_path}")
            return []

        # Load the ground-truth defect manifest, if one was generated.
        expected_defects = []
        if defects_path.exists():
            with open(defects_path, "r", encoding="utf-8") as f:
                expected_defects = json.load(f)
            print(f" [加载] 预期缺陷数:{len(expected_defects)}")
        else:
            print(f" [警告] 未找到缺陷清单:{defects_path}")

        # Stage the sample into a scratch work directory under output_dir.
        work_dir = self.output_dir / "work"
        work_dir.mkdir(parents=True, exist_ok=True)
        work_tex = work_dir / f"{sample_name}.tex"

        # Compile once to record the baseline page count.
        page_before = self._compile_and_count_pages(sample_path, work_tex)
        print(f" [编译] 初始页数:{page_before}")

        round_metrics_list: List[RoundMetrics] = []

        for round_id in range(1, max_rounds + 1):
            print(f"\n [轮次] {round_id}/{max_rounds}")

            start_time = time.time()

            # Run the PaperFit fix pipeline (fix-layout command).
            compile_success, compile_time = self._run_paperfit_fix(work_tex)

            # Page count after this round's fixes.
            page_after = self._count_pdf_pages(work_dir / f"{sample_name}.pdf")

            # Pull detection / repair outcomes from state.json and logs.
            detected_defects = self._collect_detection_results(work_dir)
            repair_results = self._collect_repair_results(work_dir, expected_defects)

            elapsed = time.time() - start_time

            metrics = RoundMetrics(
                round_id=round_id,
                sample_name=sample_name,
                initial_defects=expected_defects,
                detected_defects=detected_defects,
                repair_results=repair_results,
                compile_success=compile_success,
                compile_time_sec=compile_time,
                total_time_sec=elapsed,
                page_count_before=page_before,
                page_count_after=page_after,
            )

            self.all_metrics.append(metrics)
            round_metrics_list.append(metrics)

            # Print this round's headline numbers.
            print(f" 检测 F1: {metrics.detection_f1:.2%}")
            print(f" 修复成功率:{metrics.repair_success_rate:.2%}")
            print(f" 编译时间:{compile_time:.2f}s")

            # Early exit once DONE is reached or nothing repairable remains.
            # NOTE(review): repair_success_rate == 0.0 also fires when every
            # attempted repair *failed*, which is then reported as "optimal"
            # — confirm that is the intended behavior.
            if metrics.detection_f1 == 1.0 or metrics.repair_success_rate == 0.0:
                print(f" [完成] 样本 {sample_name} 已达到最优状态")
                break

        return round_metrics_list
|
|
277
|
+
|
|
278
|
+
def _compile_and_count_pages(
|
|
279
|
+
self,
|
|
280
|
+
source_tex: Path,
|
|
281
|
+
dest_tex: Path,
|
|
282
|
+
) -> int:
|
|
283
|
+
"""编译并返回页数"""
|
|
284
|
+
# 复制文件到目标位置
|
|
285
|
+
import shutil
|
|
286
|
+
shutil.copy(source_tex, dest_tex)
|
|
287
|
+
shutil.copy(source_tex.parent / "clean_sample.tex", dest_tex.parent, dirs_exist_ok=True)
|
|
288
|
+
|
|
289
|
+
pdf_path = dest_tex.with_suffix(".pdf")
|
|
290
|
+
return self._count_pdf_pages(pdf_path)
|
|
291
|
+
|
|
292
|
+
def _count_pdf_pages(self, pdf_path: Path) -> int:
|
|
293
|
+
"""统计 PDF 页数"""
|
|
294
|
+
if not pdf_path.exists():
|
|
295
|
+
return 0
|
|
296
|
+
try:
|
|
297
|
+
# 使用 pdfinfo 工具(poppler-utils)
|
|
298
|
+
result = subprocess.run(
|
|
299
|
+
["pdfinfo", str(pdf_path)],
|
|
300
|
+
capture_output=True,
|
|
301
|
+
text=True,
|
|
302
|
+
timeout=10,
|
|
303
|
+
)
|
|
304
|
+
for line in result.stdout.split("\n"):
|
|
305
|
+
if line.startswith("Pages:"):
|
|
306
|
+
return int(line.split(":")[1].strip())
|
|
307
|
+
except (subprocess.TimeoutExpired, FileNotFoundError, ValueError):
|
|
308
|
+
pass
|
|
309
|
+
return 0
|
|
310
|
+
|
|
311
|
+
    def _run_paperfit_fix(self, tex_path: Path) -> Tuple[bool, float]:
        """Run the PaperFit fix pipeline on *tex_path*.

        Returns:
            (success, elapsed_seconds) — success currently means the
            subprocess exited with return code 0.
        """
        start_time = time.time()

        try:
            # Invoke the fix-layout command (placeholder implementation:
            # it only queries state_manager.py --status). Adjust to the
            # real project layout before production use.
            result = subprocess.run(
                [
                    sys.executable,
                    str(self.paperfit_root / "scripts" / "state_manager.py"),
                    "--status",
                ],
                capture_output=True,
                text=True,
                timeout=300,  # 5-minute timeout
                cwd=str(tex_path.parent),
            )

            compile_success = result.returncode == 0
            elapsed = time.time() - start_time
            return compile_success, elapsed

        except subprocess.TimeoutExpired:
            print(f" [错误] 修复超时(>5 分钟)")
            return False, time.time() - start_time
        except Exception as e:
            # Broad catch keeps the benchmark loop alive when one sample's
            # fix invocation blows up; the failure is reported in metrics.
            print(f" [错误] 执行失败:{e}")
            return False, time.time() - start_time
|
|
340
|
+
|
|
341
|
+
def _collect_detection_results(
|
|
342
|
+
self,
|
|
343
|
+
work_dir: Path,
|
|
344
|
+
) -> List[DefectDetectionResult]:
|
|
345
|
+
"""收集缺陷检测结果"""
|
|
346
|
+
results: List[DefectDetectionResult] = []
|
|
347
|
+
|
|
348
|
+
# 尝试从 state.json 读取检测结果
|
|
349
|
+
state_path = self.paperfit_root / "data" / "state.json"
|
|
350
|
+
if state_path.exists():
|
|
351
|
+
with open(state_path, "r", encoding="utf-8") as f:
|
|
352
|
+
state = json.load(f)
|
|
353
|
+
visual_defects = state.get("visual_defects", [])
|
|
354
|
+
for defect in visual_defects:
|
|
355
|
+
results.append(
|
|
356
|
+
DefectDetectionResult(
|
|
357
|
+
defect_id=defect.get("category", "unknown"),
|
|
358
|
+
expected=True,
|
|
359
|
+
detected=True,
|
|
360
|
+
)
|
|
361
|
+
)
|
|
362
|
+
|
|
363
|
+
return results
|
|
364
|
+
|
|
365
|
+
def _collect_repair_results(
|
|
366
|
+
self,
|
|
367
|
+
work_dir: Path,
|
|
368
|
+
expected_defects: List[Dict],
|
|
369
|
+
) -> List[DefectRepairResult]:
|
|
370
|
+
"""收集缺陷修复结果"""
|
|
371
|
+
results: List[DefectRepairResult] = []
|
|
372
|
+
|
|
373
|
+
# 从状态文件或日志中解析修复结果
|
|
374
|
+
# 这里使用占位实现,实际需要根据项目结构调整
|
|
375
|
+
for defect in expected_defects:
|
|
376
|
+
results.append(
|
|
377
|
+
DefectRepairResult(
|
|
378
|
+
defect_id=defect.get("defect_id", "unknown"),
|
|
379
|
+
attempted=True,
|
|
380
|
+
successful=True, # 占位
|
|
381
|
+
method="auto",
|
|
382
|
+
)
|
|
383
|
+
)
|
|
384
|
+
|
|
385
|
+
return results
|
|
386
|
+
|
|
387
|
+
def _generate_summary(self) -> BenchmarkSummary:
|
|
388
|
+
"""生成评测汇总"""
|
|
389
|
+
if not self.all_metrics:
|
|
390
|
+
return BenchmarkSummary(
|
|
391
|
+
benchmark_id=datetime.now().strftime("%Y%m%d_%H%M%S"),
|
|
392
|
+
timestamp=datetime.now().isoformat(),
|
|
393
|
+
total_samples=0,
|
|
394
|
+
total_rounds=0,
|
|
395
|
+
avg_detection_precision=0.0,
|
|
396
|
+
avg_detection_recall=0.0,
|
|
397
|
+
avg_detection_f1=0.0,
|
|
398
|
+
avg_repair_success_rate=0.0,
|
|
399
|
+
avg_compile_time_sec=0.0,
|
|
400
|
+
avg_total_time_sec=0.0,
|
|
401
|
+
)
|
|
402
|
+
|
|
403
|
+
# 计算平均指标
|
|
404
|
+
avg_precision = sum(m.detection_precision for m in self.all_metrics) / len(self.all_metrics)
|
|
405
|
+
avg_recall = sum(m.detection_recall for m in self.all_metrics) / len(self.all_metrics)
|
|
406
|
+
avg_f1 = sum(m.detection_f1 for m in self.all_metrics) / len(self.all_metrics)
|
|
407
|
+
avg_repair = sum(m.repair_success_rate for m in self.all_metrics) / len(self.all_metrics)
|
|
408
|
+
avg_compile = sum(m.compile_time_sec for m in self.all_metrics) / len(self.all_metrics)
|
|
409
|
+
avg_total = sum(m.total_time_sec for m in self.all_metrics) / len(self.all_metrics)
|
|
410
|
+
|
|
411
|
+
# 按类别分解
|
|
412
|
+
category_stats: Dict[str, Dict[str, float]] = {}
|
|
413
|
+
for metrics in self.all_metrics:
|
|
414
|
+
for defect in metrics.initial_defects:
|
|
415
|
+
cat = defect.get("defect_id", "unknown")[0] # 取首字母作为类别
|
|
416
|
+
if cat not in category_stats:
|
|
417
|
+
category_stats[cat] = {"count": 0, "repaired": 0}
|
|
418
|
+
category_stats[cat]["count"] += 1
|
|
419
|
+
# 统计修复成功的数量
|
|
420
|
+
for repair in metrics.repair_results:
|
|
421
|
+
if repair.defect_id.startswith(cat) and repair.successful:
|
|
422
|
+
category_stats[cat]["repaired"] += 1
|
|
423
|
+
|
|
424
|
+
# 计算各类别成功率
|
|
425
|
+
for cat, stats in category_stats.items():
|
|
426
|
+
stats["success_rate"] = (
|
|
427
|
+
stats["repaired"] / stats["count"] if stats["count"] > 0 else 0.0
|
|
428
|
+
)
|
|
429
|
+
|
|
430
|
+
# 按样本分组
|
|
431
|
+
sample_names = set(m.sample_name for m in self.all_metrics)
|
|
432
|
+
per_sample = []
|
|
433
|
+
for name in sample_names:
|
|
434
|
+
sample_metrics = [m for m in self.all_metrics if m.sample_name == name]
|
|
435
|
+
if sample_metrics:
|
|
436
|
+
last = sample_metrics[-1]
|
|
437
|
+
per_sample.append({
|
|
438
|
+
"sample_name": name,
|
|
439
|
+
"final_f1": last.detection_f1,
|
|
440
|
+
"final_repair_rate": last.repair_success_rate,
|
|
441
|
+
"total_rounds": len(sample_metrics),
|
|
442
|
+
})
|
|
443
|
+
|
|
444
|
+
return BenchmarkSummary(
|
|
445
|
+
benchmark_id=datetime.now().strftime("%Y%m%d_%H%M%S"),
|
|
446
|
+
timestamp=datetime.now().isoformat(),
|
|
447
|
+
total_samples=len(sample_names),
|
|
448
|
+
total_rounds=len(self.all_metrics),
|
|
449
|
+
avg_detection_precision=avg_precision,
|
|
450
|
+
avg_detection_recall=avg_recall,
|
|
451
|
+
avg_detection_f1=avg_f1,
|
|
452
|
+
avg_repair_success_rate=avg_repair,
|
|
453
|
+
avg_compile_time_sec=avg_compile,
|
|
454
|
+
avg_total_time_sec=avg_total,
|
|
455
|
+
category_breakdown=category_stats,
|
|
456
|
+
per_sample_results=per_sample,
|
|
457
|
+
)
|
|
458
|
+
|
|
459
|
+
def _save_summary(self, summary: BenchmarkSummary) -> Path:
|
|
460
|
+
"""保存评测汇总"""
|
|
461
|
+
output_path = self.output_dir / f"benchmark_{summary.benchmark_id}.json"
|
|
462
|
+
|
|
463
|
+
summary_dict = {
|
|
464
|
+
"benchmark_id": summary.benchmark_id,
|
|
465
|
+
"timestamp": summary.timestamp,
|
|
466
|
+
"total_samples": summary.total_samples,
|
|
467
|
+
"total_rounds": summary.total_rounds,
|
|
468
|
+
"metrics": {
|
|
469
|
+
"avg_detection_precision": summary.avg_detection_precision,
|
|
470
|
+
"avg_detection_recall": summary.avg_detection_recall,
|
|
471
|
+
"avg_detection_f1": summary.avg_detection_f1,
|
|
472
|
+
"avg_repair_success_rate": summary.avg_repair_success_rate,
|
|
473
|
+
"avg_compile_time_sec": summary.avg_compile_time_sec,
|
|
474
|
+
"avg_total_time_sec": summary.avg_total_time_sec,
|
|
475
|
+
},
|
|
476
|
+
"category_breakdown": summary.category_breakdown,
|
|
477
|
+
"per_sample_results": summary.per_sample_results,
|
|
478
|
+
}
|
|
479
|
+
|
|
480
|
+
with open(output_path, "w", encoding="utf-8") as f:
|
|
481
|
+
json.dump(summary_dict, f, indent=2, ensure_ascii=False)
|
|
482
|
+
|
|
483
|
+
print(f"\n[保存] 评测报告:{output_path}")
|
|
484
|
+
return output_path
|
|
485
|
+
|
|
486
|
+
|
|
487
|
+
# ============================================================
|
|
488
|
+
# 报告生成器
|
|
489
|
+
# ============================================================
|
|
490
|
+
|
|
491
|
+
def generate_markdown_report(summary: "BenchmarkSummary", output_path: Path) -> None:
    """Render *summary* as a Markdown report and write it to *output_path*.

    The report contains an overall metrics table, a per-category breakdown,
    and a per-sample results table (samples sorted by name).
    """
    # Constant lines were needlessly written as f-strings (no placeholders);
    # only the lines that interpolate values keep the f prefix.
    report_lines = [
        "# VTO Benchmark 评测报告",
        "",
        f"**评测 ID**: {summary.benchmark_id}",
        f"**生成时间**: {summary.timestamp}",
        "",
        "## 总体指标",
        "",
        "| 指标 | 值 |",
        "|------|-----|",
        f"| 评测样本数 | {summary.total_samples} |",
        f"| 总轮数 | {summary.total_rounds} |",
        f"| 平均检测查准率 | {summary.avg_detection_precision:.2%} |",
        f"| 平均检测查全率 | {summary.avg_detection_recall:.2%} |",
        f"| 平均检测 F1 | {summary.avg_detection_f1:.2%} |",
        f"| 平均修复成功率 | {summary.avg_repair_success_rate:.2%} |",
        f"| 平均编译时间 | {summary.avg_compile_time_sec:.2f}s |",
        f"| 平均总耗时 | {summary.avg_total_time_sec:.2f}s |",
        "",
        "## 按类别分解",
        "",
        "| 类别 | 缺陷数 | 修复成功 | 成功率 |",
        "|------|--------|----------|--------|",
    ]

    for cat, stats in sorted(summary.category_breakdown.items()):
        report_lines.append(
            f"| Category {cat} | {stats['count']} | {stats['repaired']} | "
            f"{stats.get('success_rate', 0):.2%} |"
        )

    report_lines.extend([
        "",
        "## 各样本结果",
        "",
        "| 样本名称 | 最终 F1 | 修复成功率 | 轮数 |",
        "|----------|---------|------------|------|",
    ])

    for sample in sorted(summary.per_sample_results, key=lambda x: x["sample_name"]):
        report_lines.append(
            f"| {sample['sample_name']} | {sample['final_f1']:.2%} | "
            f"{sample['final_repair_rate']:.2%} | {sample['total_rounds']} |"
        )

    report_lines.extend([
        "",
        "---",
        "*报告由 benchmark_runner.py 自动生成*",
    ])

    output_path.write_text("\n".join(report_lines), encoding="utf-8")
    print(f"[保存] Markdown 报告:{output_path}")
|
|
546
|
+
|
|
547
|
+
|
|
548
|
+
# ============================================================
|
|
549
|
+
# 主函数
|
|
550
|
+
# ============================================================
|
|
551
|
+
|
|
552
|
+
def main():
    """CLI entry point: parse arguments, run the benchmark, emit reports."""
    parser = argparse.ArgumentParser(description="VTO Benchmark 评测运行器")
    parser.add_argument("--samples-dir", type=str,
                        default="data/benchmarks/samples", help="测试样本目录")
    parser.add_argument("--output-dir", type=str,
                        default="data/benchmarks/results", help="输出结果目录")
    parser.add_argument("--samples", nargs="+", default=None,
                        help="指定要评测的样本名称,默认评测所有")
    parser.add_argument("--rounds", type=int, default=3,
                        help="每个样本的最大迭代轮数")
    parser.add_argument("--paperfit-root", type=str, default=None,
                        help="PaperFit 项目根目录")
    args = parser.parse_args()

    samples_dir = Path(args.samples_dir)
    output_dir = Path(args.output_dir)
    paperfit_root = Path(args.paperfit_root) if args.paperfit_root else None

    if not samples_dir.exists():
        print(f"[错误] 样本目录不存在:{samples_dir}")
        print("请先运行 inject_defects.py 生成测试样本")
        sys.exit(1)

    # Build the runner and execute the requested benchmark.
    runner = BenchmarkRunner(
        samples_dir=samples_dir,
        output_dir=output_dir,
        paperfit_root=paperfit_root,
    )
    summary = runner.run_benchmark(
        sample_names=args.samples,
        max_rounds_per_sample=args.rounds,
    )

    # Companion Markdown report next to the JSON one.
    report_path = output_dir / f"benchmark_{summary.benchmark_id}.md"
    generate_markdown_report(summary, report_path)

    # Console digest.
    print("\n" + "=" * 60)
    print("评测摘要")
    print("=" * 60)
    print(f"样本数:{summary.total_samples}")
    print(f"总轮数:{summary.total_rounds}")
    print(f"平均检测 F1: {summary.avg_detection_f1:.2%}")
    print(f"平均修复成功率:{summary.avg_repair_success_rate:.2%}")
    print(f"平均耗时:{summary.avg_total_time_sec:.2f}s")
    print("=" * 60)
|
|
626
|
+
|
|
627
|
+
|
|
628
|
+
# Run the CLI only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|