paperfit-cli 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. package/.claude/commands/adjust-length.md +21 -0
  2. package/.claude/commands/check-visual.md +27 -0
  3. package/.claude/commands/fix-layout.md +31 -0
  4. package/.claude/commands/migrate-template.md +23 -0
  5. package/.claude/commands/repair-table.md +21 -0
  6. package/.claude/commands/show-status.md +32 -0
  7. package/.claude-plugin/README.md +77 -0
  8. package/.claude-plugin/marketplace.json +41 -0
  9. package/.claude-plugin/plugin.json +39 -0
  10. package/CLAUDE.md +266 -0
  11. package/CONTRIBUTING.md +131 -0
  12. package/LICENSE +21 -0
  13. package/README.md +164 -0
  14. package/agents/code-surgeon-agent.md +214 -0
  15. package/agents/layout-detective-agent.md +229 -0
  16. package/agents/orchestrator-agent.md +254 -0
  17. package/agents/quality-gatekeeper-agent.md +270 -0
  18. package/agents/rule-engine-agent.md +224 -0
  19. package/agents/semantic-polish-agent.md +250 -0
  20. package/bin/paperfit.js +176 -0
  21. package/config/agent_roles.yaml +56 -0
  22. package/config/layout_rules.yaml +54 -0
  23. package/config/templates.yaml +241 -0
  24. package/config/vto_taxonomy.yaml +489 -0
  25. package/config/writing_rules.yaml +64 -0
  26. package/install.sh +30 -0
  27. package/package.json +52 -0
  28. package/requirements.txt +5 -0
  29. package/scripts/benchmark_runner.py +629 -0
  30. package/scripts/compile.sh +244 -0
  31. package/scripts/config_validator.py +339 -0
  32. package/scripts/cv_detector.py +600 -0
  33. package/scripts/evidence_collector.py +167 -0
  34. package/scripts/float_fixers.py +861 -0
  35. package/scripts/inject_defects.py +549 -0
  36. package/scripts/install-claude-global.js +148 -0
  37. package/scripts/install.js +66 -0
  38. package/scripts/install.sh +106 -0
  39. package/scripts/overflow_fixers.py +656 -0
  40. package/scripts/package-for-opensource.sh +138 -0
  41. package/scripts/parse_log.py +260 -0
  42. package/scripts/postinstall.js +38 -0
  43. package/scripts/pre_tool_use.py +265 -0
  44. package/scripts/render_pages.py +244 -0
  45. package/scripts/session_logger.py +329 -0
  46. package/scripts/space_util_fixers.py +773 -0
  47. package/scripts/state_manager.py +352 -0
  48. package/scripts/test_commands.py +187 -0
  49. package/scripts/test_cv_detector.py +214 -0
  50. package/scripts/test_integration.py +290 -0
  51. package/skills/consistency-polisher/SKILL.md +337 -0
  52. package/skills/float-optimizer/SKILL.md +284 -0
  53. package/skills/latex_fixers/__init__.py +82 -0
  54. package/skills/latex_fixers/float_fixers.py +392 -0
  55. package/skills/latex_fixers/fullwidth_fixers.py +375 -0
  56. package/skills/latex_fixers/overflow_fixers.py +250 -0
  57. package/skills/latex_fixers/semantic_micro_tuning.py +362 -0
  58. package/skills/latex_fixers/space_util_fixers.py +389 -0
  59. package/skills/latex_fixers/utils.py +55 -0
  60. package/skills/overflow-repair/SKILL.md +304 -0
  61. package/skills/space-util-fixer/SKILL.md +307 -0
  62. package/skills/taxonomy-vto/SKILL.md +486 -0
  63. package/skills/template-migrator/SKILL.md +251 -0
  64. package/skills/visual-inspector/SKILL.md +217 -0
  65. package/skills/writing-polish/SKILL.md +289 -0
@@ -0,0 +1,861 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Float Fixers Module
4
+
5
+ 处理 Category B:浮动体缺陷
6
+ - B1: 浮动体远离首次引用
7
+ - B2: 浮动体大小不适配栏宽
8
+ - B3: 浮动体连续堆叠
9
+ - B4: 浮动体跨页分裂
10
+
11
+ 该模块被 code-surgeon-agent 调用,执行对 .tex 源码的精确修改。
12
+ 所有修复遵循最小修改原则,不改变学术内容。
13
+ """
14
+
15
+ import re
16
+ from pathlib import Path
17
+ from dataclasses import dataclass, field
18
+ from typing import List, Dict, Optional, Tuple, Any
19
+
20
+
21
+ # ============================================================
22
+ # 数据结构定义
23
+ # ============================================================
24
+
25
@dataclass
class FixResult:
    """Outcome of one repair applied to a single object in the source."""

    # Defect category code such as "B1".
    defect_id: str
    # Label (or descriptive name) of the object that was changed.
    object_name: str
    # Human-readable description of the modification.
    action: str
    # Source snippet before the modification.
    before: str
    # Source snippet after the modification.
    after: str
    # Page number of the defect; stays 0 until a caller fills it in.
    page: int = 0
    # Source line of the defect, when known.
    line_number: Optional[int] = None
    # True when the modification was actually applied.
    success: bool = False
36
+
37
+
38
@dataclass
class FloatFixReport:
    """Aggregated outcome of one float-repair run."""

    # One of "success", "partial" or "failed".
    status: str
    # Paths of the files that were rewritten.
    modified_files: List[str] = field(default_factory=list)
    # Every repair that was applied, in application order.
    changes: List[FixResult] = field(default_factory=list)
    # Messages describing defects that could not be repaired.
    unresolved: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return the report as a plain dict suitable for JSON serialization."""
        serialized_changes = []
        for change in self.changes:
            entry = {
                "defect_id": change.defect_id,
                "object": change.object_name,
                "action": change.action,
                "before": change.before,
                "after": change.after,
                "page": change.page,
                "line_number": change.line_number,
                "success": change.success,
            }
            serialized_changes.append(entry)

        report: Dict[str, Any] = {"skill": "float-optimizer"}
        report["status"] = self.status
        report["modified_files"] = self.modified_files
        report["changes"] = serialized_changes
        report["unresolved"] = self.unresolved
        return report
66
+
67
+
68
+ # ============================================================
69
+ # B1:浮动体远离首次引用
70
+ # ============================================================
71
+
72
def fix_float_reference_distance(
    tex_content: str,
    float_label: str,
    ref_page: int,
    float_page: int,
) -> Tuple[str, Optional[FixResult]]:
    r"""
    Repair a float that lands far from its first reference (defect B1).

    Strategies, tried in order:
    1. Relax a restrictive placement specifier to [htbp], or add [htbp]
       when the float has no specifier.
    2. Insert \FloatBarrier right after the first reference to the label
       (this command needs the placeins package in the preamble).

    Moving the float's source or splitting a large float is not attempted.

    Args:
        tex_content: Content of the .tex file.
        float_label: Label of the float (e.g. "fig:result" or "tab:results").
            A label containing "fig" is treated as a figure, anything else
            as a table.
        ref_page: Page of the first reference.
        float_page: Page on which the float was actually placed.

    Returns:
        (content, result): the possibly modified content and a FixResult,
        or the unchanged content and None when nothing was done.
    """
    # A float at most one page away from its reference is acceptable.
    if abs(float_page - ref_page) <= 1:
        return tex_content, None

    float_type = "figure" if "fig" in float_label.lower() else "table"
    label_re = re.compile(r'\\label\{' + re.escape(float_label) + r'\}')
    # Group 1 is the environment name (possibly starred), group 2 the
    # optional placement specifier.
    begin_re = re.compile(r'\\begin\{(' + float_type + r'\*?)\}(\[[^\]]*\])?')

    # Strategy 1: find the environment that really contains the label.  The
    # search is bounded by the environment's own \end so that a label of a
    # neighbouring float is never attributed to this one.
    target_match = None
    for match in begin_re.finditer(tex_content):
        env_end = tex_content.find("\\end{" + match.group(1) + "}", match.end())
        if env_end == -1:
            # Unterminated environment: its extent is unknown, skip it.
            continue
        if label_re.search(tex_content, match.end(), env_end):
            target_match = match
            break

    if target_match:
        env_name = target_match.group(1)
        pos_param = target_match.group(2) or ""
        new_param = "[htbp]"

        if pos_param in ('[t]', '[b]', '[h]', '[!t]', '[!b]', '[!h]'):
            # Single-position specifiers are the restrictive ones; widen them.
            modified_content = (
                tex_content[:target_match.start(2)]
                + new_param
                + tex_content[target_match.end(2):]
            )
            return modified_content, FixResult(
                defect_id="B1",
                object_name=float_label,
                action=f"将浮动体位置参数从 {pos_param} 改为 {new_param}",
                before=f"\\begin{{{env_name}}}{pos_param}",
                after=f"\\begin{{{env_name}}}{new_param}",
                success=True,
            )

        if not pos_param:
            # No specifier at all: append one right after \begin{...}.
            insert_pos = target_match.end()
            modified_content = (
                tex_content[:insert_pos] + new_param + tex_content[insert_pos:]
            )
            return modified_content, FixResult(
                defect_id="B1",
                object_name=float_label,
                action=f"添加浮动体位置参数 {new_param}",
                before=f"\\begin{{{env_name}}}",
                after=f"\\begin{{{env_name}}}{new_param}",
                success=True,
            )

    # Strategy 2: put a \FloatBarrier after the first reference to the label.
    ref_pattern = r'\\(ref|autoref|cref|Cref)\{' + re.escape(float_label) + r'\}'
    ref_match = re.search(ref_pattern, tex_content)

    if ref_match:
        insert_pos = ref_match.end()
        # Do not stack a second barrier if one already follows closely.
        if '\\FloatBarrier' not in tex_content[insert_pos:insert_pos + 100]:
            modified_content = (
                tex_content[:insert_pos] + "\n\\FloatBarrier" + tex_content[insert_pos:]
            )
            return modified_content, FixResult(
                defect_id="B1",
                object_name=float_label,
                action="在引用后添加 \\FloatBarrier 以阻止浮动体继续漂后",
                before=ref_match.group(0)[:30] + "...",
                after=ref_match.group(0) + "\n\\FloatBarrier",
                success=True,
            )

    return tex_content, None
172
+
173
+
174
def add_floatbarrier_to_preamble(
    tex_content: str,
) -> Tuple[str, Optional[FixResult]]:
    r"""
    Load the placeins package in the preamble so \FloatBarrier is available.

    The package counts as already loaded when an uncommented \usepackage or
    \RequirePackage line names it, with or without options and also inside
    a comma-separated package list.

    Args:
        tex_content: Content of the .tex file.

    Returns:
        (content, result): the content with the package line inserted just
        before \begin{document} plus a FixResult, or the unchanged content
        and None when the package is already loaded or the document has no
        \begin{document}.
    """
    # [^%\n]* keeps the match on the uncommented part of a line.
    loaded_re = (
        r'(?m)^[^%\n]*\\(?:usepackage|RequirePackage)'
        r'(?:\[[^\]]*\])?\{[^}]*\bplaceins\b[^}]*\}'
    )
    if re.search(loaded_re, tex_content):
        return tex_content, None

    match = re.search(r'\\begin\{document\}', tex_content)
    if not match:
        return tex_content, None

    insert_pos = match.start()
    modified_content = (
        tex_content[:insert_pos] + "\\usepackage{placeins}\n" + tex_content[insert_pos:]
    )
    return modified_content, FixResult(
        defect_id="B1",
        object_name="导言区",
        action="添加 placeins 宏包以支持 \\FloatBarrier",
        before="\\begin{document}",
        after="\\usepackage{placeins}\n\\begin{document}",
        success=True,
    )
198
+
199
+
200
+ # ============================================================
201
+ # B2:浮动体大小不适配栏宽
202
+ # ============================================================
203
+
204
def fix_figure_width_mismatch(
    tex_content: str,
    figure_label: str,
    template_type: str = "single_column",
) -> Tuple[str, Optional[FixResult]]:
    r"""
    Make the image of a figure fit the column width (defect B2).

    The first \includegraphics inside the figure (or figure*) environment
    that contains the label gets width=\linewidth: an existing width key is
    replaced, otherwise the key is appended to the option list.

    Args:
        tex_content: Content of the .tex file.
        figure_label: Label of the figure.
        template_type: "single_column" or "double_column".  Accepted for
            interface compatibility; the current strategy does not use it.

    Returns:
        (content, result): the possibly modified content and a FixResult,
        or the unchanged content and None when the figure was not found or
        its options already refer to \linewidth or \textwidth.
    """
    label_re = re.compile(r'\\label\{' + re.escape(figure_label) + r'\}')
    # Group 1: optional [options]; group 2: {file}.
    graphic_re = re.compile(r'\\includegraphics(\[[^\]]*\])?(\{[^}]+\})')

    # Look for the label anywhere between \begin and the matching \end; the
    # label normally follows \includegraphics and \caption.
    graphic = None
    for begin in re.finditer(r'\\begin\{(figure\*?)\}', tex_content):
        env_end = tex_content.find("\\end{" + begin.group(1) + "}", begin.end())
        if env_end == -1:
            continue
        if label_re.search(tex_content, begin.end(), env_end):
            graphic = graphic_re.search(tex_content, begin.end(), env_end)
            break

    if graphic is None:
        return tex_content, None

    include_graphic = graphic.group(0)
    options = graphic.group(1) or ""
    file_arg = graphic.group(2)

    # Width already expressed relative to the text block: leave it alone.
    if '\\linewidth' in options or '\\textwidth' in options:
        return tex_content, None

    # The look-behind keeps keys that merely end in "width" from matching.
    width_re = re.compile(r'(?<![A-Za-z])width\s*=\s*[^,\]]+')
    if width_re.search(options):
        # A callable replacement avoids backslash processing in the template.
        new_options = width_re.sub(lambda _m: 'width=\\linewidth', options)
    elif options[1:-1].strip():
        new_options = options[:-1] + ',width=\\linewidth]'
    else:
        # No option list, or an empty one.
        new_options = '[width=\\linewidth]'

    new_graphic = '\\includegraphics' + new_options + file_arg

    # Splice by position so an identical \includegraphics elsewhere in the
    # file is never touched.
    modified_content = (
        tex_content[:graphic.start()] + new_graphic + tex_content[graphic.end():]
    )

    return modified_content, FixResult(
        defect_id="B2",
        object_name=figure_label,
        action="将图片宽度设为 \\linewidth",
        before=include_graphic[:50] + "...",
        after=new_graphic[:50] + "...",
        success=True,
    )
282
+
283
+
284
def fix_table_width_mismatch(
    tex_content: str,
    table_label: str,
) -> Tuple[str, Optional[FixResult]]:
    r"""
    Make a table fit the column width by converting tabular to tabularx (B2).

    The first tabular inside the table (or table*) environment that contains
    the label is rewritten as tabularx with width \linewidth, and one column
    is turned into an X column by _convert_to_tabularx_columns.  If the
    document has a preamble that does not load tabularx, the package line is
    inserted before \begin{document}.

    Args:
        tex_content: Content of the .tex file.
        table_label: Label of the table.

    Returns:
        (content, result): the possibly modified content and a FixResult,
        or the unchanged content and None when the table or a convertible
        column could not be found.
    """
    label_re = re.compile(r'\\label\{' + re.escape(table_label) + r'\}')

    # Locate the table environment that really contains the label.
    bounds = None
    for begin in re.finditer(r'\\begin\{(table\*?)\}', tex_content):
        end_pos = tex_content.find("\\end{" + begin.group(1) + "}", begin.end())
        if end_pos == -1:
            continue
        if label_re.search(tex_content, begin.end(), end_pos):
            bounds = (begin.end(), end_pos)
            break

    if bounds is None:
        return tex_content, None
    env_start, env_end = bounds

    # Group 1 is the optional vertical-position argument of tabular.
    tab_begin = re.compile(r'\\begin\{tabular\}(\[[^\]]*\])?').search(
        tex_content, env_start, env_end
    )
    if tab_begin is None:
        return tex_content, None
    tab_close = tex_content.find("\\end{tabular}", tab_begin.end(), env_end)
    if tab_close == -1:
        return tex_content, None

    # Brace-match the column specification so nested groups such as @{} or
    # p{3cm} stay intact.
    spec_open = tab_begin.end()
    if tex_content[spec_open:spec_open + 1] != '{':
        return tex_content, None
    depth = 0
    spec_close = -1
    for idx in range(spec_open, tab_close):
        ch = tex_content[idx]
        if ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1
            if depth == 0:
                spec_close = idx
                break
    if spec_close == -1:
        return tex_content, None
    inner_spec = tex_content[spec_open + 1:spec_close]

    # Hide everything inside nested {...} / [...] groups behind a neutral
    # character so the helper only sees top-level column letters.  The
    # helper swaps exactly one character, so positions still line up.
    masked_chars = []
    depth = 0
    for ch in inner_spec:
        if ch in '{[':
            depth += 1
            masked_chars.append('#')
        elif ch in '}]':
            depth -= 1
            masked_chars.append('#')
        else:
            masked_chars.append(ch if depth == 0 else '#')
    masked = ''.join(masked_chars)

    converted = _convert_to_tabularx_columns(masked)
    if converted.startswith('{') and converted.endswith('}'):
        converted = converted[1:-1]
    if len(converted) != len(masked) or converted == masked:
        # No l/r/c column to turn into X; tabularx needs at least one.
        return tex_content, None

    new_inner = ''.join(
        new if new != old else real
        for old, new, real in zip(masked, converted, inner_spec)
    )

    pos_arg = tab_begin.group(1) or ""
    old_begin = tex_content[tab_begin.start():spec_close + 1]
    # tabularx takes the width first, then the optional position, then the spec.
    new_begin = "\\begin{tabularx}{\\linewidth}" + pos_arg + "{" + new_inner + "}"

    # Splice by position so only this table's tabular is rewritten.
    modified_content = (
        tex_content[:tab_begin.start()]
        + new_begin
        + tex_content[spec_close + 1:tab_close]
        + "\\end{tabularx}"
        + tex_content[tab_close + len("\\end{tabular}"):]
    )

    # The tabularx environment is undefined unless the package is loaded.
    loaded_re = (
        r'(?m)^[^%\n]*\\(?:usepackage|RequirePackage)'
        r'(?:\[[^\]]*\])?\{[^}]*\btabularx\b[^}]*\}'
    )
    if not re.search(loaded_re, modified_content):
        doc_begin = modified_content.find("\\begin{document}")
        if doc_begin != -1:
            modified_content = (
                modified_content[:doc_begin]
                + "\\usepackage{tabularx}\n"
                + modified_content[doc_begin:]
            )

    return modified_content, FixResult(
        defect_id="B2",
        object_name=table_label,
        action="将 tabular 改为 tabularx,宽度设为 \\linewidth",
        before=old_begin,
        after=new_begin,
        success=True,
    )
351
+
352
+
353
+ def _convert_to_tabularx_columns(column_spec: str) -> str:
354
+ """
355
+ 将 tabular 列规格转换为 tabularx 列规格
356
+ 策略:将最宽的文本列改为 X 列
357
+ """
358
+ spec = column_spec.strip('{}')
359
+
360
+ # 统计列类型
361
+ text_columns = []
362
+ for i, c in enumerate(spec):
363
+ if c in 'lrc':
364
+ text_columns.append((i, c))
365
+
366
+ if not text_columns:
367
+ return column_spec
368
+
369
+ # 将最后一个文本列改为 X 列
370
+ last_text_idx, _ = text_columns[-1]
371
+ new_spec = spec[:last_text_idx] + 'X' + spec[last_text_idx + 1:]
372
+
373
+ return '{' + new_spec + '}'
374
+
375
+
376
def fix_wide_float_in_double_column(
    tex_content: str,
    float_label: str,
) -> Tuple[str, Optional[FixResult]]:
    r"""
    Let a wide float span both columns of a two-column layout (defect B2).

    The figure/table environment that contains the label is turned into its
    starred variant (figure* / table*).  Widths inside the float are not
    changed by this function.

    Args:
        tex_content: Content of the .tex file.
        float_label: Label of the float.  A label containing "fig" is
            treated as a figure, anything else as a table.

    Returns:
        (content, result): the possibly modified content and a FixResult,
        or the unchanged content and None when no unstarred environment
        containing the label was found.
    """
    float_type = "figure" if "fig" in float_label.lower() else "table"
    label_re = re.compile(r'\\label\{' + re.escape(float_label) + r'\}')
    old_env = f"\\begin{{{float_type}}}"
    new_env = f"\\begin{{{float_type}*}}"
    old_end = f"\\end{{{float_type}}}"
    new_end = f"\\end{{{float_type}*}}"

    # Find the unstarred environment whose body contains the label.
    target = None
    for begin in re.finditer(r'\\begin\{' + float_type + r'\}', tex_content):
        end_pos = tex_content.find(old_end, begin.end())
        if end_pos == -1:
            continue
        if label_re.search(tex_content, begin.end(), end_pos):
            target = (begin.start(), begin.end(), end_pos)
            break

    if target is None:
        return tex_content, None
    begin_start, begin_end, end_pos = target

    # Rewrite exactly this environment's \begin and \end by position; a
    # plain str.replace would hit the first float of that type in the file.
    modified_content = (
        tex_content[:begin_start]
        + new_env
        + tex_content[begin_end:end_pos]
        + new_end
        + tex_content[end_pos + len(old_end):]
    )

    return modified_content, FixResult(
        defect_id="B2",
        object_name=float_label,
        action=f"将 {float_type} 改为 {float_type}* 以跨栏显示",
        before=old_env,
        after=new_env,
        success=True,
    )
434
+
435
+
436
+ # ============================================================
437
+ # B3:浮动体连续堆叠
438
+ # ============================================================
439
+
440
def fix_float_clustering(
    tex_content: str,
    float_labels: List[str],
) -> Tuple[str, Optional[FixResult]]:
    r"""
    Spread out floats that pile up on consecutive positions (defect B3).

    Each of the first four labelled floats receives a different placement
    specifier ([t], [b], [p], [htbp], in that order).  Labels beyond the
    fourth are left untouched, and only unstarred environments are edited.

    Args:
        tex_content: Content of the .tex file.
        float_labels: Labels of the clustered floats; fewer than two labels
            means there is nothing to spread.

    Returns:
        (content, result): the possibly modified content and one FixResult
        summarising all changed floats, or the unchanged content and None.
    """
    if len(float_labels) < 2:
        return tex_content, None

    position_prefs = ["[t]", "[b]", "[p]", "[htbp]"]
    changes_made = []

    # zip() stops after the shorter sequence, i.e. after four labels at most.
    for new_param, label in zip(position_prefs, float_labels):
        float_type = "figure" if "fig" in label.lower() else "table"
        label_re = re.compile(r'\\label\{' + re.escape(label) + r'\}')
        begin_re = re.compile(r'\\begin\{' + float_type + r'\}(\[[^\]]*\])?')
        end_token = "\\end{" + float_type + "}"

        # Pick the environment that contains this label, not simply the
        # first float of that type in the document.
        match = None
        for candidate in begin_re.finditer(tex_content):
            end_pos = tex_content.find(end_token, candidate.end())
            if end_pos == -1:
                continue
            if label_re.search(tex_content, candidate.end(), end_pos):
                match = candidate
                break
        if match is None:
            continue

        current_param = match.group(1) or ""
        if current_param == new_param:
            continue

        if current_param:
            tex_content = (
                tex_content[:match.start(1)] + new_param + tex_content[match.end(1):]
            )
        else:
            insert_pos = match.end()
            tex_content = tex_content[:insert_pos] + new_param + tex_content[insert_pos:]

        changes_made.append({
            "label": label,
            "before": f"\\begin{{{float_type}}}{current_param}",
            "after": f"\\begin{{{float_type}}}{new_param}",
        })

    if not changes_made:
        return tex_content, None

    return tex_content, FixResult(
        defect_id="B3",
        object_name=", ".join(c["label"] for c in changes_made),
        action="分散浮动体位置参数以避免堆叠",
        before="; ".join(c["before"] for c in changes_made),
        after="; ".join(c["after"] for c in changes_made),
        success=True,
    )
500
+
501
+
502
+ # ============================================================
503
+ # B4:浮动体跨页分裂
504
+ # ============================================================
505
+
506
def fix_split_table(
    tex_content: str,
    table_label: str,
) -> Tuple[str, Optional[FixResult]]:
    r"""
    Convert a long table + tabular into a longtable that may break (B4).

    The table environment containing the label is replaced by a longtable
    that reuses the original column specification, caption, label, header
    row and rules.  The first row is repeated as the running header.  If
    the document has a preamble that does not load longtable, the package
    line is inserted before \begin{document}.

    Nothing is changed (None is returned) when the table, its tabular, its
    caption or its first row cannot be identified; no placeholder caption
    is ever invented.

    NOTE(review): longtable is generally not usable in two-column mode;
    this function cannot tell which layout is in use - confirm in callers.

    Args:
        tex_content: Content of the .tex file.
        table_label: Label of the table.

    Returns:
        (content, result): the possibly modified content and a FixResult,
        or the unchanged content and None.
    """
    label_re = re.compile(r'\\label\{' + re.escape(table_label) + r'\}')
    end_table = "\\end{table}"

    def group_end(open_idx: int, limit: int) -> int:
        """Return the index just past the {...} group opening at open_idx, or -1."""
        if tex_content[open_idx:open_idx + 1] != '{':
            return -1
        depth = 0
        idx = open_idx
        while idx < limit:
            ch = tex_content[idx]
            if ch == '\\':
                idx += 2  # skip the escaped character (e.g. \{ or \})
                continue
            if ch == '{':
                depth += 1
            elif ch == '}':
                depth -= 1
                if depth == 0:
                    return idx + 1
            idx += 1
        return -1

    # Locate the table environment that really contains the label.
    env = None
    for begin in re.finditer(r'\\begin\{table\}(?:\[[^\]]*\])?', tex_content):
        close = tex_content.find(end_table, begin.end())
        if close == -1:
            continue
        if label_re.search(tex_content, begin.end(), close):
            env = (begin.start(), begin.end(), close)
            break
    if env is None:
        return tex_content, None
    env_start, body_start, env_close = env

    # The tabular and its brace-matched column specification.
    tab_begin = re.compile(r'\\begin\{tabular\}(?:\[[^\]]*\])?').search(
        tex_content, body_start, env_close
    )
    if tab_begin is None:
        return tex_content, None
    tab_close = tex_content.find("\\end{tabular}", tab_begin.end(), env_close)
    if tab_close == -1:
        return tex_content, None
    spec_end = group_end(tab_begin.end(), tab_close)
    if spec_end == -1:
        return tex_content, None
    column_spec = tex_content[tab_begin.end():spec_end]

    # The caption may sit before or after the tabular.  The look-ahead keeps
    # \captionsetup and similar commands from matching.
    caption = re.compile(r'\\caption(?![A-Za-z])\*?(?:\[[^\]]*\])?').search(
        tex_content, body_start, env_close
    )
    if caption is None:
        return tex_content, None
    caption_end = group_end(caption.end(), env_close)
    if caption_end == -1:
        return tex_content, None
    caption_cmd = tex_content[caption.start():caption_end]

    body = tex_content[spec_end:tab_close]

    # Rules in front of the header row.
    lead = re.match(r'\s*((?:\\(?:hline|toprule)\b\s*)*)', body)
    top_rule = lead.group(1).strip()
    header_start = lead.end()

    # The header row ends at the first \\ that is outside any brace group.
    row_end = -1
    depth = 0
    idx = header_start
    while idx < len(body) - 1:
        ch = body[idx]
        if ch == '\\':
            if body[idx + 1] == '\\' and depth == 0:
                row_end = idx
                break
            idx += 2
            continue
        if ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1
        idx += 1
    if row_end == -1:
        return tex_content, None

    header_row = body[header_start:row_end].strip()
    if not header_row:
        return tex_content, None

    # Skip the row terminator, including an optional * or [length].
    terminator = re.compile(r'\\\\\*?(?:\[[^\]]*\])?').match(body, row_end)
    mid = re.compile(r'\s*((?:\\(?:hline|midrule)\b\s*)*)').match(body, terminator.end())
    mid_rule = mid.group(1).strip()

    # Remaining rows keep their own \\ terminators verbatim.
    rest = body[mid.end():].strip()
    bottom_rule = ""
    trailing = re.search(r'(?:\\(?:hline|bottomrule)\b\s*)+\Z', rest)
    if trailing:
        # The closing rule moves into \endlastfoot to avoid a double rule.
        bottom_rule = trailing.group(0).strip()
        rest = rest[:trailing.start()].rstrip()

    # Counting unescaped & in the header can only under-count, which is
    # still valid for \multicolumn; over-counting would be an error.
    n_cols = len(re.findall(r'(?<!\\)&', header_row)) + 1

    head_block = "\n".join(
        part for part in (top_rule, header_row + " \\\\", mid_rule) if part
    )
    if label_re.search(caption_cmd):
        caption_line = caption_cmd  # the label already lives inside the caption
    else:
        caption_line = caption_cmd + " \\label{" + table_label + "}"
    foot_rule = mid_rule or "\\hline"

    parts = [
        "\\begin{longtable}" + column_spec,
        caption_line + " \\\\",
        head_block,
        "\\endfirsthead",
        head_block,
        "\\endhead",
        foot_rule + " \\multicolumn{" + str(n_cols) + "}{r}{Continued on next page} \\\\",
        "\\endfoot",
    ]
    if bottom_rule:
        parts.append(bottom_rule)
    parts.append("\\endlastfoot")
    if rest:
        parts.append(rest)
    parts.append("\\end{longtable}")
    longtable_content = "\n".join(parts)

    # Replace exactly the located environment.
    modified_content = (
        tex_content[:env_start]
        + longtable_content
        + tex_content[env_close + len(end_table):]
    )

    # The longtable environment is undefined unless the package is loaded.
    loaded_re = (
        r'(?m)^[^%\n]*\\(?:usepackage|RequirePackage)'
        r'(?:\[[^\]]*\])?\{[^}]*\blongtable\b[^}]*\}'
    )
    if not re.search(loaded_re, modified_content):
        doc_begin = modified_content.find("\\begin{document}")
        if doc_begin != -1:
            modified_content = (
                modified_content[:doc_begin]
                + "\\usepackage{longtable}\n"
                + modified_content[doc_begin:]
            )

    return modified_content, FixResult(
        defect_id="B4",
        object_name=table_label,
        action="将 table+tabular 改为 longtable 以支持跨页",
        before="\\begin{table}...\\end{tabular}...\\end{table}",
        after=f"\\begin{{longtable}}{column_spec}...\\end{{longtable}}",
        success=True,
    )
595
+
596
+
597
def fix_split_figure(
    tex_content: str,
    figure_label: str,
) -> Tuple[str, Optional[FixResult]]:
    r"""
    Pin a figure group in place with [!h] (defect B4).

    The placement specifier of the unstarred figure environment that
    contains the label is set to [!h].  Splitting an oversized figure group
    into several figures is not attempted here.

    Args:
        tex_content: Content of the .tex file.
        figure_label: Label of the figure.

    Returns:
        (content, result): the possibly modified content and a FixResult,
        or the unchanged content and None when the figure was not found or
        already uses [!h].
    """
    label_re = re.compile(r'\\label\{' + re.escape(figure_label) + r'\}')
    end_token = "\\end{figure}"

    # Bound the label search by the environment's own \end so a short,
    # unlabeled figure in front of the target is not picked by mistake.
    target_match = None
    for match in re.finditer(r'\\begin\{figure\}(\[[^\]]*\])?', tex_content):
        env_end = tex_content.find(end_token, match.end())
        if env_end == -1:
            continue
        if label_re.search(tex_content, match.end(), env_end):
            target_match = match
            break

    if not target_match:
        return tex_content, None

    current_param = target_match.group(1) or ""
    new_param = "[!h]"
    if current_param == new_param:
        return tex_content, None

    if current_param:
        modified_content = (
            tex_content[:target_match.start(1)]
            + new_param
            + tex_content[target_match.end(1):]
        )
    else:
        insert_pos = target_match.end()
        modified_content = tex_content[:insert_pos] + new_param + tex_content[insert_pos:]

    return modified_content, FixResult(
        defect_id="B4",
        object_name=figure_label,
        action="添加 [!h] 强制图片放置在此处以避免分裂",
        before=f"\\begin{{figure}}{current_param}",
        after=f"\\begin{{figure}}{new_param}",
        success=True,
    )
652
+
653
+
654
+ # ============================================================
655
+ # 主修复函数
656
+ # ============================================================
657
+
658
def fix_float_defects(
    tex_file_path: str,
    defects: List[Dict[str, Any]],
    template_type: str = "single_column",
) -> FloatFixReport:
    r"""
    Repair all Category B (float) defects of one .tex file.

    Defects are processed in order against the progressively modified
    content, and the file is written once at the end if anything changed.
    All B3 defects are handled together in a single fix_float_clustering
    call, because clustering needs at least two labels.  The placeins
    package is added only after a B1 repair actually inserted \FloatBarrier.

    Args:
        tex_file_path: Path to the .tex file.
        defects: List of defect dicts with the keys
            - defect_id: "B1", "B2", "B3" or "B4"
            - page: page number of the defect
            - object: label of the affected float (for B3 a comma-separated
              list of labels is also accepted)
            - ref_page: page of the first reference (used by B1)
            - line_number: optional, copied into the result
        template_type: "single_column" or "double_column".

    Returns:
        FloatFixReport with status "success" (nothing unresolved),
        "partial" (some changes, some unresolved) or "failed".
    """
    tex_path = Path(tex_file_path)
    if not tex_path.exists():
        return FloatFixReport(
            status="failed",
            unresolved=[f"文件不存在:{tex_file_path}"]
        )

    try:
        tex_content = tex_path.read_text(encoding='utf-8')
    except (OSError, UnicodeDecodeError) as e:
        return FloatFixReport(
            status="failed",
            unresolved=[f"无法读取文件 {tex_file_path}: {e}"]
        )

    modified_files = set()
    changes = []
    unresolved = []

    # Gather every B3 label up front, keeping order and dropping duplicates.
    cluster_labels: List[str] = []
    for defect in defects:
        if isinstance(defect, dict) and defect.get("defect_id") == "B3":
            for part in str(defect.get("object") or "").split(","):
                part = part.strip()
                if part and part not in cluster_labels:
                    cluster_labels.append(part)
    # None = grouped B3 repair not attempted yet, otherwise its outcome.
    cluster_outcome: Optional[bool] = None

    for defect in defects:
        if not isinstance(defect, dict):
            unresolved.append(f"({defect!r}): 无法自动修复,可能需要人工调整")
            continue

        defect_id = defect.get("defect_id", "")
        page = defect.get("page", 0)
        # "object" may be present but None; normalise to a string.
        object_name = str(defect.get("object") or "")
        ref_page = defect.get("ref_page", 0)

        new_content = tex_content
        fix_result = None

        if defect_id == "B1":
            # Float far from its first reference.
            new_content, fix_result = fix_float_reference_distance(
                tex_content,
                float_label=object_name,
                ref_page=ref_page,
                float_page=page,
            )

        elif defect_id == "B2":
            # Float does not fit the column width.
            if "fig" in object_name.lower():
                new_content, fix_result = fix_figure_width_mismatch(
                    tex_content,
                    figure_label=object_name,
                    template_type=template_type,
                )
            elif "tab" in object_name.lower():
                new_content, fix_result = fix_table_width_mismatch(
                    tex_content,
                    table_label=object_name,
                )

        elif defect_id == "B3":
            # Clustered floats: one grouped repair covers all B3 defects.
            if cluster_outcome is None:
                new_content, fix_result = fix_float_clustering(
                    tex_content,
                    float_labels=cluster_labels,
                )
                cluster_outcome = bool(fix_result and new_content != tex_content)
            elif cluster_outcome:
                # Already covered by the grouped repair recorded earlier.
                continue

        elif defect_id == "B4":
            # Float split across pages.
            if "fig" in object_name.lower():
                new_content, fix_result = fix_split_figure(
                    tex_content,
                    figure_label=object_name,
                )
            elif "tab" in object_name.lower():
                new_content, fix_result = fix_split_table(
                    tex_content,
                    table_label=object_name,
                )

        if fix_result and new_content != tex_content:
            tex_content = new_content
            fix_result.page = page
            fix_result.line_number = defect.get("line_number")
            changes.append(fix_result)
            modified_files.add(str(tex_path))

            # \FloatBarrier only works with placeins; load it on demand.
            if defect_id == "B1" and "\\FloatBarrier" in fix_result.after:
                with_pkg, pkg_result = add_floatbarrier_to_preamble(tex_content)
                if pkg_result and with_pkg != tex_content:
                    tex_content = with_pkg
                    changes.append(pkg_result)
        else:
            unresolved.append(
                f"{defect_id} ({object_name or '未知对象'}): 无法自动修复,可能需要人工调整"
            )

    # Write the file once, after all repairs.
    if modified_files:
        try:
            tex_path.write_text(tex_content, encoding='utf-8')
        except OSError as e:
            unresolved.append(f"无法写入文件 {tex_path}: {e}")
            return FloatFixReport(
                status="failed",
                modified_files=list(modified_files),
                changes=changes,
                unresolved=unresolved,
            )

    status = "success" if not unresolved else ("partial" if changes else "failed")

    return FloatFixReport(
        status=status,
        modified_files=list(modified_files),
        changes=changes,
        unresolved=unresolved,
    )
793
+
794
+
795
+ # ============================================================
796
+ # CLI 入口
797
+ # ============================================================
798
+
799
def main():
    """
    Command-line entry point.

    Parses the arguments, loads the defect list (from a file path or an
    inline JSON string), runs fix_float_defects and prints the report either
    as JSON or as a short text summary.  An unreadable or malformed defect
    list ends the program through argparse's error handling instead of a
    traceback.
    """
    import argparse
    import json

    parser = argparse.ArgumentParser(
        description="Fix Category B float defects in LaTeX documents"
    )
    parser.add_argument(
        "tex_file",
        help="Path to .tex file"
    )
    parser.add_argument(
        "--defects",
        type=str,
        help="JSON string or file path containing defect list"
    )
    parser.add_argument(
        "--template",
        type=str,
        default="single_column",
        choices=["single_column", "double_column"],
        help="Template type"
    )
    parser.add_argument(
        "--json",
        "-j",
        action="store_true",
        help="Output JSON report"
    )

    args = parser.parse_args()

    # Load the defect list.
    defects = []
    if args.defects:
        try:
            is_file = Path(args.defects).is_file()
        except OSError:
            # An inline JSON string can be too long to be probed as a path.
            is_file = False

        try:
            if is_file:
                with open(args.defects, 'r', encoding='utf-8') as f:
                    defects = json.load(f)
            else:
                defects = json.loads(args.defects)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            parser.error(f"cannot load --defects: {e}")

        if not isinstance(defects, list):
            parser.error("--defects must be a JSON array of defect objects")

    # Run the repairs.
    report = fix_float_defects(args.tex_file, defects, template_type=args.template)

    if args.json:
        print(json.dumps(report.to_dict(), indent=2, ensure_ascii=False))
    else:
        print("\nFloat Fix Report")
        print("=" * 50)
        print(f"Status: {report.status}")
        print(f"Modified files: {report.modified_files}")
        print(f"Changes: {len(report.changes)}")
        for change in report.changes:
            print(f"  - [{change.defect_id}] {change.object_name}: {change.action}")
        if report.unresolved:
            print(f"\nUnresolved: {len(report.unresolved)}")
            for u in report.unresolved:
                print(f"  - {u}")
858
+
859
+
860
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()