superlab 0.1.64 → 0.1.66
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/i18n.cjs +6 -0
- package/lib/lab_write_contract.json +4 -4
- package/package-assets/claude/commands/lab/write.md +1 -1
- package/package-assets/claude/commands/lab-write.md +1 -1
- package/package-assets/claude/commands/lab:write.md +1 -1
- package/package-assets/claude/commands/lab：write.md +1 -1
- package/package-assets/codex/prompts/lab/write.md +1 -1
- package/package-assets/codex/prompts/lab-write.md +1 -1
- package/package-assets/codex/prompts/lab:write.md +1 -1
- package/package-assets/codex/prompts/lab：write.md +1 -1
- package/package-assets/shared/lab/.managed/scripts/extract_reference_paper_structure.py +1200 -0
- package/package-assets/shared/lab/.managed/scripts/validate_manuscript_delivery.py +57 -0
- package/package-assets/shared/lab/.managed/scripts/validate_section_draft.py +141 -0
- package/package-assets/shared/lab/.managed/templates/reference-template-intake.md +50 -0
- package/package-assets/shared/lab/.managed/templates/write-iteration.md +27 -0
- package/package-assets/shared/skills/lab/stages/write.md +19 -0
- package/package.json +1 -1
|
@@ -0,0 +1,1200 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Extract reusable writing templates from reference papers.
|
|
3
|
+
|
|
4
|
+
This script is intentionally lightweight. It extracts structure, paragraph
|
|
5
|
+
roles, and visual/table roles for `/lab:write`; it does not summarize or copy
|
|
6
|
+
paper content.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import argparse
|
|
12
|
+
import hashlib
|
|
13
|
+
import html
|
|
14
|
+
import json
|
|
15
|
+
import os
|
|
16
|
+
import re
|
|
17
|
+
import shutil
|
|
18
|
+
import subprocess
|
|
19
|
+
import sys
|
|
20
|
+
import tempfile
|
|
21
|
+
import urllib.parse
|
|
22
|
+
import urllib.request
|
|
23
|
+
from dataclasses import asdict, dataclass
|
|
24
|
+
from pathlib import Path
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# Maps canonical section-family keys to heading aliases (English and Chinese).
# section_type_for_title() matches these by *substring* against a lowercased,
# number-stripped heading, so every alias must itself be lowercase.
SECTION_ALIASES = {
    "abstract": ("abstract", "摘要"),
    "introduction": ("introduction", "intro", "引言", "绪论"),
    "related-work": ("related work", "background", "literature", "相关工作", "文献综述"),
    "method": (
        "method",
        "methods",
        "methodology",
        "approach",
        "model",
        "algorithm",
        "方法",
        "模型",
        "算法",
    ),
    "experiments": (
        "experiment",
        "experiments",
        "evaluation",
        "empirical",
        "results",
        "main results",
        "ablation",
        "sensitivity",
        "coverage",
        "实验",
        "评估",
        "结果",
    ),
    "discussion": ("discussion", "analysis", "讨论", "分析"),
    "conclusion": ("conclusion", "conclusions", "future work", "结论", "总结"),
    "references": ("references", "bibliography", "参考文献"),
    "appendix": ("appendix", "supplement", "附录"),
}

# Exact lowercased titles that is_heading_line() accepts as a heading even when
# the line carries no numbering prefix or markdown marker.
CANONICAL_HEADING_TITLES = {
    "abstract",
    "introduction",
    "related work",
    "background",
    "method",
    "methods",
    "methodology",
    "approach",
    "model",
    "algorithm",
    "experiments",
    "experiment",
    "experimental setup",
    "experimental setups",
    "evaluation",
    "results",
    "main results",
    "ablation study",
    "sensitivity analysis",
    "discussion",
    "conclusion",
    "conclusions",
    "references",
    "bibliography",
    "appendix",
}

# Per-slot reader questions and placement advice for the experiment-protocol
# template. Keys mirror the role names produced by paragraph_role().
EXPERIMENT_PROTOCOL_SLOT_GUIDANCE = {
    "dataset_description": {
        "reader_question": "Which datasets define the evaluation scope, and why are they relevant?",
        "placement_guidance": "place before baselines, metrics, and main results",
    },
    "dataset_statistics": {
        "reader_question": "What dataset scale, feature, treatment, or split facts constrain interpretation?",
        "placement_guidance": "place near dataset descriptions or move detailed statistics to appendix with a main-text pointer",
    },
    "split_protocol": {
        "reader_question": "How are train, validation, test, seed, or sampling decisions made?",
        "placement_guidance": "place before metrics and results so comparisons have a fixed protocol",
    },
    "baseline_setup": {
        "reader_question": "Which comparator families are included, and what role does each comparator play?",
        "placement_guidance": "place after datasets and before the main comparison table",
    },
    "metric_definition": {
        "reader_question": "Which metrics decide ranking, what do they measure, and which direction is better?",
        "placement_guidance": "place before the first result table and repeat local definitions in table notes when needed",
    },
    "implementation_details": {
        "reader_question": "Which tuning, validation, training, or hardware details are needed for reproducibility?",
        "placement_guidance": "place after metrics or in appendix when details are long",
    },
    "main_results": {
        "reader_question": "What is the primary comparison result under the declared protocol?",
        "placement_guidance": "place after setup, baselines, and metrics",
    },
    "ablation": {
        "reader_question": "Which component or design choice accounts for the claimed effect?",
        "placement_guidance": "place after the main results",
    },
    "sensitivity": {
        "reader_question": "How stable is the result under relevant protocol or hyperparameter changes?",
        "placement_guidance": "place after ablations or in an analysis subsection",
    },
    "appendix_dataset_statistics": {
        "reader_question": "Which detailed dataset facts support the compact dataset setup in the main experiments?",
        "placement_guidance": "link from the main experimental setup and keep detailed tables in appendix",
    },
    "appendix_baseline_metric_details": {
        "reader_question": "Which baseline, metric, or implementation details are too long for the main setup?",
        "placement_guidance": "link from the main setup and keep long comparator or metric definitions in appendix",
    },
}
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
@dataclass
class SectionRecord:
    """One heading-delimited section of the source paper."""

    title: str  # normalized heading text (markdown hashes stripped)
    section_type: str  # canonical family key from SECTION_ALIASES, or "other"
    level: int  # inferred heading depth, 1 (top level) through 4
    start_line: int  # 1-based line of the heading in the extracted text
    end_line: int  # 1-based last line of the section, inclusive
    start_page: int | None  # page of the heading when [Page N] markers exist
    end_page: int | None  # page of the section's final line, when known
    content: str  # section body with [Page N] marker lines removed
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
@dataclass
class AssetRecord:
    """A table or figure caption found in a section, with reuse guidance."""

    asset_type: str  # "table" or "figure"
    asset_id: str  # identifier parsed from the caption line (e.g. "2", "A.1")
    caption: str  # caption text, truncated to at most ~180 characters
    appears_in_section: str  # section family the caption was found in
    appears_in_title: str  # exact title of the containing section
    evidence_role: str  # classification from classify_asset_role()
    local_role: str  # human-readable statement of the asset's function
    placement_logic: str  # where this asset should sit within the section
    text_bridge_before: str  # what the prose immediately before should do
    text_bridge_after: str  # what the prose immediately after should do
    reuse_guidance: str  # reminder to reuse structure only, not wording
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def parse_args() -> argparse.Namespace:
    """Parse CLI arguments: one or more paper sources plus a required --output-dir."""
    arg_parser = argparse.ArgumentParser(
        description="Extract reference-paper structure templates for /lab:write."
    )
    arg_parser.add_argument(
        "sources", nargs="+", help="Local paper paths or http(s) PDF/HTML URLs"
    )
    arg_parser.add_argument(
        "--output-dir",
        required=True,
        help="Directory to write reference-pattern artifacts",
    )
    return arg_parser.parse_args()
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def slugify(value: str) -> str:
    """Turn a filename into a lowercase dash-separated slug of at most 80 chars.

    A trailing extension is dropped first; an empty result falls back to
    "reference-paper".
    """
    stem = re.sub(r"\.[A-Za-z0-9]+$", "", value)
    dashed = re.sub(r"[^A-Za-z0-9]+", "-", stem)
    slug = dashed.strip("-").lower()[:80]
    return slug if slug else "reference-paper"
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def short_excerpt(text: str, limit: int = 240) -> str:
    """Collapse all whitespace runs and truncate to ``limit`` chars with a "..." tail."""
    collapsed = " ".join(text.split())
    if len(collapsed) <= limit:
        return collapsed
    # Reserve three characters for the ellipsis and trim any dangling space.
    return collapsed[: limit - 3].rstrip() + "..."
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def materialize_source(source: str, cache_dir: Path) -> tuple[Path, str]:
    """Resolve a source argument to a local file path.

    http(s) URLs are downloaded into ``cache_dir`` under a slug-plus-hash
    name and tagged "url"; anything else is treated as a local path and
    tagged "local".
    """
    url_parts = urllib.parse.urlparse(source)
    if url_parts.scheme not in {"http", "https"}:
        return Path(source).expanduser().resolve(), "local"
    cache_dir.mkdir(parents=True, exist_ok=True)
    remote_name = Path(url_parts.path)
    extension = remote_name.suffix or ".pdf"  # most references are PDFs
    base = slugify(remote_name.name or url_parts.netloc)
    # Hash the full URL so distinct URLs with the same basename cannot collide.
    fingerprint = hashlib.sha256(source.encode("utf-8")).hexdigest()[:10]
    destination = cache_dir / f"{base}-{fingerprint}{extension}"
    urllib.request.urlretrieve(source, destination)
    return destination, "url"
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def extract_pdf_text(path: Path) -> tuple[str, str, int | None]:
    """Extract text from a PDF, trying several backends in preference order.

    Backends tried: pypdf, pdfplumber, PyMuPDF (fitz), then the ``pdftotext``
    CLI. Returns ``(text, extractor_name, page_count)``. The library backends
    inject "[Page N]" markers between pages; pdftotext keeps its own layout
    and reports no page count. Raises RuntimeError carrying every backend's
    error message when all extractors fail.
    """
    errors: list[str] = []

    try:
        from pypdf import PdfReader  # type: ignore

        reader = PdfReader(str(path))
        page_texts = []
        for index, page in enumerate(reader.pages, start=1):
            # extract_text() may return None for image-only pages.
            page_texts.append(f"[Page {index}]\n{page.extract_text() or ''}")
        return "\n\n".join(page_texts), "pypdf", len(reader.pages)
    except Exception as exc:
        errors.append(f"pypdf: {exc}")

    try:
        import pdfplumber  # type: ignore

        page_texts = []
        with pdfplumber.open(str(path)) as pdf:
            for index, page in enumerate(pdf.pages, start=1):
                page_texts.append(f"[Page {index}]\n{page.extract_text() or ''}")
            return "\n\n".join(page_texts), "pdfplumber", len(pdf.pages)
    except Exception as exc:
        errors.append(f"pdfplumber: {exc}")

    try:
        import fitz  # type: ignore  # PyMuPDF

        doc = fitz.open(str(path))
        page_texts = [f"[Page {i + 1}]\n{page.get_text()}" for i, page in enumerate(doc)]
        # Read the page count before close(): fitz invalidates the document.
        page_count = len(doc)
        doc.close()
        return "\n\n".join(page_texts), "pymupdf", page_count
    except Exception as exc:
        errors.append(f"pymupdf: {exc}")

    if shutil.which("pdftotext"):
        with tempfile.TemporaryDirectory() as tmp_dir:
            out_path = Path(tmp_dir) / "paper.txt"
            command = ["pdftotext", "-layout", str(path), str(out_path)]
            completed = subprocess.run(command, capture_output=True, text=True, check=False)
            if completed.returncode == 0:
                return out_path.read_text(encoding="utf-8", errors="ignore"), "pdftotext", None
            errors.append(f"pdftotext: {completed.stderr.strip()}")

    raise RuntimeError(
        "No PDF text extractor succeeded. Install pypdf, pdfplumber, PyMuPDF, or pdftotext. "
        + " | ".join(errors)
    )
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def find_uv() -> str | None:
    """Locate the ``uv`` binary: PATH first, then ~/.local/bin; None if absent."""
    on_path = shutil.which("uv")
    if on_path:
        return on_path
    fallback = Path.home() / ".local" / "bin" / "uv"
    return str(fallback) if fallback.exists() else None
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
def rerun_with_uv_pypdf() -> int | None:
    """Re-execute this script under ``uv run --with pypdf`` for a PDF backend.

    Returns the child process's exit code, or None when re-running is not
    possible (no ``uv`` binary found) or not needed (the guard environment
    variable shows we already are the bootstrapped child).
    """
    # Guard against infinite recursion: the child process sets this variable.
    if os.environ.get("LAB_REFERENCE_PDF_BOOTSTRAPPED"):
        return None
    uv_path = find_uv()
    if not uv_path:
        return None
    env = os.environ.copy()
    env["LAB_REFERENCE_PDF_BOOTSTRAPPED"] = "1"
    # Re-run this very script with identical arguments, but inside a uv
    # environment that has pypdf installed.
    command = [
        uv_path,
        "run",
        "--with",
        "pypdf",
        "python3",
        str(Path(__file__).resolve()),
        *sys.argv[1:],
    ]
    return subprocess.run(command, env=env, check=False).returncode
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
def strip_html(text: str) -> str:
    """Reduce an HTML document to plain text.

    Script/style blocks are dropped, <br> and block-element closing tags
    become newlines, all remaining tags become spaces, runs of 3+ newlines
    collapse to two, and HTML entities are decoded last.
    """
    without_scripts = re.sub(r"(?is)<(script|style).*?</\1>", " ", text)
    with_breaks = re.sub(r"(?i)<br\s*/?>", "\n", without_scripts)
    with_breaks = re.sub(
        r"(?i)</(p|div|section|article|h[1-6]|li|tr)>", "\n", with_breaks
    )
    tagless = re.sub(r"<[^>]+>", " ", with_breaks)
    squeezed = re.sub(r"\n{3,}", "\n\n", tagless)
    return html.unescape(squeezed)
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
def read_source_text(path: Path) -> tuple[str, str, int | None]:
    """Load ``path`` as text, returning (text, extraction_method, page_count).

    PDFs go through the extractor chain; HTML is stripped to plain text;
    everything else is read verbatim (utf-8, errors ignored).
    """
    extension = path.suffix.lower()
    if extension == ".pdf":
        return extract_pdf_text(path)
    raw = path.read_text(encoding="utf-8", errors="ignore")
    if extension in {".html", ".htm"}:
        return strip_html(raw), "html-text", None
    return raw, "plain-text", None
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
def detect_title(text: str, fallback: str) -> str:
    """Return the first plausible title line of ``text``.

    Skips blank lines, "[Page N]" markers, and table/figure captions; the
    first surviving line shorter than 180 characters wins. Returns
    ``fallback`` when nothing qualifies.
    """
    for raw_line in text.splitlines():
        candidate = re.sub(r"^#+\s*", "", raw_line).strip()
        if not candidate or candidate.startswith("[Page ") or is_caption_line(candidate):
            continue
        if len(candidate) < 180:
            return candidate
    return fallback
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
def normalize_heading(line: str) -> str:
    """Strip leading markdown hashes and surrounding whitespace from a heading."""
    return re.sub(r"^#+\s*", "", line.strip()).strip()
|
|
318
|
+
|
|
319
|
+
|
|
320
|
+
def section_type_for_title(title: str) -> str:
    """Map a heading string to a canonical section family.

    Appendix-style numbering (e.g. "Appendix B" or "A.1 ...") wins outright;
    otherwise the leading section number is removed and SECTION_ALIASES is
    scanned for the first family with a matching substring alias.
    """
    normalized = title.lower()
    if re.match(r"^(appendix\b|[a-z]\.\d+(?:\.\d+)*\.?\s+)", normalized.strip()):
        return "appendix"
    normalized = re.sub(r"^\d+(?:\.\d+)*\.?\s+", "", normalized).strip()
    return next(
        (
            family
            for family, aliases in SECTION_ALIASES.items()
            if any(alias in normalized for alias in aliases)
        ),
        "other",
    )
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
def heading_level(title: str) -> int:
    """Infer heading depth: markdown hash count, else dotted-number depth capped at 4, else 1."""
    if hash_match := re.match(r"^(#{1,6})\s+", title):
        return len(hash_match.group(1))
    if number_match := re.match(r"^(\d+(?:\.\d+)*)\.?\s+", title.strip()):
        # "2.1.3" has two dots, so its depth is three; never report deeper than 4.
        depth = number_match.group(1).count(".") + 1
        return min(depth, 4)
    return 1
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
def is_caption_line(line: str) -> bool:
    """True when the line starts like a table or figure caption (e.g. "Table 2:")."""
    caption_start = re.match(
        r"^(?:Table|TABLE|Figure|FIGURE|Fig\.?)\s+[A-Za-z0-9.]+", line.strip()
    )
    return caption_start is not None
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
def is_heading_line(line: str) -> bool:
    """Heuristically decide whether an extracted-text line is a section heading.

    Accepts markdown headings, canonical unnumbered titles (via
    CANONICAL_HEADING_TITLES), and numbered / appendix / roman-numeral
    headings. Rejects blanks, page markers, captions, long lines, and lines
    ending in sentence punctuation (unless they carry numeric section
    numbering). Check order matters: cheap rejections run first.
    """
    stripped = line.strip()
    # Cheap rejections: empty lines, page markers, figure/table captions.
    if not stripped or stripped.startswith("[Page ") or is_caption_line(stripped):
        return False
    # Headings are short; anything longer is body text.
    if len(stripped) > 140:
        return False
    # Trailing punctuation signals a sentence — except for numbered headings,
    # where a trailing dot after the number is expected.
    if stripped.endswith((".", ",", ";", ":")) and not re.match(r"^\d+(?:\.\d+)*\.?\s+", stripped):
        return False
    # Markdown-style heading (#, ##, ###, ####).
    if re.match(r"^#{1,4}\s+\S", stripped):
        return True
    # Strip numeric, roman-numeral, and appendix prefixes, then compare the
    # remainder against the canonical unnumbered titles.
    lowered = re.sub(r"^\d+(?:\.\d+)*\.?\s+", "", stripped.lower()).strip()
    lowered = re.sub(r"^[ivx]+\.\s+", "", lowered).strip()
    lowered = re.sub(r"^appendix\s+[a-z](?:\.\d+)*\.?\s*", "", lowered).strip()
    if lowered in CANONICAL_HEADING_TITLES:
        return True
    # "2.1 Title Case" numbered headings.
    if re.match(r"^\d+(?:\.\d+)*\.?\s+[A-Z][A-Za-z0-9 ,:;()/-]{2,}$", stripped):
        return True
    # "A.1 Title" appendix-numbered headings.
    if re.match(r"^[A-Z]\.\d+(?:\.\d+)*\.?\s+[A-Z][A-Za-z0-9 ,:;()/-]{2,}$", stripped):
        return True
    # "Appendix B: Title" headings.
    if re.match(r"^Appendix\s+[A-Z](?:\.\d+)*\.?\s*[:\-]?\s*[A-Z][A-Za-z0-9 ,:;()/-]{2,}$", stripped):
        return True
    # "IV. Title" roman-numeral headings.
    if re.match(r"^[IVX]+\.\s+[A-Z][A-Za-z0-9 ,:;()/-]{2,}$", stripped):
        return True
    return False
|
|
369
|
+
|
|
370
|
+
|
|
371
|
+
def split_sections(text: str) -> list[SectionRecord]:
    """Split extracted paper text into SectionRecords at detected headings.

    "[Page N]" marker lines drive page attribution and are removed from
    section content. When no heading is detected, the whole document becomes
    a single "Full Text" record. Section families are refined afterwards by
    infer_section_types().
    """
    lines = text.splitlines()
    headings: list[tuple[int, str, int | None]] = []  # (line index, title, page)
    current_page: int | None = None
    line_pages: dict[int, int | None] = {}  # page in effect at each line index

    for index, line in enumerate(lines):
        page_match = re.match(r"^\[Page\s+(\d+)\]$", line.strip())
        if page_match:
            current_page = int(page_match.group(1))
        line_pages[index] = current_page
        if is_heading_line(line):
            headings.append((index, normalize_heading(line), current_page))

    # No headings found: degrade gracefully to one whole-document section.
    if not headings:
        content = "\n".join(line for line in lines if not line.startswith("[Page "))
        return [
            SectionRecord(
                title="Full Text",
                section_type="other",
                level=1,
                start_line=1,
                end_line=len(lines),
                start_page=None,
                end_page=None,
                content=content,
            )
        ]

    sections: list[SectionRecord] = []
    for pos, (start_index, title, start_page) in enumerate(headings):
        # Each section runs until the line before the next heading, or EOF.
        end_index = headings[pos + 1][0] - 1 if pos + 1 < len(headings) else len(lines) - 1
        content_lines = [
            line for line in lines[start_index + 1 : end_index + 1] if not line.startswith("[Page ")
        ]
        end_page = line_pages.get(end_index)
        sections.append(
            SectionRecord(
                title=title,
                section_type=section_type_for_title(title),
                level=heading_level(title),
                start_line=start_index + 1,  # convert to 1-based
                end_line=end_index + 1,
                start_page=start_page,
                end_page=end_page,
                content="\n".join(content_lines).strip(),
            )
        )
    infer_section_types(sections)
    return sections
|
|
421
|
+
|
|
422
|
+
|
|
423
|
+
def infer_section_types(sections: list[SectionRecord]) -> None:
    """Recover section families when PDF text only preserves numbered headings.

    Many academic PDF extractors keep subsection titles such as "4.2 Feature
    Selection" but lose the visual hierarchy that tells us they belong to the
    Method section. For template extraction, section family is more useful than
    the literal heading string, so unknown subsections inherit from their nearest
    known parent. Unknown top-level technical sections before Experiments are
    treated as method-style sections, which matches common ML paper structure.
    """

    first_experiment_index = next(
        (index for index, section in enumerate(sections) if section.section_type == "experiments"),
        None,
    )
    first_intro_index = next(
        (index for index, section in enumerate(sections) if section.section_type == "introduction"),
        None,
    )
    # Most recently seen section family at each heading level.
    parent_by_level: dict[int, str] = {}
    inheritable = {"related-work", "method", "experiments", "discussion", "conclusion", "appendix"}

    for index, section in enumerate(sections):
        # A heading at level N closes every open parent at level >= N.
        for level in list(parent_by_level):
            if level >= section.level:
                parent_by_level.pop(level, None)

        if section.section_type == "other":
            # Inherit from the nearest enclosing known section, if any.
            inherited = None
            for level in range(section.level - 1, 0, -1):
                parent_type = parent_by_level.get(level)
                if parent_type in inheritable:
                    inherited = parent_type
                    break
            if inherited:
                section.section_type = inherited
            elif (
                section.level == 1
                and first_experiment_index is not None
                and index < first_experiment_index
                and (first_intro_index is None or index > first_intro_index)
            ):
                # Unknown top-level section between Introduction and
                # Experiments: assume it is a method-style technical section.
                section.section_type = "method"

        parent_by_level[section.level] = section.section_type
|
|
468
|
+
|
|
469
|
+
|
|
470
|
+
def paragraph_role(section_type: str, paragraph: str) -> str:
    """Classify one paragraph into a functional role via ordered keyword rules.

    Checks run from most specific (captions, dataset statistics) to most
    generic (section-family fallbacks), so their order is significant. All
    matching is case-insensitive substring search over the paragraph.
    """
    lowered = paragraph.lower()
    # Captions anchor tables/figures; they are not prose paragraphs.
    if is_caption_line(paragraph):
        return "visual_or_table_anchor"
    if section_type == "abstract":
        return "abstract_summary"
    if any(token in lowered for token in ("dataset statistics", "statistics of the dataset", "sample count", "feature dimension")):
        return "dataset_statistics"
    if re.search(r"\bdatasets?\s*[:.]", lowered) or any(
        token in lowered for token in ("benchmark dataset", "public dataset", "semi-synthetic dataset")
    ):
        return "dataset_description"
    if re.search(r"\bbaselines?\s*[:.]", lowered) or any(
        token in lowered for token in ("baseline family", "comparator", "compare with")
    ):
        return "baseline_setup"
    if re.search(r"\bmetrics?\s*[:.]", lowered) or any(
        token in lowered
        for token in (
            "metric definition",
            "we report",
            "primary metric",
            "secondary metric",
            "higher is better",
            "lower is better",
            "auuc",
            "qini",
        )
    ):
        return "metric_definition"
    if any(
        token in lowered
        for token in (
            "train/test",
            "train, validation",
            "training split",
            "validation split",
            "test split",
            "random split",
            "repeated split",
            "sampling",
            "seed",
            "protocol",
        )
    ):
        return "split_protocol"
    if any(
        token in lowered
        for token in (
            "implementation detail",
            "hyperparameter",
            "tuning",
            "learning rate",
            "epoch",
            "batch size",
            "hardware",
            "gpu",
        )
    ):
        return "implementation_details"
    if any(token in lowered for token in ("limitation", "future work", "caveat", "drift", "局限")):
        return "limitation_boundary"
    if any(token in lowered for token in ("ablation", "component", "without", "remove")):
        return "ablation_interpretation"
    if any(token in lowered for token in ("result", "outperform", "improve", "coverage", "interval length", "gain")):
        return "result_interpretation"
    if any(token in lowered for token in ("baseline", "compare", "comparator")):
        return "comparator_setup"
    if any(token in lowered for token in ("dataset", "benchmark", "metric", "protocol", "split", "auuc", "qini")):
        return "experimental_protocol"
    if any(token in lowered for token in ("contribution", "we propose", "our method", "framework", "model")):
        return "contribution_or_method_claim"
    if any(token in lowered for token in ("existing", "requires", "problem", "gap", "lack", "overfit")):
        return "problem_or_gap"
    # No keyword matched: fall back on the section family.
    if section_type == "method":
        return "method_exposition"
    if section_type == "experiments":
        return "experiment_exposition"
    return "section_exposition"
|
|
549
|
+
|
|
550
|
+
|
|
551
|
+
def split_paragraphs(text: str) -> list[str]:
    """Split text on blank lines into stripped, non-empty paragraph strings."""
    results: list[str] = []
    for block in re.split(r"\n\s*\n", text):
        kept_lines = [ln.strip() for ln in block.splitlines() if ln.strip()]
        if kept_lines:
            results.append("\n".join(kept_lines))
    return results
|
|
559
|
+
|
|
560
|
+
|
|
561
|
+
def caption_match(line: str) -> re.Match[str] | None:
    """Match a caption line, exposing ``kind``, ``id``, and ``caption`` groups."""
    pattern = (
        r"^(?P<kind>Table|TABLE|Figure|FIGURE|Fig\.?)"
        r"\s+(?P<id>[A-Za-z0-9.]+)\s*[:.\-]?\s*(?P<caption>.*)$"
    )
    return re.match(pattern, line.strip())
|
|
566
|
+
|
|
567
|
+
|
|
568
|
+
def classify_asset_role(asset_type: str, caption: str) -> str:
    """Assign an evidence role to a table/figure from keywords in its caption.

    Rules are checked in priority order and the first keyword hit wins. With
    no hit, tables default to "supporting_evidence" and figures to
    "conceptual_visual".
    """
    text = caption.lower()
    keyword_rules = (
        (("ablation", "component"), "ablation"),
        (
            ("coverage", "interval", "trade-off", "tradeoff", "sensitivity", "shift"),
            "uncertainty_or_tradeoff",
        ),
        (("main", "result", "performance", "ranking", "auuc", "qini"), "main_results"),
        (("dataset", "statistics", "summary", "treatment assignment"), "dataset_or_protocol"),
        (("overview", "framework", "pipeline", "architecture", "model"), "method_overview"),
        (("case", "example"), "case_analysis"),
    )
    for keywords, role in keyword_rules:
        if any(keyword in text for keyword in keywords):
            return role
    return "supporting_evidence" if asset_type == "table" else "conceptual_visual"
|
|
583
|
+
|
|
584
|
+
|
|
585
|
+
def caption_needs_continuation(caption: str) -> bool:
    """True when a caption looks truncated: non-empty, short, and not ending in sentence punctuation."""
    trimmed = caption.strip()
    if not trimmed or len(trimmed) >= 140:
        return False
    return re.search(r"[.!?)]$", trimmed) is None
|
|
588
|
+
|
|
589
|
+
|
|
590
|
+
def extend_caption(caption: str, lines: list[str], start_index: int) -> str:
    """Grow a caption that extraction split across several lines.

    Subsequent lines are appended (at most three extra) while the joined
    caption still looks truncated; blank lines are skipped, and page markers,
    new captions, headings, or very long lines end the scan. Whitespace is
    collapsed in the returned string.
    """
    pieces = [caption.strip()]
    cursor = start_index + 1
    while (
        len(pieces) < 4
        and cursor < len(lines)
        and caption_needs_continuation(" ".join(pieces))
    ):
        follower = lines[cursor].strip()
        cursor += 1
        if not follower:
            continue
        hit_boundary = (
            follower.startswith("[Page ")
            or is_caption_line(follower)
            or is_heading_line(follower)
            or len(follower) > 180
        )
        if hit_boundary:
            break
        pieces.append(follower)
    return re.sub(r"\s+", " ", " ".join(pieces)).strip()
|
|
604
|
+
|
|
605
|
+
|
|
606
|
+
def asset_role_text(evidence_role: str) -> tuple[str, str, str, str]:
    """Return (local_role, bridge_before, bridge_after, placement) guidance.

    Pure lookup keyed on the role produced by classify_asset_role(); unknown
    roles fall back to generic "supports a local claim" guidance.
    """
    mapping = {
        "main_results": (
            "answers the primary comparison question",
            "state the comparison question and ranking metric before the table",
            "interpret ranking, margin, and evidence strength after the table",
            "place after protocol, metrics, and comparator definitions",
        ),
        "ablation": (
            "isolates which component supports the claimed gain",
            "name the design choice being tested before the asset",
            "explain the component-level implication after the asset",
            "place after main results so the reader first sees the headline effect",
        ),
        "dataset_or_protocol": (
            "defines the evaluation substrate and comparison scope",
            "tell the reader why the dataset or protocol matters",
            "connect the protocol to the later comparison table",
            "place before baseline and result interpretation",
        ),
        "method_overview": (
            "orients the reader to the method pipeline",
            "introduce the mechanism or module sequence before the figure",
            "map visual elements to later method prose after the figure",
            "place near the beginning of the method section",
        ),
        "uncertainty_or_tradeoff": (
            "explains a robustness, uncertainty, or trade-off pattern",
            "state the trade-off or diagnostic question before the asset",
            "explain the mechanism or limitation revealed by the asset",
            "place after the main result table or in analysis/sensitivity subsections",
        ),
    }
    return mapping.get(
        evidence_role,
        (
            "supports a local subsection claim",
            "state the local question before the asset",
            "explain the takeaway after the asset",
            "place next to the subsection claim it supports",
        ),
    )
|
|
648
|
+
|
|
649
|
+
|
|
650
|
+
def extract_assets(sections: list[SectionRecord]) -> list[AssetRecord]:
    """Collect table/figure caption lines from all sections as AssetRecords.

    Each caption is extended across wrapped lines, classified into an
    evidence role, and annotated with role-specific bridging and placement
    guidance.
    """
    assets: list[AssetRecord] = []
    for section in sections:
        lines = section.content.splitlines()
        for index, line in enumerate(lines):
            match = caption_match(line)
            if not match:
                continue
            raw_kind = match.group("kind").lower()
            # "Fig."/"Figure" normalize to figure; only "Table..." is a table.
            asset_type = "table" if raw_kind.startswith("table") else "figure"
            # Fall back to the whole line when the caption group is empty.
            caption = match.group("caption").strip() or line.strip()
            caption = extend_caption(caption, lines, index)
            evidence_role = classify_asset_role(asset_type, caption)
            local_role, before, after, placement = asset_role_text(evidence_role)
            assets.append(
                AssetRecord(
                    asset_type=asset_type,
                    asset_id=match.group("id"),
                    caption=short_excerpt(caption, 180),
                    appears_in_section=section.section_type,
                    appears_in_title=section.title,
                    evidence_role=evidence_role,
                    local_role=local_role,
                    placement_logic=placement,
                    text_bridge_before=before,
                    text_bridge_after=after,
                    reuse_guidance="Reuse the asset function and placement logic only; do not copy caption wording or claims.",
                )
            )
    return assets
|
|
680
|
+
|
|
681
|
+
|
|
682
|
+
def build_paragraph_roles(sections: list[SectionRecord]) -> list[dict]:
    """Label every paragraph of every section with a functional role record."""
    return [
        {
            "section": section.section_type,
            "section_title": section.title,
            "paragraph_index": position,
            "role": paragraph_role(section.section_type, paragraph),
            "excerpt": short_excerpt(paragraph),
            "reuse_guidance": "Reuse the paragraph function, not the source wording.",
        }
        for section in sections
        for position, paragraph in enumerate(split_paragraphs(section.content), start=1)
    ]
|
|
697
|
+
|
|
698
|
+
|
|
699
|
+
def section_slots(section: SectionRecord, paragraph_roles: list[dict], assets: list[AssetRecord]) -> list[str]:
    """Build the ordered slot list for one section of the template.

    Slots are: the de-numbered section title (unless it is a generic
    experiments/results title), each distinct paragraph-role label, then one
    entry per table/figure that appears in the section.

    Fix: the original deduplicated roles by testing the raw ``role["role"]``
    (e.g. "method_exposition") against ``slots``, which only ever stores the
    humanized label ("method exposition") — so the check never matched and
    repeated roles were appended as duplicates. Compare the humanized label.
    """
    slots: list[str] = []
    title = re.sub(r"^\d+(?:\.\d+)*\.?\s+", "", section.title).strip()
    if title and title.lower() not in {"experiments", "experiment", "results"}:
        slots.append(title.lower())
    for role in paragraph_roles:
        label = role["role"].replace("_", " ")
        if role["section_title"] == section.title and label not in slots:
            slots.append(label)
    for asset in assets:
        if asset.appears_in_title == section.title:
            slots.append(f"{asset.asset_type}: {asset.evidence_role}")
    return slots
|
|
711
|
+
|
|
712
|
+
|
|
713
|
+
def write_paper_artifacts(
    paper_dir: Path,
    source: str,
    materialized_path: Path,
    source_kind: str,
    title: str,
    method: str,
    page_count: int | None,
    sections: list[SectionRecord],
    roles: list[dict],
    assets: list[AssetRecord],
) -> dict:
    """Write the per-paper artifact set (metadata, maps, patterns, report).

    Creates `paper_dir` if needed, emits the JSON/markdown artifacts for one
    reference paper, and returns the metadata dict for aggregation.
    """
    paper_dir.mkdir(parents=True, exist_ok=True)

    def dump_json(filename: str, payload) -> None:
        # Single writer so every JSON artifact shares identical formatting.
        (paper_dir / filename).write_text(
            json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8"
        )

    metadata = {
        "source": source,
        "source_kind": source_kind,
        "materialized_path": str(materialized_path),
        "title": title,
        "extraction_method": method,
        "page_count": page_count,
        "section_count": len(sections),
        "visual_asset_count": len(assets),
    }
    dump_json("metadata.json", metadata)

    # Section map strips the raw content field: structure only, no text reuse.
    section_map = [
        {key: value for key, value in asdict(section).items() if key != "content"}
        for section in sections
    ]
    dump_json("section-map.json", section_map)
    dump_json("paragraph-roles.json", roles)
    dump_json("visual-assets.json", [asdict(asset) for asset in assets])

    write_section_logic(paper_dir / "section-logic.md", title, sections, roles, assets)
    write_writing_patterns(paper_dir / "writing-patterns.md", title, sections, roles, assets)

    report_lines = [
        f"# Extraction Report: {title}",
        "",
        f"- Source: `{source}`",
        f"- Extraction method: `{method}`",
        f"- Sections detected: {len(sections)}",
        f"- Visual/table assets detected: {len(assets)}",
        "- Boundary: this artifact extracts reusable structure only; it must not be used as evidence for the current paper.",
    ]
    (paper_dir / "extraction-report.md").write_text(
        "\n".join(report_lines) + "\n", encoding="utf-8"
    )
    return metadata
|
|
775
|
+
|
|
776
|
+
|
|
777
|
+
def write_section_logic(path: Path, title: str, sections: list[SectionRecord], roles: list[dict], assets: list[AssetRecord]) -> None:
    """Render a markdown outline of each section's paragraph and asset roles."""
    out = [f"# Section Logic: {title}", ""]
    for section in sections:
        out.extend([f"## {section.title}", "", f"- Section type: `{section.section_type}`"])
        matching_roles = [entry for entry in roles if entry["section_title"] == section.title]
        if matching_roles:
            # dict.fromkeys de-duplicates while keeping first-seen order.
            unique_roles = dict.fromkeys(entry["role"] for entry in matching_roles)
            out.append(f"- Paragraph roles: {', '.join(unique_roles)}")
        matching_assets = [asset for asset in assets if asset.appears_in_title == section.title]
        if matching_assets:
            joined = ", ".join(f"{asset.asset_type}:{asset.evidence_role}" for asset in matching_assets)
            out.append(f"- Asset roles: {joined}")
        out.append("- Reuse: preserve the slot order and rhetorical function, not the wording.")
        out.append("")
    path.write_text("\n".join(out).strip() + "\n", encoding="utf-8")
|
|
794
|
+
|
|
795
|
+
|
|
796
|
+
def write_writing_patterns(path: Path, title: str, sections: list[SectionRecord], roles: list[dict], assets: list[AssetRecord]) -> None:
    """Render reuse rules, observed section slots, and asset patterns as markdown.

    NOTE(review): `roles` is currently unused here; kept for signature parity
    with the other writers.
    """
    doc = [
        f"# Writing Patterns: {title}",
        "",
        "## Template Use",
        "",
        "- Use this paper as one structural template among several references.",
        "- Reproduce section slots, paragraph functions, and asset placement logic only.",
        "- Do not copy wording, claims, metrics, or conclusions.",
        "",
        "## Observed Section Slots",
        "",
    ]
    doc.extend(f"- {section.section_type}: {section.title}" for section in sections)
    doc.extend(["", "## Visual/Table Pattern", ""])
    if assets:
        doc.extend(
            f"- {asset.asset_type} {asset.asset_id}: {asset.evidence_role} -> {asset.local_role}"
            for asset in assets
        )
    else:
        doc.append("- No table or figure caption was detected.")
    path.write_text("\n".join(doc).strip() + "\n", encoding="utf-8")
|
|
817
|
+
|
|
818
|
+
|
|
819
|
+
def merge_unique(values: list[str]) -> list[str]:
    """Whitespace-normalize strings and de-duplicate them case-insensitively.

    Empty (post-normalization) entries are dropped; the first-seen casing of
    each distinct value is kept, in input order.
    """
    merged: list[str] = []
    seen_keys: set[str] = set()
    for raw in values:
        cleaned = re.sub(r"\s+", " ", raw.strip())
        if not cleaned:
            continue
        fingerprint = cleaned.lower()
        if fingerprint in seen_keys:
            continue
        seen_keys.add(fingerprint)
        merged.append(cleaned)
    return merged
|
|
830
|
+
|
|
831
|
+
|
|
832
|
+
def normalized_section_title(title: str) -> str:
    """Strip numbering and appendix prefixes, then lowercase the heading.

    Handles "3.1. Datasets", "B.2. Details", and "Appendix B: Proofs" style
    prefixes; collapses internal whitespace.
    """
    cleaned = title.strip()
    numbering_prefixes = (
        r"^\d+(?:\.\d+)*\.?\s+",          # "3.1. Datasets"
        r"^[A-Z]\.\d+(?:\.\d+)*\.?\s+",   # "B.2. Details"
    )
    for pattern in numbering_prefixes:
        cleaned = re.sub(pattern, "", cleaned)
    cleaned = re.sub(r"^Appendix\s+[A-Z](?:\.\d+)*\.?\s*[:\-]?\s*", "", cleaned, flags=re.IGNORECASE)
    return re.sub(r"\s+", " ", cleaned).strip().lower()
|
|
837
|
+
|
|
838
|
+
|
|
839
|
+
def experiment_slots_from_signal(section: SectionRecord, role: str, text: str) -> list[str]:
    """Map a paragraph/asset signal onto experiment-protocol slot names.

    Matching is keyword-driven over the normalized title plus lowered text;
    slots are returned in a fixed canonical order with duplicates removed.
    """
    heading = normalized_section_title(section.title)
    haystack = f"{heading} {text.lower()}"
    in_appendix = section.section_type == "appendix"

    def mentions(*tokens: str) -> bool:
        # Substring match against title+text; intentionally loose.
        return any(token in haystack for token in tokens)

    # (slot name, condition) pairs evaluated in canonical slot order.
    checks: list[tuple[str, bool]] = [
        (
            "appendix_dataset_statistics",
            in_appendix
            and "dataset" in haystack
            and mentions("statistics", "statistic", "summary", "sample", "feature"),
        ),
        (
            "appendix_baseline_metric_details",
            in_appendix and mentions("baseline", "metric", "implementation", "hyperparameter"),
        ),
        (
            "dataset_statistics",
            role == "dataset_statistics"
            or ("dataset" in haystack and mentions("statistics", "summary", "sample", "feature")),
        ),
        (
            "dataset_description",
            role == "dataset_description" or "datasets" in heading or "dataset" in heading,
        ),
        (
            "split_protocol",
            role == "split_protocol"
            or mentions("split", "seed", "sampling", "train/test", "validation", "protocol"),
        ),
        (
            "baseline_setup",
            role == "baseline_setup" or mentions("baseline", "comparator"),
        ),
        (
            "metric_definition",
            role == "metric_definition" or mentions("metric", "auuc", "qini"),
        ),
        (
            "implementation_details",
            role == "implementation_details"
            or mentions("implementation", "hyperparameter", "tuning", "epoch", "learning rate", "hardware"),
        ),
        ("ablation", mentions("ablation")),
        ("sensitivity", mentions("sensitivity", "shift", "trade-off", "tradeoff")),
        (
            "main_results",
            role == "result_interpretation"
            or mentions("main result", "overall performance", "performance", "results and discussion"),
        ),
    ]
    matched = [slot for slot, hit in checks if hit]
    return list(dict.fromkeys(matched))
|
|
880
|
+
|
|
881
|
+
|
|
882
|
+
def is_experiment_protocol_section(section: SectionRecord) -> bool:
    """Return True when the section can carry experiment-protocol slots.

    Experiments/discussion sections always qualify; appendix sections qualify
    only when the normalized heading mentions a protocol keyword. Everything
    else is excluded.
    """
    kind = section.section_type
    if kind in {"experiments", "discussion"}:
        return True
    if kind != "appendix":
        return False
    protocol_tokens = (
        "dataset",
        "baseline",
        "metric",
        "experiment",
        "experimental",
        "setup",
        "result",
        "ablation",
        "sensitivity",
        "complexity",
        "online",
    )
    heading = normalized_section_title(section.title)
    return any(token in heading for token in protocol_tokens)
|
|
904
|
+
|
|
905
|
+
|
|
906
|
+
def slot_payload(
    *,
    source_paper: str,
    slot: str,
    section: SectionRecord,
    evidence_excerpt: str,
    paragraph_index: int | None = None,
    asset_type: str | None = None,
    asset_id: str | None = None,
) -> dict:
    """Build one experiment-protocol slot record.

    Asset identifiers are attached only when both `asset_type` and `asset_id`
    are provided. Raises KeyError if `slot` has no guidance entry.
    """
    guidance = EXPERIMENT_PROTOCOL_SLOT_GUIDANCE[slot]
    # Appendix slots point back at the main experiments section they support.
    linked_section = "experiments" if slot.startswith("appendix_") else ""
    record = {
        "slot": slot,
        "source_paper": source_paper,
        "source_heading": section.title,
        "source_section_type": section.section_type,
        "paragraph_index": paragraph_index,
        "evidence_excerpt": short_excerpt(evidence_excerpt),
        "reader_question": guidance["reader_question"],
        "placement_guidance": guidance["placement_guidance"],
        "linked_main_section": linked_section,
        "reuse_guidance": "Reuse this protocol role and placement logic only; do not copy wording, claims, metrics, data, or conclusions.",
    }
    if asset_type and asset_id:
        record.update(asset_type=asset_type, asset_id=asset_id)
    return record
|
|
933
|
+
|
|
934
|
+
|
|
935
|
+
def build_experiment_protocol_slots_for_payload(payload: dict) -> list[dict]:
    """Collect experiment-protocol slot records for one paper's payload.

    Scans paragraph roles and assets whose sections qualify as protocol
    sections, builds slot payloads, and de-duplicates them on
    (paper, slot, heading, excerpt) keeping first-seen order.
    """
    by_title = {section.title: section for section in payload["sections"]}
    collected: list[dict] = []

    def protocol_section(title: str):
        # Resolve a heading to its section record, but only when that section
        # is eligible to carry protocol slots.
        candidate = by_title.get(title)
        if candidate and is_experiment_protocol_section(candidate):
            return candidate
        return None

    for role in payload["roles"]:
        section = protocol_section(role["section_title"])
        if section is None:
            continue
        for slot in experiment_slots_from_signal(section, role["role"], role["excerpt"]):
            collected.append(
                slot_payload(
                    source_paper=payload["slug"],
                    slot=slot,
                    section=section,
                    paragraph_index=role["paragraph_index"],
                    evidence_excerpt=role["excerpt"],
                )
            )

    for asset in payload["assets"]:
        section = protocol_section(asset.appears_in_title)
        if section is None:
            continue
        asset_slots = experiment_slots_from_signal(section, asset.evidence_role, asset.caption)
        if asset.evidence_role == "dataset_or_protocol" and section.section_type == "appendix":
            # Appendix dataset/protocol assets are always filed as dataset statistics.
            asset_slots = ["appendix_dataset_statistics"]
        for slot in asset_slots:
            collected.append(
                slot_payload(
                    source_paper=payload["slug"],
                    slot=slot,
                    section=section,
                    evidence_excerpt=asset.caption,
                    asset_type=asset.asset_type,
                    asset_id=asset.asset_id,
                )
            )

    # De-duplicate while keeping the first occurrence in insertion order.
    deduped: dict[tuple[str, str, str, str], dict] = {}
    for item in collected:
        key = (
            item["source_paper"],
            item["slot"],
            item["source_heading"].lower(),
            item["evidence_excerpt"].lower(),
        )
        deduped.setdefault(key, item)
    return list(deduped.values())
|
|
996
|
+
|
|
997
|
+
|
|
998
|
+
def build_section_templates(output_dir: Path, paper_payloads: list[dict]) -> None:
    """Aggregate per-paper structure into one template JSON per section type.

    For each canonical section type, collects source papers, observed titles,
    section slots, paragraph roles, and asset roles across all reference
    papers. The "experiments" template additionally folds in "discussion"
    sections, gains experiment-protocol slots, and is mirrored into
    `experiments-protocol.json`.
    """
    target = output_dir / "section-templates"
    target.mkdir(parents=True, exist_ok=True)
    section_types = ["abstract", "introduction", "related-work", "method", "experiments", "discussion", "conclusion"]
    for section_type in section_types:
        source_papers: list[str] = []
        observed_titles: list[str] = []
        slots: list[str] = []
        paragraph_roles: list[str] = []
        asset_roles: list[dict] = []
        for payload in paper_payloads:
            matching_sections = [section for section in payload["sections"] if section.section_type == section_type]
            if section_type == "experiments":
                # Discussion sections often carry experiment narrative; fold them in.
                matching_sections.extend(
                    section for section in payload["sections"] if section.section_type in {"discussion"}
                )
            if not matching_sections:
                continue
            source_papers.append(payload["slug"])
            for section in matching_sections:
                observed_titles.append(section.title)
                slots.extend(section_slots(section, payload["roles"], payload["assets"]))
            # Fix: these two collections do not depend on the individual
            # section, so they run once per paper. Previously they ran once
            # per matching section, duplicating asset_roles entries in the
            # emitted JSON (paragraph roles were only rescued by merge_unique).
            paragraph_roles.extend(
                role["role"] for role in payload["roles"] if role["section"] == section_type
            )
            asset_roles.extend(
                {
                    "asset_type": asset.asset_type,
                    "evidence_role": asset.evidence_role,
                    "local_role": asset.local_role,
                    "source_paper": payload["slug"],
                }
                for asset in payload["assets"]
                if asset.appears_in_section == section_type
                or (section_type == "experiments" and asset.appears_in_section in {"experiments", "discussion"})
            )
        if not source_papers:
            # No reference paper exhibited this section type; skip the file.
            continue
        template = {
            "section": section_type,
            "template_id": f"{section_type}-multi-reference-template",
            "source_papers": merge_unique(source_papers),
            "observed_titles": merge_unique(observed_titles),
            "section_slots": merge_unique(slots),
            "paragraph_roles": merge_unique(paragraph_roles),
            "asset_roles": asset_roles,
            "reuse_rule": "Reuse structure only; do not copy wording, claims, metrics, or conclusions from reference papers.",
        }
        if section_type == "experiments":
            protocol_slots: list[dict] = []
            for payload in paper_payloads:
                protocol_slots.extend(build_experiment_protocol_slots_for_payload(payload))
            template["experiment_protocol_slots"] = protocol_slots
        (target / f"{section_type}.json").write_text(
            json.dumps(template, indent=2, ensure_ascii=False),
            encoding="utf-8",
        )
        if section_type == "experiments":
            (target / "experiments-protocol.json").write_text(
                json.dumps(
                    {
                        "section": "experiments",
                        "template_id": "experiments-protocol-slots",
                        "source_papers": template["source_papers"],
                        "experiment_protocol_slots": template["experiment_protocol_slots"],
                        "reuse_rule": "Reuse experiment setup topology only: dataset, split, baseline, metric, implementation, result, ablation, sensitivity, and appendix-link roles.",
                    },
                    indent=2,
                    ensure_ascii=False,
                ),
                encoding="utf-8",
            )
|
|
1070
|
+
|
|
1071
|
+
|
|
1072
|
+
def build_visual_templates(output_dir: Path, paper_payloads: list[dict]) -> None:
    """Aggregate experiment/method/discussion asset roles into one visual template file."""
    target = output_dir / "visual-templates"
    target.mkdir(parents=True, exist_ok=True)
    relevant_sections = {"experiments", "discussion", "method"}
    collected: list[dict] = []
    for payload in paper_payloads:
        for asset in payload["assets"]:
            if asset.appears_in_section not in relevant_sections:
                continue
            record = asdict(asset)
            record["source_paper"] = payload["slug"]
            collected.append(record)
    template = {
        "template_id": "experiment-visual-and-table-template",
        "asset_roles": collected,
        "reuse_rule": "Use table and figure functions, placement, and bridge logic to plan current paper assets; do not copy captions or data.",
    }
    (target / "experiment-assets.json").write_text(
        json.dumps(template, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
|
|
1091
|
+
|
|
1092
|
+
|
|
1093
|
+
def write_aggregate_playbook(output_dir: Path, paper_payloads: list[dict]) -> None:
    """Write the cross-paper template playbook consumed by `/lab:write`."""
    header = [
        "# Aggregate Template Playbook",
        "",
        "Purpose: help `/lab:write` reproduce mature paper structure from multiple reference templates.",
        "",
        "## Sources",
        "",
    ]
    source_lines = [f"- `{payload['slug']}`: {payload['title']}" for payload in paper_payloads]
    body = [
        "",
        "## Reuse Boundary",
        "",
        "- Reproduce section order, paragraph roles, table/figure function, and bridge logic.",
        "- Do not copy wording, claims, experimental conclusions, metrics, or terminology names.",
        "- If only one reference supports a structure, treat it as a single-template pattern, not a universal rule.",
        "",
        "## Multi-Template Write Procedure",
        "",
        "1. Pick 2-3 closest section templates for the current paper section.",
        "2. For experiment sections, preserve protocol slots when present: datasets, splits, baselines, metrics, implementation details, main results, ablations, sensitivity analysis, and appendix links.",
        "3. Build a mini-outline from common slots and current-paper evidence.",
        "4. Add required table/figure assets with local before/after bridge functions.",
        "5. Draft with current-paper terminology and evidence only.",
        "",
        "## Table/Figure Planning Rule",
        "",
        "Every major table or figure should answer a reader question, appear near the subsection claim it supports, and have one bridge sentence before and one interpretation sentence after it.",
    ]
    content = "\n".join(header + source_lines + body) + "\n"
    (output_dir / "aggregate-template-playbook.md").write_text(content, encoding="utf-8")
|
|
1127
|
+
|
|
1128
|
+
|
|
1129
|
+
def process_source(source: str, output_dir: Path, cache_dir: Path) -> dict:
    """Extract one reference paper and write its per-paper artifacts.

    Returns the payload (slug, title, metadata, sections, roles, assets) used
    later for cross-paper aggregation. Raises FileNotFoundError when the
    materialized source path does not exist.
    """
    local_path, source_kind = materialize_source(source, cache_dir)
    if not local_path.exists():
        raise FileNotFoundError(f"source not found: {source}")

    text, method, page_count = read_source_text(local_path)
    title = detect_title(text, local_path.stem)
    slug = slugify(title or local_path.stem)
    sections = split_sections(text)
    roles = build_paragraph_roles(sections)
    assets = extract_assets(sections)

    metadata = write_paper_artifacts(
        output_dir / slug,
        source,
        local_path,
        source_kind,
        title,
        method,
        page_count,
        sections,
        roles,
        assets,
    )
    return {
        "slug": slug,
        "title": title,
        "metadata": metadata,
        "sections": sections,
        "roles": roles,
        "assets": assets,
    }
|
|
1159
|
+
|
|
1160
|
+
|
|
1161
|
+
def main() -> int:
    """CLI entry: extract templates per source, then build aggregate artifacts.

    Returns 0 on success, or the exit status of a `uv`+pypdf re-run when PDF
    extraction had to be bootstrapped. Other errors propagate.
    """
    options = parse_args()
    out_root = Path(options.output_dir).expanduser().resolve()
    out_root.mkdir(parents=True, exist_ok=True)
    cache_root = out_root / ".cache"

    payloads: list[dict] = []
    entries: list[dict] = []
    try:
        for raw_source in options.sources:
            payload = process_source(raw_source, out_root, cache_root)
            payloads.append(payload)
            entries.append(
                {
                    "source": raw_source,
                    "slug": payload["slug"],
                    "title": payload["title"],
                    "artifact_dir": str(out_root / payload["slug"]),
                }
            )
    except RuntimeError as exc:
        # A PDF extraction failure may be recoverable by re-running this
        # script under uv with pypdf available; otherwise re-raise as-is.
        if "No PDF text extractor succeeded" in str(exc):
            rerun_status = rerun_with_uv_pypdf()
            if rerun_status is not None:
                return rerun_status
        raise

    (out_root / "sources.json").write_text(
        json.dumps({"sources": entries}, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    build_section_templates(out_root, payloads)
    build_visual_templates(out_root, payloads)
    write_aggregate_playbook(out_root, payloads)
    print(f"reference template patterns written to {out_root}")
    return 0
|
|
1197
|
+
|
|
1198
|
+
|
|
1199
|
+
# Script entry point: propagate main()'s integer return as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|