superlab 0.1.64 → 0.1.65
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/lab_write_contract.json +4 -4
- package/package-assets/claude/commands/lab/write.md +1 -1
- package/package-assets/claude/commands/lab-write.md +1 -1
- package/package-assets/claude/commands/lab:write.md +1 -1
- package/package-assets/claude/commands/lab：write.md +1 -1
- package/package-assets/codex/prompts/lab/write.md +1 -1
- package/package-assets/codex/prompts/lab-write.md +1 -1
- package/package-assets/codex/prompts/lab:write.md +1 -1
- package/package-assets/codex/prompts/lab：write.md +1 -1
- package/package-assets/shared/lab/.managed/scripts/extract_reference_paper_structure.py +910 -0
- package/package-assets/shared/lab/.managed/scripts/validate_manuscript_delivery.py +57 -0
- package/package-assets/shared/lab/.managed/scripts/validate_section_draft.py +106 -0
- package/package-assets/shared/lab/.managed/templates/reference-template-intake.md +40 -0
- package/package-assets/shared/lab/.managed/templates/write-iteration.md +27 -0
- package/package-assets/shared/skills/lab/stages/write.md +17 -0
- package/package.json +1 -1
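
The headline addition is `extract_reference_paper_structure.py`, a standalone CLI. As a minimal invocation sketch (the script path, source URL, and output directory here are hypothetical; the argument shape follows `parse_args` in the diff below):

```python
# Hypothetical invocation of the new extractor script. Equivalent shell:
#   python3 extract_reference_paper_structure.py <sources...> --output-dir <dir>
import subprocess
import sys

result = subprocess.run(
    [
        sys.executable,
        "package/package-assets/shared/lab/.managed/scripts/extract_reference_paper_structure.py",
        "https://example.org/reference-paper.pdf",  # local paper path or http(s) PDF/HTML URL
        "--output-dir",
        "reference-patterns",
    ],
    check=False,
)
print("exit code:", result.returncode)
```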
package/package-assets/shared/lab/.managed/scripts/extract_reference_paper_structure.py:

@@ -0,0 +1,910 @@
+#!/usr/bin/env python3
+"""Extract reusable writing templates from reference papers.
+
+This script is intentionally lightweight. It extracts structure, paragraph
+roles, and visual/table roles for `/lab:write`; it does not summarize or copy
+paper content.
+"""
+
+from __future__ import annotations
+
+import argparse
+import hashlib
+import html
+import json
+import os
+import re
+import shutil
+import subprocess
+import sys
+import tempfile
+import urllib.parse
+import urllib.request
+from dataclasses import asdict, dataclass
+from pathlib import Path
+
+
+SECTION_ALIASES = {
+    "abstract": ("abstract", "摘要"),
+    "introduction": ("introduction", "intro", "引言", "绪论"),
+    "related-work": ("related work", "background", "literature", "相关工作", "文献综述"),
+    "method": (
+        "method",
+        "methods",
+        "methodology",
+        "approach",
+        "model",
+        "algorithm",
+        "方法",
+        "模型",
+        "算法",
+    ),
+    "experiments": (
+        "experiment",
+        "experiments",
+        "evaluation",
+        "empirical",
+        "results",
+        "main results",
+        "ablation",
+        "sensitivity",
+        "coverage",
+        "实验",
+        "评估",
+        "结果",
+    ),
+    "discussion": ("discussion", "analysis", "讨论", "分析"),
+    "conclusion": ("conclusion", "conclusions", "future work", "结论", "总结"),
+    "references": ("references", "bibliography", "参考文献"),
+    "appendix": ("appendix", "supplement", "附录"),
+}
+
+CANONICAL_HEADING_TITLES = {
+    "abstract",
+    "introduction",
+    "related work",
+    "background",
+    "method",
+    "methods",
+    "methodology",
+    "approach",
+    "model",
+    "algorithm",
+    "experiments",
+    "experiment",
+    "experimental setup",
+    "experimental setups",
+    "evaluation",
+    "results",
+    "main results",
+    "ablation study",
+    "sensitivity analysis",
+    "discussion",
+    "conclusion",
+    "conclusions",
+    "references",
+    "bibliography",
+    "appendix",
+}
+
+
+@dataclass
+class SectionRecord:
+    title: str
+    section_type: str
+    level: int
+    start_line: int
+    end_line: int
+    start_page: int | None
+    end_page: int | None
+    content: str
+
+
+@dataclass
+class AssetRecord:
+    asset_type: str
+    asset_id: str
+    caption: str
+    appears_in_section: str
+    appears_in_title: str
+    evidence_role: str
+    local_role: str
+    placement_logic: str
+    text_bridge_before: str
+    text_bridge_after: str
+    reuse_guidance: str
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description="Extract reference-paper structure templates for /lab:write."
+    )
+    parser.add_argument("sources", nargs="+", help="Local paper paths or http(s) PDF/HTML URLs")
+    parser.add_argument(
+        "--output-dir",
+        required=True,
+        help="Directory to write reference-pattern artifacts",
+    )
+    return parser.parse_args()
+
+
+def slugify(value: str) -> str:
+    value = re.sub(r"\.[A-Za-z0-9]+$", "", value)
+    slug = re.sub(r"[^A-Za-z0-9]+", "-", value).strip("-").lower()
+    return slug[:80] or "reference-paper"
+
+
+def short_excerpt(text: str, limit: int = 240) -> str:
+    cleaned = re.sub(r"\s+", " ", text).strip()
+    if len(cleaned) <= limit:
+        return cleaned
+    return cleaned[: limit - 3].rstrip() + "..."
+
+
+def materialize_source(source: str, cache_dir: Path) -> tuple[Path, str]:
+    parsed = urllib.parse.urlparse(source)
+    if parsed.scheme in {"http", "https"}:
+        cache_dir.mkdir(parents=True, exist_ok=True)
+        suffix = Path(parsed.path).suffix or ".pdf"
+        name = slugify(Path(parsed.path).name or parsed.netloc)
+        digest = hashlib.sha256(source.encode("utf-8")).hexdigest()[:10]
+        target = cache_dir / f"{name}-{digest}{suffix}"
+        urllib.request.urlretrieve(source, target)
+        return target, "url"
+    return Path(source).expanduser().resolve(), "local"
+
+
+def extract_pdf_text(path: Path) -> tuple[str, str, int | None]:
+    errors: list[str] = []
+
+    try:
+        from pypdf import PdfReader  # type: ignore
+
+        reader = PdfReader(str(path))
+        page_texts = []
+        for index, page in enumerate(reader.pages, start=1):
+            page_texts.append(f"[Page {index}]\n{page.extract_text() or ''}")
+        return "\n\n".join(page_texts), "pypdf", len(reader.pages)
+    except Exception as exc:
+        errors.append(f"pypdf: {exc}")
+
+    try:
+        import pdfplumber  # type: ignore
+
+        page_texts = []
+        with pdfplumber.open(str(path)) as pdf:
+            for index, page in enumerate(pdf.pages, start=1):
+                page_texts.append(f"[Page {index}]\n{page.extract_text() or ''}")
+        return "\n\n".join(page_texts), "pdfplumber", len(pdf.pages)
+    except Exception as exc:
+        errors.append(f"pdfplumber: {exc}")
+
+    try:
+        import fitz  # type: ignore
+
+        doc = fitz.open(str(path))
+        page_texts = [f"[Page {i + 1}]\n{page.get_text()}" for i, page in enumerate(doc)]
+        page_count = len(doc)
+        doc.close()
+        return "\n\n".join(page_texts), "pymupdf", page_count
+    except Exception as exc:
+        errors.append(f"pymupdf: {exc}")
+
+    if shutil.which("pdftotext"):
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            out_path = Path(tmp_dir) / "paper.txt"
+            command = ["pdftotext", "-layout", str(path), str(out_path)]
+            completed = subprocess.run(command, capture_output=True, text=True, check=False)
+            if completed.returncode == 0:
+                return out_path.read_text(encoding="utf-8", errors="ignore"), "pdftotext", None
+            errors.append(f"pdftotext: {completed.stderr.strip()}")
+
+    raise RuntimeError(
+        "No PDF text extractor succeeded. Install pypdf, pdfplumber, PyMuPDF, or pdftotext. "
+        + " | ".join(errors)
+    )
+
+
+def find_uv() -> str | None:
+    uv_path = shutil.which("uv")
+    if uv_path:
+        return uv_path
+    local_uv = Path.home() / ".local" / "bin" / "uv"
+    if local_uv.exists():
+        return str(local_uv)
+    return None
+
+
+def rerun_with_uv_pypdf() -> int | None:
+    if os.environ.get("LAB_REFERENCE_PDF_BOOTSTRAPPED"):
+        return None
+    uv_path = find_uv()
+    if not uv_path:
+        return None
+    env = os.environ.copy()
+    env["LAB_REFERENCE_PDF_BOOTSTRAPPED"] = "1"
+    command = [
+        uv_path,
+        "run",
+        "--with",
+        "pypdf",
+        "python3",
+        str(Path(__file__).resolve()),
+        *sys.argv[1:],
+    ]
+    return subprocess.run(command, env=env, check=False).returncode
+
+
+def strip_html(text: str) -> str:
+    text = re.sub(r"(?is)<(script|style).*?</\1>", " ", text)
+    text = re.sub(r"(?i)<br\s*/?>", "\n", text)
+    text = re.sub(r"(?i)</(p|div|section|article|h[1-6]|li|tr)>", "\n", text)
+    text = re.sub(r"<[^>]+>", " ", text)
+    return html.unescape(re.sub(r"\n{3,}", "\n\n", text))
+
+
+def read_source_text(path: Path) -> tuple[str, str, int | None]:
+    suffix = path.suffix.lower()
+    if suffix == ".pdf":
+        return extract_pdf_text(path)
+    raw = path.read_text(encoding="utf-8", errors="ignore")
+    if suffix in {".html", ".htm"}:
+        return strip_html(raw), "html-text", None
+    return raw, "plain-text", None
+
+
+def detect_title(text: str, fallback: str) -> str:
+    for line in text.splitlines():
+        stripped = re.sub(r"^#+\s*", "", line).strip()
+        if not stripped or stripped.startswith("[Page "):
+            continue
+        if is_caption_line(stripped):
+            continue
+        if len(stripped) < 180:
+            return stripped
+    return fallback
+
+
+def normalize_heading(line: str) -> str:
+    line = re.sub(r"^#+\s*", "", line.strip())
+    return line.strip()
+
+
+def section_type_for_title(title: str) -> str:
+    lowered = title.lower()
+    lowered = re.sub(r"^\d+(?:\.\d+)*\.?\s+", "", lowered).strip()
+    for section_type, aliases in SECTION_ALIASES.items():
+        if any(alias in lowered for alias in aliases):
+            return section_type
+    return "other"
+
+
+def heading_level(title: str) -> int:
+    markdown = re.match(r"^(#{1,6})\s+", title)
+    if markdown:
+        return len(markdown.group(1))
+    numeric = re.match(r"^(\d+(?:\.\d+)*)\.?\s+", title.strip())
+    if numeric:
+        return min(1 + numeric.group(1).count("."), 4)
+    return 1
+
+
+def is_caption_line(line: str) -> bool:
+    return bool(re.match(r"^(?:Table|TABLE|Figure|FIGURE|Fig\.?)\s+[A-Za-z0-9.]+", line.strip()))
+
+
+def is_heading_line(line: str) -> bool:
+    stripped = line.strip()
+    if not stripped or stripped.startswith("[Page ") or is_caption_line(stripped):
+        return False
+    if len(stripped) > 140:
+        return False
+    if stripped.endswith((".", ",", ";", ":")) and not re.match(r"^\d+(?:\.\d+)*\.?\s+", stripped):
+        return False
+    if re.match(r"^#{1,4}\s+\S", stripped):
+        return True
+    lowered = re.sub(r"^\d+(?:\.\d+)*\.?\s+", "", stripped.lower()).strip()
+    lowered = re.sub(r"^[ivx]+\.\s+", "", lowered).strip()
+    lowered = re.sub(r"^appendix\s+[a-z](?:\.\d+)*\.?\s*", "", lowered).strip()
+    if lowered in CANONICAL_HEADING_TITLES:
+        return True
+    if re.match(r"^\d+(?:\.\d+)*\.?\s+[A-Z][A-Za-z0-9 ,:;()/-]{2,}$", stripped):
+        return True
+    if re.match(r"^[A-Z]\.\d+(?:\.\d+)*\.?\s+[A-Z][A-Za-z0-9 ,:;()/-]{2,}$", stripped):
+        return True
+    if re.match(r"^Appendix\s+[A-Z](?:\.\d+)*\.?\s*[:\-]?\s*[A-Z][A-Za-z0-9 ,:;()/-]{2,}$", stripped):
+        return True
+    if re.match(r"^[IVX]+\.\s+[A-Z][A-Za-z0-9 ,:;()/-]{2,}$", stripped):
+        return True
+    return False
+
+
+def split_sections(text: str) -> list[SectionRecord]:
+    lines = text.splitlines()
+    headings: list[tuple[int, str, int | None]] = []
+    current_page: int | None = None
+    line_pages: dict[int, int | None] = {}
+
+    for index, line in enumerate(lines):
+        page_match = re.match(r"^\[Page\s+(\d+)\]$", line.strip())
+        if page_match:
+            current_page = int(page_match.group(1))
+        line_pages[index] = current_page
+        if is_heading_line(line):
+            headings.append((index, normalize_heading(line), current_page))
+
+    if not headings:
+        content = "\n".join(line for line in lines if not line.startswith("[Page "))
+        return [
+            SectionRecord(
+                title="Full Text",
+                section_type="other",
+                level=1,
+                start_line=1,
+                end_line=len(lines),
+                start_page=None,
+                end_page=None,
+                content=content,
+            )
+        ]
+
+    sections: list[SectionRecord] = []
+    for pos, (start_index, title, start_page) in enumerate(headings):
+        end_index = headings[pos + 1][0] - 1 if pos + 1 < len(headings) else len(lines) - 1
+        content_lines = [
+            line for line in lines[start_index + 1 : end_index + 1] if not line.startswith("[Page ")
+        ]
+        end_page = line_pages.get(end_index)
+        sections.append(
+            SectionRecord(
+                title=title,
+                section_type=section_type_for_title(title),
+                level=heading_level(title),
+                start_line=start_index + 1,
+                end_line=end_index + 1,
+                start_page=start_page,
+                end_page=end_page,
+                content="\n".join(content_lines).strip(),
+            )
+        )
+    infer_section_types(sections)
+    return sections
+
+
+def infer_section_types(sections: list[SectionRecord]) -> None:
+    """Recover section families when PDF text only preserves numbered headings.
+
+    Many academic PDF extractors keep subsection titles such as "4.2 Feature
+    Selection" but lose the visual hierarchy that tells us they belong to the
+    Method section. For template extraction, section family is more useful than
+    the literal heading string, so unknown subsections inherit from their nearest
+    known parent. Unknown top-level technical sections before Experiments are
+    treated as method-style sections, which matches common ML paper structure.
+    """
+
+    first_experiment_index = next(
+        (index for index, section in enumerate(sections) if section.section_type == "experiments"),
+        None,
+    )
+    first_intro_index = next(
+        (index for index, section in enumerate(sections) if section.section_type == "introduction"),
+        None,
+    )
+    parent_by_level: dict[int, str] = {}
+    inheritable = {"related-work", "method", "experiments", "discussion", "conclusion", "appendix"}
+
+    for index, section in enumerate(sections):
+        for level in list(parent_by_level):
+            if level >= section.level:
+                parent_by_level.pop(level, None)
+
+        if section.section_type == "other":
+            inherited = None
+            for level in range(section.level - 1, 0, -1):
+                parent_type = parent_by_level.get(level)
+                if parent_type in inheritable:
+                    inherited = parent_type
+                    break
+            if inherited:
+                section.section_type = inherited
+            elif (
+                section.level == 1
+                and first_experiment_index is not None
+                and index < first_experiment_index
+                and (first_intro_index is None or index > first_intro_index)
+            ):
+                section.section_type = "method"
+
+        parent_by_level[section.level] = section.section_type
+
+
+def paragraph_role(section_type: str, paragraph: str) -> str:
+    lowered = paragraph.lower()
+    if is_caption_line(paragraph):
+        return "visual_or_table_anchor"
+    if section_type == "abstract":
+        return "abstract_summary"
+    if any(token in lowered for token in ("limitation", "future work", "caveat", "drift", "局限")):
+        return "limitation_boundary"
+    if any(token in lowered for token in ("ablation", "component", "without", "remove")):
+        return "ablation_interpretation"
+    if any(token in lowered for token in ("result", "outperform", "improve", "coverage", "interval length", "gain")):
+        return "result_interpretation"
+    if any(token in lowered for token in ("baseline", "compare", "comparator")):
+        return "comparator_setup"
+    if any(token in lowered for token in ("dataset", "benchmark", "metric", "protocol", "split", "auuc", "qini")):
+        return "experimental_protocol"
+    if any(token in lowered for token in ("contribution", "we propose", "our method", "framework", "model")):
+        return "contribution_or_method_claim"
+    if any(token in lowered for token in ("existing", "requires", "problem", "gap", "lack", "overfit")):
+        return "problem_or_gap"
+    if section_type == "method":
+        return "method_exposition"
+    if section_type == "experiments":
+        return "experiment_exposition"
+    return "section_exposition"
+
+
+def split_paragraphs(text: str) -> list[str]:
+    chunks = re.split(r"\n\s*\n", text)
+    paragraphs: list[str] = []
+    for chunk in chunks:
+        cleaned = "\n".join(line.strip() for line in chunk.splitlines() if line.strip())
+        if cleaned:
+            paragraphs.append(cleaned)
+    return paragraphs
+
+
+def caption_match(line: str) -> re.Match[str] | None:
+    return re.match(
+        r"^(?P<kind>Table|TABLE|Figure|FIGURE|Fig\.?)\s+(?P<id>[A-Za-z0-9.]+)\s*[:.\-]?\s*(?P<caption>.*)$",
+        line.strip(),
+    )
+
+
+def classify_asset_role(asset_type: str, caption: str) -> str:
+    lowered = caption.lower()
+    if "ablation" in lowered or "component" in lowered:
+        return "ablation"
+    if any(token in lowered for token in ("coverage", "interval", "trade-off", "tradeoff", "sensitivity", "shift")):
+        return "uncertainty_or_tradeoff"
+    if any(token in lowered for token in ("main", "result", "performance", "ranking", "auuc", "qini")):
+        return "main_results"
+    if any(token in lowered for token in ("dataset", "statistics", "summary", "treatment assignment")):
+        return "dataset_or_protocol"
+    if any(token in lowered for token in ("overview", "framework", "pipeline", "architecture", "model")):
+        return "method_overview"
+    if any(token in lowered for token in ("case", "example")):
+        return "case_analysis"
+    return "supporting_evidence" if asset_type == "table" else "conceptual_visual"
+
+
+def caption_needs_continuation(caption: str) -> bool:
+    caption = caption.strip()
+    return bool(caption) and len(caption) < 140 and not re.search(r"[.!?)]$", caption)
+
+
+def extend_caption(caption: str, lines: list[str], start_index: int) -> str:
+    parts = [caption.strip()]
+    next_index = start_index + 1
+    while caption_needs_continuation(" ".join(parts)) and next_index < len(lines) and len(parts) < 4:
+        candidate = lines[next_index].strip()
+        next_index += 1
+        if not candidate:
+            continue
+        if candidate.startswith("[Page ") or is_caption_line(candidate) or is_heading_line(candidate):
+            break
+        if len(candidate) > 180:
+            break
+        parts.append(candidate)
+    return re.sub(r"\s+", " ", " ".join(parts)).strip()
+
+
+def asset_role_text(evidence_role: str) -> tuple[str, str, str, str]:
+    mapping = {
+        "main_results": (
+            "answers the primary comparison question",
+            "state the comparison question and ranking metric before the table",
+            "interpret ranking, margin, and evidence strength after the table",
+            "place after protocol, metrics, and comparator definitions",
+        ),
+        "ablation": (
+            "isolates which component supports the claimed gain",
+            "name the design choice being tested before the asset",
+            "explain the component-level implication after the asset",
+            "place after main results so the reader first sees the headline effect",
+        ),
+        "dataset_or_protocol": (
+            "defines the evaluation substrate and comparison scope",
+            "tell the reader why the dataset or protocol matters",
+            "connect the protocol to the later comparison table",
+            "place before baseline and result interpretation",
+        ),
+        "method_overview": (
+            "orients the reader to the method pipeline",
+            "introduce the mechanism or module sequence before the figure",
+            "map visual elements to later method prose after the figure",
+            "place near the beginning of the method section",
+        ),
+        "uncertainty_or_tradeoff": (
+            "explains a robustness, uncertainty, or trade-off pattern",
+            "state the trade-off or diagnostic question before the asset",
+            "explain the mechanism or limitation revealed by the asset",
+            "place after the main result table or in analysis/sensitivity subsections",
+        ),
+    }
+    return mapping.get(
+        evidence_role,
+        (
+            "supports a local subsection claim",
+            "state the local question before the asset",
+            "explain the takeaway after the asset",
+            "place next to the subsection claim it supports",
+        ),
+    )
+
+
+def extract_assets(sections: list[SectionRecord]) -> list[AssetRecord]:
+    assets: list[AssetRecord] = []
+    for section in sections:
+        lines = section.content.splitlines()
+        for index, line in enumerate(lines):
+            match = caption_match(line)
+            if not match:
+                continue
+            raw_kind = match.group("kind").lower()
+            asset_type = "table" if raw_kind.startswith("table") else "figure"
+            caption = match.group("caption").strip() or line.strip()
+            caption = extend_caption(caption, lines, index)
+            evidence_role = classify_asset_role(asset_type, caption)
+            local_role, before, after, placement = asset_role_text(evidence_role)
+            assets.append(
+                AssetRecord(
+                    asset_type=asset_type,
+                    asset_id=match.group("id"),
+                    caption=short_excerpt(caption, 180),
+                    appears_in_section=section.section_type,
+                    appears_in_title=section.title,
+                    evidence_role=evidence_role,
+                    local_role=local_role,
+                    placement_logic=placement,
+                    text_bridge_before=before,
+                    text_bridge_after=after,
+                    reuse_guidance="Reuse the asset function and placement logic only; do not copy caption wording or claims.",
+                )
+            )
+    return assets
+
+
+def build_paragraph_roles(sections: list[SectionRecord]) -> list[dict]:
+    roles: list[dict] = []
+    for section in sections:
+        for index, paragraph in enumerate(split_paragraphs(section.content), start=1):
+            roles.append(
+                {
+                    "section": section.section_type,
+                    "section_title": section.title,
+                    "paragraph_index": index,
+                    "role": paragraph_role(section.section_type, paragraph),
+                    "excerpt": short_excerpt(paragraph),
+                    "reuse_guidance": "Reuse the paragraph function, not the source wording.",
+                }
+            )
+    return roles
+
+
+def section_slots(section: SectionRecord, paragraph_roles: list[dict], assets: list[AssetRecord]) -> list[str]:
+    slots: list[str] = []
+    title = re.sub(r"^\d+(?:\.\d+)*\.?\s+", "", section.title).strip()
+    if title and title.lower() not in {"experiments", "experiment", "results"}:
+        slots.append(title.lower())
+    for role in paragraph_roles:
+        if role["section_title"] == section.title and role["role"] not in slots:
+            slots.append(role["role"].replace("_", " "))
+    for asset in assets:
+        if asset.appears_in_title == section.title:
+            slots.append(f"{asset.asset_type}: {asset.evidence_role}")
+    return slots
+
+
+def write_paper_artifacts(
+    paper_dir: Path,
+    source: str,
+    materialized_path: Path,
+    source_kind: str,
+    title: str,
+    method: str,
+    page_count: int | None,
+    sections: list[SectionRecord],
+    roles: list[dict],
+    assets: list[AssetRecord],
+) -> dict:
+    paper_dir.mkdir(parents=True, exist_ok=True)
+    metadata = {
+        "source": source,
+        "source_kind": source_kind,
+        "materialized_path": str(materialized_path),
+        "title": title,
+        "extraction_method": method,
+        "page_count": page_count,
+        "section_count": len(sections),
+        "visual_asset_count": len(assets),
+    }
+    (paper_dir / "metadata.json").write_text(json.dumps(metadata, indent=2, ensure_ascii=False), encoding="utf-8")
+    (paper_dir / "section-map.json").write_text(
+        json.dumps(
+            [
+                {
+                    key: value
+                    for key, value in asdict(section).items()
+                    if key != "content"
+                }
+                for section in sections
+            ],
+            indent=2,
+            ensure_ascii=False,
+        ),
+        encoding="utf-8",
+    )
+    (paper_dir / "paragraph-roles.json").write_text(json.dumps(roles, indent=2, ensure_ascii=False), encoding="utf-8")
+    (paper_dir / "visual-assets.json").write_text(
+        json.dumps([asdict(asset) for asset in assets], indent=2, ensure_ascii=False),
+        encoding="utf-8",
+    )
+    write_section_logic(paper_dir / "section-logic.md", title, sections, roles, assets)
+    write_writing_patterns(paper_dir / "writing-patterns.md", title, sections, roles, assets)
+    (paper_dir / "extraction-report.md").write_text(
+        "\n".join(
+            [
+                f"# Extraction Report: {title}",
+                "",
+                f"- Source: `{source}`",
+                f"- Extraction method: `{method}`",
+                f"- Sections detected: {len(sections)}",
+                f"- Visual/table assets detected: {len(assets)}",
+                "- Boundary: this artifact extracts reusable structure only; it must not be used as evidence for the current paper.",
+            ]
+        )
+        + "\n",
+        encoding="utf-8",
+    )
+    return metadata
+
+
+def write_section_logic(path: Path, title: str, sections: list[SectionRecord], roles: list[dict], assets: list[AssetRecord]) -> None:
+    lines = [f"# Section Logic: {title}", ""]
+    for section in sections:
+        section_roles = [role for role in roles if role["section_title"] == section.title]
+        section_assets = [asset for asset in assets if asset.appears_in_title == section.title]
+        lines.append(f"## {section.title}")
+        lines.append("")
+        lines.append(f"- Section type: `{section.section_type}`")
+        if section_roles:
+            role_list = ", ".join(dict.fromkeys(role["role"] for role in section_roles))
+            lines.append(f"- Paragraph roles: {role_list}")
+        if section_assets:
+            asset_list = ", ".join(f"{asset.asset_type}:{asset.evidence_role}" for asset in section_assets)
+            lines.append(f"- Asset roles: {asset_list}")
+        lines.append("- Reuse: preserve the slot order and rhetorical function, not the wording.")
+        lines.append("")
+    path.write_text("\n".join(lines).strip() + "\n", encoding="utf-8")
+
+
+def write_writing_patterns(path: Path, title: str, sections: list[SectionRecord], roles: list[dict], assets: list[AssetRecord]) -> None:
+    lines = [f"# Writing Patterns: {title}", ""]
+    lines.append("## Template Use")
+    lines.append("")
+    lines.append("- Use this paper as one structural template among several references.")
+    lines.append("- Reproduce section slots, paragraph functions, and asset placement logic only.")
+    lines.append("- Do not copy wording, claims, metrics, or conclusions.")
+    lines.append("")
+    lines.append("## Observed Section Slots")
+    lines.append("")
+    for section in sections:
+        lines.append(f"- {section.section_type}: {section.title}")
+    lines.append("")
+    lines.append("## Visual/Table Pattern")
+    lines.append("")
+    if assets:
+        for asset in assets:
+            lines.append(f"- {asset.asset_type} {asset.asset_id}: {asset.evidence_role} -> {asset.local_role}")
+    else:
+        lines.append("- No table or figure caption was detected.")
+    path.write_text("\n".join(lines).strip() + "\n", encoding="utf-8")
+
+
+def merge_unique(values: list[str]) -> list[str]:
+    seen: set[str] = set()
+    merged: list[str] = []
+    for value in values:
+        normalized = re.sub(r"\s+", " ", value.strip())
+        key = normalized.lower()
+        if not normalized or key in seen:
+            continue
+        seen.add(key)
+        merged.append(normalized)
+    return merged
+
+
+def build_section_templates(output_dir: Path, paper_payloads: list[dict]) -> None:
+    target = output_dir / "section-templates"
+    target.mkdir(parents=True, exist_ok=True)
+    section_types = ["abstract", "introduction", "related-work", "method", "experiments", "discussion", "conclusion"]
+    for section_type in section_types:
+        source_papers: list[str] = []
+        observed_titles: list[str] = []
+        slots: list[str] = []
+        paragraph_roles: list[str] = []
+        asset_roles: list[dict] = []
+        for payload in paper_payloads:
+            matching_sections = [section for section in payload["sections"] if section.section_type == section_type]
+            if section_type == "experiments":
+                matching_sections.extend(
+                    section for section in payload["sections"] if section.section_type in {"discussion"}
+                )
+            if not matching_sections:
+                continue
+            source_papers.append(payload["slug"])
+            for section in matching_sections:
+                observed_titles.append(section.title)
+                slots.extend(section_slots(section, payload["roles"], payload["assets"]))
+            paragraph_roles.extend(
+                role["role"] for role in payload["roles"] if role["section"] == section_type
+            )
+            asset_roles.extend(
+                {
+                    "asset_type": asset.asset_type,
+                    "evidence_role": asset.evidence_role,
+                    "local_role": asset.local_role,
+                    "source_paper": payload["slug"],
+                }
+                for asset in payload["assets"]
+                if asset.appears_in_section == section_type
+                or (section_type == "experiments" and asset.appears_in_section in {"experiments", "discussion"})
+            )
+        if not source_papers:
+            continue
+        template = {
+            "section": section_type,
+            "template_id": f"{section_type}-multi-reference-template",
+            "source_papers": merge_unique(source_papers),
+            "observed_titles": merge_unique(observed_titles),
+            "section_slots": merge_unique(slots),
+            "paragraph_roles": merge_unique(paragraph_roles),
+            "asset_roles": asset_roles,
+            "reuse_rule": "Reuse structure only; do not copy wording, claims, metrics, or conclusions from reference papers.",
+        }
+        (target / f"{section_type}.json").write_text(
+            json.dumps(template, indent=2, ensure_ascii=False),
+            encoding="utf-8",
+        )
+
+
+def build_visual_templates(output_dir: Path, paper_payloads: list[dict]) -> None:
+    target = output_dir / "visual-templates"
+    target.mkdir(parents=True, exist_ok=True)
+    asset_roles: list[dict] = []
+    for payload in paper_payloads:
+        for asset in payload["assets"]:
+            if asset.appears_in_section in {"experiments", "discussion", "method"}:
+                item = asdict(asset)
+                item["source_paper"] = payload["slug"]
+                asset_roles.append(item)
+    template = {
+        "template_id": "experiment-visual-and-table-template",
+        "asset_roles": asset_roles,
+        "reuse_rule": "Use table and figure functions, placement, and bridge logic to plan current paper assets; do not copy captions or data.",
+    }
+    (target / "experiment-assets.json").write_text(
+        json.dumps(template, indent=2, ensure_ascii=False),
+        encoding="utf-8",
+    )
+
+
+def write_aggregate_playbook(output_dir: Path, paper_payloads: list[dict]) -> None:
+    lines = [
+        "# Aggregate Template Playbook",
+        "",
+        "Purpose: help `/lab:write` reproduce mature paper structure from multiple reference templates.",
+        "",
+        "## Sources",
+        "",
+    ]
+    for payload in paper_payloads:
+        lines.append(f"- `{payload['slug']}`: {payload['title']}")
+    lines.extend(
+        [
+            "",
+            "## Reuse Boundary",
+            "",
+            "- Reproduce section order, paragraph roles, table/figure function, and bridge logic.",
+            "- Do not copy wording, claims, experimental conclusions, metrics, or terminology names.",
+            "- If only one reference supports a structure, treat it as a single-template pattern, not a universal rule.",
+            "",
+            "## Multi-Template Write Procedure",
+            "",
+            "1. Pick 2-3 closest section templates for the current paper section.",
+            "2. Build a mini-outline from common slots and current-paper evidence.",
+            "3. Add required table/figure assets with local before/after bridge functions.",
+            "4. Draft with current-paper terminology and evidence only.",
+            "",
+            "## Table/Figure Planning Rule",
+            "",
+            "Every major table or figure should answer a reader question, appear near the subsection claim it supports, and have one bridge sentence before and one interpretation sentence after it.",
+        ]
+    )
+    (output_dir / "aggregate-template-playbook.md").write_text("\n".join(lines) + "\n", encoding="utf-8")
+
+
+def process_source(source: str, output_dir: Path, cache_dir: Path) -> dict:
+    materialized_path, source_kind = materialize_source(source, cache_dir)
+    if not materialized_path.exists():
+        raise FileNotFoundError(f"source not found: {source}")
+    text, method, page_count = read_source_text(materialized_path)
+    title = detect_title(text, materialized_path.stem)
+    slug = slugify(title or materialized_path.stem)
+    sections = split_sections(text)
+    roles = build_paragraph_roles(sections)
+    assets = extract_assets(sections)
+    metadata = write_paper_artifacts(
+        output_dir / slug,
+        source,
+        materialized_path,
+        source_kind,
+        title,
+        method,
+        page_count,
+        sections,
+        roles,
+        assets,
+    )
+    return {
+        "slug": slug,
+        "title": title,
+        "metadata": metadata,
+        "sections": sections,
+        "roles": roles,
+        "assets": assets,
+    }
+
+
+def main() -> int:
+    args = parse_args()
+    output_dir = Path(args.output_dir).expanduser().resolve()
+    output_dir.mkdir(parents=True, exist_ok=True)
+    cache_dir = output_dir / ".cache"
+
+    payloads: list[dict] = []
+    source_entries: list[dict] = []
+    try:
+        for source in args.sources:
+            payload = process_source(source, output_dir, cache_dir)
+            payloads.append(payload)
+            source_entries.append(
+                {
+                    "source": source,
+                    "slug": payload["slug"],
+                    "title": payload["title"],
+                    "artifact_dir": str(output_dir / payload["slug"]),
+                }
+            )
+    except RuntimeError as exc:
+        if "No PDF text extractor succeeded" in str(exc):
+            bootstrapped = rerun_with_uv_pypdf()
+            if bootstrapped is not None:
+                return bootstrapped
+        raise
+
+    (output_dir / "sources.json").write_text(
+        json.dumps({"sources": source_entries}, indent=2, ensure_ascii=False),
+        encoding="utf-8",
+    )
+    build_section_templates(output_dir, payloads)
+    build_visual_templates(output_dir, payloads)
+    write_aggregate_playbook(output_dir, payloads)
+    print(f"reference template patterns written to {output_dir}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
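
For orientation, a minimal sketch of reading the artifacts a run produces, reusing the hypothetical `reference-patterns` output directory from the invocation sketch above. The `sources.json` layout and its keys come from `main()`; the per-section files come from `build_section_templates()` and exist only for section types observed in at least one reference paper:

```python
import json
from pathlib import Path

out = Path("reference-patterns")  # hypothetical --output-dir from the run above

# Top-level index written by main(): one entry per processed source.
for entry in json.loads((out / "sources.json").read_text(encoding="utf-8"))["sources"]:
    print(entry["slug"], "->", entry["artifact_dir"])

# Aggregated template written by build_section_templates(), present only if a
# method-type section was detected in at least one reference.
method_path = out / "section-templates" / "method.json"
if method_path.exists():
    template = json.loads(method_path.read_text(encoding="utf-8"))
    print(template["template_id"], template["section_slots"])
```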