superlab 0.1.63 → 0.1.65

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. package/lib/auto_state.cjs +3 -0
  2. package/lib/i18n.cjs +6 -4
  3. package/lib/install.cjs +1 -1
  4. package/lib/lab_write_contract.json +4 -4
  5. package/lib/rule_preflight.cjs +49 -1
  6. package/package-assets/claude/commands/lab/write.md +1 -1
  7. package/package-assets/claude/commands/lab-write.md +1 -1
  8. package/package-assets/claude/commands/lab:write.md +1 -1
  9. package/package-assets/claude/commands/lab：write.md +1 -1
  10. package/package-assets/codex/prompts/lab/write.md +1 -1
  11. package/package-assets/codex/prompts/lab-write.md +1 -1
  12. package/package-assets/codex/prompts/lab:write.md +1 -1
  13. package/package-assets/codex/prompts/lab：write.md +1 -1
  14. package/package-assets/shared/lab/.managed/scripts/extract_reference_paper_structure.py +910 -0
  15. package/package-assets/shared/lab/.managed/scripts/paper_topology.py +91 -0
  16. package/package-assets/shared/lab/.managed/scripts/render_rule_preflight.py +115 -0
  17. package/package-assets/shared/lab/.managed/scripts/validate_manuscript_delivery.py +59 -0
  18. package/package-assets/shared/lab/.managed/scripts/validate_paper_topology.py +83 -0
  19. package/package-assets/shared/lab/.managed/scripts/validate_rule_preflight.py +183 -0
  20. package/package-assets/shared/lab/.managed/scripts/validate_section_draft.py +134 -12
  21. package/package-assets/shared/lab/.managed/templates/iteration-report.md +1 -0
  22. package/package-assets/shared/lab/.managed/templates/reference-template-intake.md +40 -0
  23. package/package-assets/shared/lab/.managed/templates/write-iteration.md +28 -0
  24. package/package-assets/shared/lab/context/auto-status.md +1 -0
  25. package/package-assets/shared/skills/lab/SKILL.md +2 -0
  26. package/package-assets/shared/skills/lab/stages/auto.md +1 -1
  27. package/package-assets/shared/skills/lab/stages/write.md +21 -3
  28. package/package.json +1 -1
@@ -0,0 +1,910 @@
1
+ #!/usr/bin/env python3
2
+ """Extract reusable writing templates from reference papers.
3
+
4
+ This script is intentionally lightweight. It extracts structure, paragraph
5
+ roles, and visual/table roles for `/lab:write`; it does not summarize or copy
6
+ paper content.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import argparse
12
+ import hashlib
13
+ import html
14
+ import json
15
+ import os
16
+ import re
17
+ import shutil
18
+ import subprocess
19
+ import sys
20
+ import tempfile
21
+ import urllib.parse
22
+ import urllib.request
23
+ from dataclasses import asdict, dataclass
24
+ from pathlib import Path
25
+
26
+
27
# Maps each canonical section family to heading keywords (English and Chinese)
# matched by substring against a lowercased, de-numbered heading.
SECTION_ALIASES = {
    "abstract": ("abstract", "摘要"),
    "introduction": ("introduction", "intro", "引言", "绪论"),
    "related-work": ("related work", "background", "literature", "相关工作", "文献综述"),
    "method": ("method", "methods", "methodology", "approach", "model", "algorithm", "方法", "模型", "算法"),
    "experiments": (
        "experiment", "experiments", "evaluation", "empirical", "results",
        "main results", "ablation", "sensitivity", "coverage", "实验", "评估", "结果",
    ),
    "discussion": ("discussion", "analysis", "讨论", "分析"),
    "conclusion": ("conclusion", "conclusions", "future work", "结论", "总结"),
    "references": ("references", "bibliography", "参考文献"),
    "appendix": ("appendix", "supplement", "附录"),
}
61
+
62
# Lowercased headings that count as section headings even without numbering
# (consulted by is_heading_line after number/appendix prefixes are stripped).
CANONICAL_HEADING_TITLES = {
    "abstract", "introduction", "related work", "background",
    "method", "methods", "methodology", "approach", "model", "algorithm",
    "experiments", "experiment", "experimental setup", "experimental setups",
    "evaluation", "results", "main results", "ablation study",
    "sensitivity analysis", "discussion", "conclusion", "conclusions",
    "references", "bibliography", "appendix",
}
89
+
90
+
91
@dataclass
class SectionRecord:
    """One detected section of a reference paper (structure only, no summary)."""

    title: str               # heading text with markdown hashes stripped
    section_type: str        # canonical family key from SECTION_ALIASES, or "other"
    level: int               # 1 = top-level section; deeper for numbered subsections
    start_line: int          # 1-based line numbers within the extracted text
    end_line: int
    start_page: int | None   # page numbers when the extractor emitted [Page N] markers
    end_page: int | None
    content: str             # section body text, page markers removed
101
+
102
+
103
@dataclass
class AssetRecord:
    """One table/figure caption found in a reference paper, plus reuse guidance."""

    asset_type: str          # "table" or "figure"
    asset_id: str            # identifier as printed in the caption, e.g. "3" or "A.1"
    caption: str             # truncated caption text
    appears_in_section: str  # canonical section family containing the caption
    appears_in_title: str    # literal heading of that section
    evidence_role: str       # classification produced by classify_asset_role
    local_role: str          # human-readable description of the asset's job
    placement_logic: str     # where an analogous asset should be placed
    text_bridge_before: str  # guidance for the sentence preceding the asset
    text_bridge_after: str   # guidance for the sentence following the asset
    reuse_guidance: str      # reminder that only structure may be reused
116
+
117
+
118
def parse_args() -> argparse.Namespace:
    """Parse CLI arguments: one or more paper sources plus a required output dir."""
    cli = argparse.ArgumentParser(
        description="Extract reference-paper structure templates for /lab:write."
    )
    cli.add_argument(
        "sources", nargs="+", help="Local paper paths or http(s) PDF/HTML URLs"
    )
    cli.add_argument(
        "--output-dir",
        required=True,
        help="Directory to write reference-pattern artifacts",
    )
    return cli.parse_args()
129
+
130
+
131
def slugify(value: str) -> str:
    """Turn a filename or title into a lowercase dash-slug (max 80 chars)."""
    stem = re.sub(r"\.[A-Za-z0-9]+$", "", value)  # drop a trailing file extension
    slug = re.sub(r"[^A-Za-z0-9]+", "-", stem).strip("-").lower()
    return slug[:80] if slug else "reference-paper"
135
+
136
+
137
def short_excerpt(text: str, limit: int = 240) -> str:
    """Collapse whitespace and truncate to *limit* chars with a "..." suffix."""
    flat = " ".join(text.split())
    if len(flat) <= limit:
        return flat
    return flat[: limit - 3].rstrip() + "..."
142
+
143
+
144
def materialize_source(source: str, cache_dir: Path) -> tuple[Path, str]:
    """Resolve *source* to a local file, downloading http(s) URLs into *cache_dir*.

    Returns (path, kind) where kind is "url" for downloads and "local" otherwise.
    """
    parts = urllib.parse.urlparse(source)
    if parts.scheme not in {"http", "https"}:
        return Path(source).expanduser().resolve(), "local"
    cache_dir.mkdir(parents=True, exist_ok=True)
    ext = Path(parts.path).suffix or ".pdf"  # assume PDF when the URL has no extension
    base = slugify(Path(parts.path).name or parts.netloc)
    # Hash the full URL so same-named files from different URLs do not collide.
    digest = hashlib.sha256(source.encode("utf-8")).hexdigest()[:10]
    target = cache_dir / f"{base}-{digest}{ext}"
    urllib.request.urlretrieve(source, target)
    return target, "url"
155
+
156
+
157
def extract_pdf_text(path: Path) -> tuple[str, str, int | None]:
    """Extract text from a PDF, trying pypdf, pdfplumber, PyMuPDF, then pdftotext.

    Returns (text, extractor_name, page_count). "[Page N]" markers are inserted
    so downstream code can recover page numbers; the pdftotext fallback has no
    markers and reports page_count as None. Raises RuntimeError when every
    backend fails, with all per-backend errors joined into the message.
    """
    failures: list[str] = []

    def tagged(number: int, body: str) -> str:
        # Page marker format relied on by split_sections/detect_title.
        return f"[Page {number}]\n{body}"

    try:
        from pypdf import PdfReader  # type: ignore

        reader = PdfReader(str(path))
        pages = [
            tagged(i, page.extract_text() or "")
            for i, page in enumerate(reader.pages, start=1)
        ]
        return "\n\n".join(pages), "pypdf", len(reader.pages)
    except Exception as exc:
        failures.append(f"pypdf: {exc}")

    try:
        import pdfplumber  # type: ignore

        with pdfplumber.open(str(path)) as pdf:
            pages = [
                tagged(i, page.extract_text() or "")
                for i, page in enumerate(pdf.pages, start=1)
            ]
        return "\n\n".join(pages), "pdfplumber", len(pdf.pages)
    except Exception as exc:
        failures.append(f"pdfplumber: {exc}")

    try:
        import fitz  # type: ignore

        doc = fitz.open(str(path))
        pages = [tagged(i + 1, page.get_text()) for i, page in enumerate(doc)]
        count = len(doc)
        doc.close()
        return "\n\n".join(pages), "pymupdf", count
    except Exception as exc:
        failures.append(f"pymupdf: {exc}")

    if shutil.which("pdftotext"):
        with tempfile.TemporaryDirectory() as tmp_dir:
            txt_path = Path(tmp_dir) / "paper.txt"
            result = subprocess.run(
                ["pdftotext", "-layout", str(path), str(txt_path)],
                capture_output=True,
                text=True,
                check=False,
            )
            if result.returncode == 0:
                return txt_path.read_text(encoding="utf-8", errors="ignore"), "pdftotext", None
            failures.append(f"pdftotext: {result.stderr.strip()}")

    raise RuntimeError(
        "No PDF text extractor succeeded. Install pypdf, pdfplumber, PyMuPDF, or pdftotext. "
        + " | ".join(failures)
    )
206
+
207
+
208
def find_uv() -> str | None:
    """Locate the `uv` binary on PATH or in ~/.local/bin; None when absent."""
    on_path = shutil.which("uv")
    if on_path:
        return on_path
    fallback = Path.home() / ".local" / "bin" / "uv"
    return str(fallback) if fallback.exists() else None
216
+
217
+
218
def rerun_with_uv_pypdf() -> int | None:
    """Re-exec this script under `uv run --with pypdf` to gain a PDF backend.

    Returns the child's exit code, or None when bootstrapping is unavailable or
    has already happened (guarded by LAB_REFERENCE_PDF_BOOTSTRAPPED to avoid an
    infinite re-exec loop).
    """
    if os.environ.get("LAB_REFERENCE_PDF_BOOTSTRAPPED"):
        return None
    uv_path = find_uv()
    if not uv_path:
        return None
    child_env = os.environ.copy()
    child_env["LAB_REFERENCE_PDF_BOOTSTRAPPED"] = "1"
    command = [
        uv_path, "run", "--with", "pypdf", "python3",
        str(Path(__file__).resolve()),
        *sys.argv[1:],
    ]
    return subprocess.run(command, env=child_env, check=False).returncode
236
+
237
+
238
def strip_html(text: str) -> str:
    """Convert HTML to rough plain text, keeping block-level line breaks."""
    text = re.sub(r"(?is)<(script|style).*?</\1>", " ", text)   # drop non-content blocks
    text = re.sub(r"(?i)<br\s*/?>", "\n", text)                 # explicit line breaks
    text = re.sub(r"(?i)</(p|div|section|article|h[1-6]|li|tr)>", "\n", text)  # block ends
    text = re.sub(r"<[^>]+>", " ", text)                        # any remaining tag
    collapsed = re.sub(r"\n{3,}", "\n\n", text)
    return html.unescape(collapsed)
244
+
245
+
246
def read_source_text(path: Path) -> tuple[str, str, int | None]:
    """Read *path* as text: PDFs via extract_pdf_text, HTML stripped, else raw.

    Returns (text, extraction_method, page_count_or_None).
    """
    kind = path.suffix.lower()
    if kind == ".pdf":
        return extract_pdf_text(path)
    raw = path.read_text(encoding="utf-8", errors="ignore")
    if kind in {".html", ".htm"}:
        return strip_html(raw), "html-text", None
    return raw, "plain-text", None
254
+
255
+
256
def detect_title(text: str, fallback: str) -> str:
    """Return the first plausible title line of *text*, else *fallback*.

    Skips blank lines, "[Page N]" markers, caption lines, and anything 180+
    characters long (more likely a flowed paragraph than a title).
    """
    for raw_line in text.splitlines():
        candidate = re.sub(r"^#+\s*", "", raw_line).strip()
        if not candidate or candidate.startswith("[Page "):
            continue
        if is_caption_line(candidate):
            continue
        if len(candidate) < 180:
            return candidate
    return fallback
266
+
267
+
268
def normalize_heading(line: str) -> str:
    """Strip markdown hash prefixes and surrounding whitespace from a heading."""
    return re.sub(r"^#+\s*", "", line.strip()).strip()
271
+
272
+
273
def section_type_for_title(title: str) -> str:
    """Map a heading to its canonical section family via SECTION_ALIASES."""
    bare = re.sub(r"^\d+(?:\.\d+)*\.?\s+", "", title.lower()).strip()  # drop "3.1 " numbering
    for family, keywords in SECTION_ALIASES.items():
        if any(keyword in bare for keyword in keywords):
            return family
    return "other"
280
+
281
+
282
def heading_level(title: str) -> int:
    """Infer nesting depth from markdown hashes or dotted numbering (capped at 4)."""
    hashes = re.match(r"^(#{1,6})\s+", title)
    if hashes:
        return len(hashes.group(1))
    numbered = re.match(r"^(\d+(?:\.\d+)*)\.?\s+", title.strip())
    if numbered:
        # "3" -> level 1, "3.1" -> 2, "3.1.2" -> 3, never deeper than 4.
        return min(1 + numbered.group(1).count("."), 4)
    return 1
290
+
291
+
292
def is_caption_line(line: str) -> bool:
    """True when the line starts like a table/figure caption (e.g. "Table 2:")."""
    return re.match(r"^(?:Table|TABLE|Figure|FIGURE|Fig\.?)\s+[A-Za-z0-9.]+", line.strip()) is not None
294
+
295
+
296
def is_heading_line(line: str) -> bool:
    """Heuristically decide whether *line* is a section heading.

    Accepts markdown headings, canonical titles (optionally numbered or
    roman-numeral/appendix prefixed), and Title-Case numbered headings.
    Rejects page markers, captions, long lines, and punctuation-terminated
    prose.
    """
    stripped = line.strip()
    if not stripped or stripped.startswith("[Page ") or is_caption_line(stripped):
        return False
    if len(stripped) > 140:
        return False
    # Trailing sentence punctuation means prose — unless the line is numbered.
    if stripped.endswith((".", ",", ";", ":")) and not re.match(r"^\d+(?:\.\d+)*\.?\s+", stripped):
        return False
    if re.match(r"^#{1,4}\s+\S", stripped):
        return True
    bare = re.sub(r"^\d+(?:\.\d+)*\.?\s+", "", stripped.lower()).strip()
    bare = re.sub(r"^[ivx]+\.\s+", "", bare).strip()
    bare = re.sub(r"^appendix\s+[a-z](?:\.\d+)*\.?\s*", "", bare).strip()
    if bare in CANONICAL_HEADING_TITLES:
        return True
    heading_patterns = (
        r"^\d+(?:\.\d+)*\.?\s+[A-Z][A-Za-z0-9 ,:;()/-]{2,}$",
        r"^[A-Z]\.\d+(?:\.\d+)*\.?\s+[A-Z][A-Za-z0-9 ,:;()/-]{2,}$",
        r"^Appendix\s+[A-Z](?:\.\d+)*\.?\s*[:\-]?\s*[A-Z][A-Za-z0-9 ,:;()/-]{2,}$",
        r"^[IVX]+\.\s+[A-Z][A-Za-z0-9 ,:;()/-]{2,}$",
    )
    return any(re.match(pattern, stripped) for pattern in heading_patterns)
320
+
321
+
322
def split_sections(text: str) -> list[SectionRecord]:
    """Split extracted text into SectionRecords at detected heading lines.

    "[Page N]" markers update page tracking and are excluded from section
    content. When no heading at all is found, the whole document is returned
    as a single "other" section titled "Full Text".
    """
    lines = text.splitlines()
    headings: list[tuple[int, str, int | None]] = []  # (line index, title, page)
    line_pages: dict[int, int | None] = {}
    page: int | None = None

    for index, line in enumerate(lines):
        marker = re.match(r"^\[Page\s+(\d+)\]$", line.strip())
        if marker:
            page = int(marker.group(1))
        line_pages[index] = page
        if is_heading_line(line):
            headings.append((index, normalize_heading(line), page))

    if not headings:
        body = "\n".join(line for line in lines if not line.startswith("[Page "))
        return [
            SectionRecord(
                title="Full Text",
                section_type="other",
                level=1,
                start_line=1,
                end_line=len(lines),
                start_page=None,
                end_page=None,
                content=body,
            )
        ]

    records: list[SectionRecord] = []
    for pos, (start, title, start_page) in enumerate(headings):
        last = pos + 1 >= len(headings)
        end = len(lines) - 1 if last else headings[pos + 1][0] - 1
        body_lines = [
            body_line
            for body_line in lines[start + 1 : end + 1]
            if not body_line.startswith("[Page ")
        ]
        records.append(
            SectionRecord(
                title=title,
                section_type=section_type_for_title(title),
                level=heading_level(title),
                start_line=start + 1,
                end_line=end + 1,
                start_page=start_page,
                end_page=line_pages.get(end),
                content="\n".join(body_lines).strip(),
            )
        )
    infer_section_types(records)
    return records
372
+
373
+
374
def infer_section_types(sections: list[SectionRecord]) -> None:
    """Recover section families when PDF text only preserves numbered headings.

    Many academic PDF extractors keep subsection titles such as "4.2 Feature
    Selection" but lose the visual hierarchy that tells us they belong to the
    Method section. For template extraction, section family is more useful than
    the literal heading string, so unknown subsections inherit from their
    nearest known parent. Unknown top-level technical sections before
    Experiments are treated as method-style sections, which matches common ML
    paper structure. Mutates *sections* in place.
    """
    def first_index_of(kind: str) -> int | None:
        return next((i for i, s in enumerate(sections) if s.section_type == kind), None)

    experiments_at = first_index_of("experiments")
    intro_at = first_index_of("introduction")
    inheritable = {"related-work", "method", "experiments", "discussion", "conclusion", "appendix"}
    open_parents: dict[int, str] = {}  # level -> family of the currently open section

    for position, section in enumerate(sections):
        # A new heading closes every open section at the same or deeper level.
        for lvl in [level for level in open_parents if level >= section.level]:
            del open_parents[lvl]

        if section.section_type == "other":
            inherited = next(
                (
                    open_parents[lvl]
                    for lvl in range(section.level - 1, 0, -1)
                    if open_parents.get(lvl) in inheritable
                ),
                None,
            )
            if inherited:
                section.section_type = inherited
            elif (
                section.level == 1
                and experiments_at is not None
                and position < experiments_at
                and (intro_at is None or position > intro_at)
            ):
                section.section_type = "method"

        open_parents[section.level] = section.section_type
419
+
420
+
421
def paragraph_role(section_type: str, paragraph: str) -> str:
    """Classify a paragraph's rhetorical function from keyword heuristics.

    Rules run in priority order, so a paragraph mentioning both limitations
    and results is tagged by the earlier (more specific) rule. Falls back to
    a section-specific exposition role.
    """
    lowered = paragraph.lower()
    if is_caption_line(paragraph):
        return "visual_or_table_anchor"
    if section_type == "abstract":
        return "abstract_summary"
    keyword_rules = (
        (("limitation", "future work", "caveat", "drift", "局限"), "limitation_boundary"),
        (("ablation", "component", "without", "remove"), "ablation_interpretation"),
        (("result", "outperform", "improve", "coverage", "interval length", "gain"), "result_interpretation"),
        (("baseline", "compare", "comparator"), "comparator_setup"),
        (("dataset", "benchmark", "metric", "protocol", "split", "auuc", "qini"), "experimental_protocol"),
        (("contribution", "we propose", "our method", "framework", "model"), "contribution_or_method_claim"),
        (("existing", "requires", "problem", "gap", "lack", "overfit"), "problem_or_gap"),
    )
    for tokens, role in keyword_rules:
        if any(token in lowered for token in tokens):
            return role
    if section_type == "method":
        return "method_exposition"
    if section_type == "experiments":
        return "experiment_exposition"
    return "section_exposition"
446
+
447
+
448
def split_paragraphs(text: str) -> list[str]:
    """Split on blank lines; strip each kept line and drop empty paragraphs."""
    paragraphs: list[str] = []
    for block in re.split(r"\n\s*\n", text):
        kept = [ln.strip() for ln in block.splitlines() if ln.strip()]
        if kept:
            paragraphs.append("\n".join(kept))
    return paragraphs
456
+
457
+
458
def caption_match(line: str) -> re.Match[str] | None:
    """Match a "Table/Figure <id> <caption>" line; named groups: kind, id, caption."""
    pattern = (
        r"^(?P<kind>Table|TABLE|Figure|FIGURE|Fig\.?)\s+"
        r"(?P<id>[A-Za-z0-9.]+)\s*[:.\-]?\s*(?P<caption>.*)$"
    )
    return re.match(pattern, line.strip())
463
+
464
+
465
def classify_asset_role(asset_type: str, caption: str) -> str:
    """Assign an evidence role to a table/figure from caption keywords.

    Rules are checked in priority order (ablation first). Unmatched captions
    default to "supporting_evidence" for tables and "conceptual_visual" for
    figures.
    """
    lowered = caption.lower()
    if "ablation" in lowered or "component" in lowered:
        return "ablation"
    ordered_rules = (
        (("coverage", "interval", "trade-off", "tradeoff", "sensitivity", "shift"), "uncertainty_or_tradeoff"),
        (("main", "result", "performance", "ranking", "auuc", "qini"), "main_results"),
        (("dataset", "statistics", "summary", "treatment assignment"), "dataset_or_protocol"),
        (("overview", "framework", "pipeline", "architecture", "model"), "method_overview"),
        (("case", "example"), "case_analysis"),
    )
    for tokens, role in ordered_rules:
        if any(token in lowered for token in tokens):
            return role
    return "supporting_evidence" if asset_type == "table" else "conceptual_visual"
480
+
481
+
482
def caption_needs_continuation(caption: str) -> bool:
    """True when a caption looks truncated: short and lacking end punctuation."""
    trimmed = caption.strip()
    if not trimmed or len(trimmed) >= 140:
        return False
    return re.search(r"[.!?)]$", trimmed) is None
485
+
486
+
487
def extend_caption(caption: str, lines: list[str], start_index: int) -> str:
    """Append up to three follow-on lines to a caption that looks truncated.

    Stops at structural lines (page markers, new captions, headings) or any
    line over 180 chars, then collapses whitespace into single spaces.
    """
    pieces = [caption.strip()]
    cursor = start_index + 1
    while caption_needs_continuation(" ".join(pieces)) and cursor < len(lines) and len(pieces) < 4:
        extra = lines[cursor].strip()
        cursor += 1
        if not extra:
            continue
        if extra.startswith("[Page ") or is_caption_line(extra) or is_heading_line(extra):
            break
        if len(extra) > 180:
            break
        pieces.append(extra)
    return re.sub(r"\s+", " ", " ".join(pieces)).strip()
501
+
502
+
503
def asset_role_text(evidence_role: str) -> tuple[str, str, str, str]:
    """Map an evidence role to (local_role, bridge_before, bridge_after, placement) text."""
    default_guidance = (
        "supports a local subsection claim",
        "state the local question before the asset",
        "explain the takeaway after the asset",
        "place next to the subsection claim it supports",
    )
    guidance_by_role = {
        "main_results": (
            "answers the primary comparison question",
            "state the comparison question and ranking metric before the table",
            "interpret ranking, margin, and evidence strength after the table",
            "place after protocol, metrics, and comparator definitions",
        ),
        "ablation": (
            "isolates which component supports the claimed gain",
            "name the design choice being tested before the asset",
            "explain the component-level implication after the asset",
            "place after main results so the reader first sees the headline effect",
        ),
        "dataset_or_protocol": (
            "defines the evaluation substrate and comparison scope",
            "tell the reader why the dataset or protocol matters",
            "connect the protocol to the later comparison table",
            "place before baseline and result interpretation",
        ),
        "method_overview": (
            "orients the reader to the method pipeline",
            "introduce the mechanism or module sequence before the figure",
            "map visual elements to later method prose after the figure",
            "place near the beginning of the method section",
        ),
        "uncertainty_or_tradeoff": (
            "explains a robustness, uncertainty, or trade-off pattern",
            "state the trade-off or diagnostic question before the asset",
            "explain the mechanism or limitation revealed by the asset",
            "place after the main result table or in analysis/sensitivity subsections",
        ),
    }
    return guidance_by_role.get(evidence_role, default_guidance)
545
+
546
+
547
def extract_assets(sections: list[SectionRecord]) -> list[AssetRecord]:
    """Collect every table/figure caption across *sections* as AssetRecords."""
    found: list[AssetRecord] = []
    for section in sections:
        section_lines = section.content.splitlines()
        for line_no, line in enumerate(section_lines):
            hit = caption_match(line)
            if hit is None:
                continue
            asset_type = "table" if hit.group("kind").lower().startswith("table") else "figure"
            # Fall back to the whole line when the caption group is empty.
            caption = hit.group("caption").strip() or line.strip()
            caption = extend_caption(caption, section_lines, line_no)
            evidence_role = classify_asset_role(asset_type, caption)
            local_role, bridge_before, bridge_after, placement = asset_role_text(evidence_role)
            found.append(
                AssetRecord(
                    asset_type=asset_type,
                    asset_id=hit.group("id"),
                    caption=short_excerpt(caption, 180),
                    appears_in_section=section.section_type,
                    appears_in_title=section.title,
                    evidence_role=evidence_role,
                    local_role=local_role,
                    placement_logic=placement,
                    text_bridge_before=bridge_before,
                    text_bridge_after=bridge_after,
                    reuse_guidance="Reuse the asset function and placement logic only; do not copy caption wording or claims.",
                )
            )
    return found
577
+
578
+
579
def build_paragraph_roles(sections: list[SectionRecord]) -> list[dict]:
    """Tag every paragraph of every section with its rhetorical role."""
    return [
        {
            "section": section.section_type,
            "section_title": section.title,
            "paragraph_index": number,
            "role": paragraph_role(section.section_type, paragraph),
            "excerpt": short_excerpt(paragraph),
            "reuse_guidance": "Reuse the paragraph function, not the source wording.",
        }
        for section in sections
        for number, paragraph in enumerate(split_paragraphs(section.content), start=1)
    ]
594
+
595
+
596
def section_slots(section: SectionRecord, paragraph_roles: list[dict], assets: list[AssetRecord]) -> list[str]:
    """Build the ordered slot list for one section: title, paragraph roles, assets.

    Paragraph roles are deduplicated by their canonical (underscored) name; the
    emitted slot uses the space-separated form. (The previous implementation
    compared the underscored name against the already-appended space-separated
    entries, so repeated roles were always duplicated.)
    """
    slots: list[str] = []
    title = re.sub(r"^\d+(?:\.\d+)*\.?\s+", "", section.title).strip()
    # Generic experiment headings carry no information beyond the section type.
    if title and title.lower() not in {"experiments", "experiment", "results"}:
        slots.append(title.lower())
    seen_roles: set[str] = set()
    for role in paragraph_roles:
        if role["section_title"] == section.title and role["role"] not in seen_roles:
            seen_roles.add(role["role"])
            slots.append(role["role"].replace("_", " "))
    for asset in assets:
        if asset.appears_in_title == section.title:
            slots.append(f"{asset.asset_type}: {asset.evidence_role}")
    return slots
608
+
609
+
610
def write_paper_artifacts(
    paper_dir: Path,
    source: str,
    materialized_path: Path,
    source_kind: str,
    title: str,
    method: str,
    page_count: int | None,
    sections: list[SectionRecord],
    roles: list[dict],
    assets: list[AssetRecord],
) -> dict:
    """Write all per-paper JSON/markdown artifacts into *paper_dir*.

    Produces metadata.json, section-map.json (content field removed),
    paragraph-roles.json, visual-assets.json, section-logic.md,
    writing-patterns.md, and extraction-report.md. Returns the metadata dict.
    """

    def dump_json(filename: str, payload) -> None:
        (paper_dir / filename).write_text(
            json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8"
        )

    paper_dir.mkdir(parents=True, exist_ok=True)
    metadata = {
        "source": source,
        "source_kind": source_kind,
        "materialized_path": str(materialized_path),
        "title": title,
        "extraction_method": method,
        "page_count": page_count,
        "section_count": len(sections),
        "visual_asset_count": len(assets),
    }
    dump_json("metadata.json", metadata)
    # The section map drops the potentially large "content" field.
    dump_json(
        "section-map.json",
        [
            {key: value for key, value in asdict(section).items() if key != "content"}
            for section in sections
        ],
    )
    dump_json("paragraph-roles.json", roles)
    dump_json("visual-assets.json", [asdict(asset) for asset in assets])
    write_section_logic(paper_dir / "section-logic.md", title, sections, roles, assets)
    write_writing_patterns(paper_dir / "writing-patterns.md", title, sections, roles, assets)
    report = [
        f"# Extraction Report: {title}",
        "",
        f"- Source: `{source}`",
        f"- Extraction method: `{method}`",
        f"- Sections detected: {len(sections)}",
        f"- Visual/table assets detected: {len(assets)}",
        "- Boundary: this artifact extracts reusable structure only; it must not be used as evidence for the current paper.",
    ]
    (paper_dir / "extraction-report.md").write_text("\n".join(report) + "\n", encoding="utf-8")
    return metadata
672
+
673
+
674
def write_section_logic(path: Path, title: str, sections: list[SectionRecord], roles: list[dict], assets: list[AssetRecord]) -> None:
    """Render a per-section markdown summary of paragraph and asset roles."""
    out = [f"# Section Logic: {title}", ""]
    for section in sections:
        matching_roles = [r for r in roles if r["section_title"] == section.title]
        matching_assets = [a for a in assets if a.appears_in_title == section.title]
        out.append(f"## {section.title}")
        out.append("")
        out.append(f"- Section type: `{section.section_type}`")
        if matching_roles:
            # dict.fromkeys deduplicates while preserving first-seen order.
            unique_roles = ", ".join(dict.fromkeys(r["role"] for r in matching_roles))
            out.append(f"- Paragraph roles: {unique_roles}")
        if matching_assets:
            summary = ", ".join(f"{a.asset_type}:{a.evidence_role}" for a in matching_assets)
            out.append(f"- Asset roles: {summary}")
        out.append("- Reuse: preserve the slot order and rhetorical function, not the wording.")
        out.append("")
    path.write_text("\n".join(out).strip() + "\n", encoding="utf-8")
691
+
692
+
693
def write_writing_patterns(path: Path, title: str, sections: list[SectionRecord], roles: list[dict], assets: list[AssetRecord]) -> None:
    """Render markdown guidance on reusing this paper's structure and assets.

    NOTE: *roles* is currently unused but kept for signature parity with
    write_section_logic.
    """
    out = [
        f"# Writing Patterns: {title}",
        "",
        "## Template Use",
        "",
        "- Use this paper as one structural template among several references.",
        "- Reproduce section slots, paragraph functions, and asset placement logic only.",
        "- Do not copy wording, claims, metrics, or conclusions.",
        "",
        "## Observed Section Slots",
        "",
    ]
    out.extend(f"- {section.section_type}: {section.title}" for section in sections)
    out.extend(["", "## Visual/Table Pattern", ""])
    if assets:
        out.extend(
            f"- {asset.asset_type} {asset.asset_id}: {asset.evidence_role} -> {asset.local_role}"
            for asset in assets
        )
    else:
        out.append("- No table or figure caption was detected.")
    path.write_text("\n".join(out).strip() + "\n", encoding="utf-8")
714
+
715
+
716
def merge_unique(values: list[str]) -> list[str]:
    """Whitespace-normalize values and drop blanks and case-insensitive duplicates, keeping order."""
    seen_keys: set[str] = set()
    result: list[str] = []
    for raw in values:
        normalized = " ".join(raw.split())
        key = normalized.lower()
        if normalized and key not in seen_keys:
            seen_keys.add(key)
            result.append(normalized)
    return result
727
+
728
+
729
def build_section_templates(output_dir: Path, paper_payloads: list[dict]) -> None:
    """Merge per-paper section observations into one template JSON per family.

    The "experiments" template additionally absorbs discussion sections and
    assets, since many papers interleave analysis with results. Families with
    no contributing paper produce no file.
    """
    target = output_dir / "section-templates"
    target.mkdir(parents=True, exist_ok=True)
    families = ["abstract", "introduction", "related-work", "method", "experiments", "discussion", "conclusion"]
    for family in families:
        contributing: list[str] = []
        titles: list[str] = []
        slot_names: list[str] = []
        role_names: list[str] = []
        asset_entries: list[dict] = []
        for payload in paper_payloads:
            matched = [s for s in payload["sections"] if s.section_type == family]
            if family == "experiments":
                matched.extend(s for s in payload["sections"] if s.section_type in {"discussion"})
            if not matched:
                continue
            contributing.append(payload["slug"])
            for section in matched:
                titles.append(section.title)
                slot_names.extend(section_slots(section, payload["roles"], payload["assets"]))
            role_names.extend(
                role["role"] for role in payload["roles"] if role["section"] == family
            )
            asset_entries.extend(
                {
                    "asset_type": asset.asset_type,
                    "evidence_role": asset.evidence_role,
                    "local_role": asset.local_role,
                    "source_paper": payload["slug"],
                }
                for asset in payload["assets"]
                if asset.appears_in_section == family
                or (family == "experiments" and asset.appears_in_section in {"experiments", "discussion"})
            )
        if not contributing:
            continue
        template = {
            "section": family,
            "template_id": f"{family}-multi-reference-template",
            "source_papers": merge_unique(contributing),
            "observed_titles": merge_unique(titles),
            "section_slots": merge_unique(slot_names),
            "paragraph_roles": merge_unique(role_names),
            "asset_roles": asset_entries,
            "reuse_rule": "Reuse structure only; do not copy wording, claims, metrics, or conclusions from reference papers.",
        }
        (target / f"{family}.json").write_text(
            json.dumps(template, indent=2, ensure_ascii=False),
            encoding="utf-8",
        )
781
+
782
+
783
def build_visual_templates(output_dir: Path, paper_payloads: list[dict]) -> None:
    """Aggregate method/experiment/discussion assets into one template file."""
    target = output_dir / "visual-templates"
    target.mkdir(parents=True, exist_ok=True)
    collected: list[dict] = []
    for payload in paper_payloads:
        for asset in payload["assets"]:
            if asset.appears_in_section not in {"experiments", "discussion", "method"}:
                continue
            entry = asdict(asset)
            entry["source_paper"] = payload["slug"]
            collected.append(entry)
    template = {
        "template_id": "experiment-visual-and-table-template",
        "asset_roles": collected,
        "reuse_rule": "Use table and figure functions, placement, and bridge logic to plan current paper assets; do not copy captions or data.",
    }
    (target / "experiment-assets.json").write_text(
        json.dumps(template, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
802
+
803
+
804
def write_aggregate_playbook(output_dir: Path, paper_payloads: list[dict]) -> None:
    """Write the cross-paper playbook telling /lab:write how to reuse templates."""
    header = [
        "# Aggregate Template Playbook",
        "",
        "Purpose: help `/lab:write` reproduce mature paper structure from multiple reference templates.",
        "",
        "## Sources",
        "",
    ]
    source_lines = [f"- `{payload['slug']}`: {payload['title']}" for payload in paper_payloads]
    body = [
        "",
        "## Reuse Boundary",
        "",
        "- Reproduce section order, paragraph roles, table/figure function, and bridge logic.",
        "- Do not copy wording, claims, experimental conclusions, metrics, or terminology names.",
        "- If only one reference supports a structure, treat it as a single-template pattern, not a universal rule.",
        "",
        "## Multi-Template Write Procedure",
        "",
        "1. Pick 2-3 closest section templates for the current paper section.",
        "2. Build a mini-outline from common slots and current-paper evidence.",
        "3. Add required table/figure assets with local before/after bridge functions.",
        "4. Draft with current-paper terminology and evidence only.",
        "",
        "## Table/Figure Planning Rule",
        "",
        "Every major table or figure should answer a reader question, appear near the subsection claim it supports, and have one bridge sentence before and one interpretation sentence after it.",
    ]
    content = "\n".join(header + source_lines + body) + "\n"
    (output_dir / "aggregate-template-playbook.md").write_text(content, encoding="utf-8")
837
+
838
+
839
def process_source(source: str, output_dir: Path, cache_dir: Path) -> dict:
    """Fetch, parse, and template one reference paper; return its payload dict.

    Raises FileNotFoundError when a local source path does not exist.
    """
    local_path, source_kind = materialize_source(source, cache_dir)
    if not local_path.exists():
        raise FileNotFoundError(f"source not found: {source}")
    text, method, page_count = read_source_text(local_path)
    title = detect_title(text, local_path.stem)
    slug = slugify(title or local_path.stem)
    sections = split_sections(text)
    roles = build_paragraph_roles(sections)
    assets = extract_assets(sections)
    metadata = write_paper_artifacts(
        output_dir / slug,
        source,
        local_path,
        source_kind,
        title,
        method,
        page_count,
        sections,
        roles,
        assets,
    )
    return {
        "slug": slug,
        "title": title,
        "metadata": metadata,
        "sections": sections,
        "roles": roles,
        "assets": assets,
    }
869
+
870
+
871
def main() -> int:
    """CLI entry point: process every source, then write aggregate templates.

    Returns a process exit code. When no PDF backend is available, tries once
    to re-exec under `uv run --with pypdf` and returns the child's exit code.
    """
    args = parse_args()
    output_dir = Path(args.output_dir).expanduser().resolve()
    output_dir.mkdir(parents=True, exist_ok=True)
    cache_dir = output_dir / ".cache"

    payloads: list[dict] = []
    source_entries: list[dict] = []
    try:
        for source in args.sources:
            payload = process_source(source, output_dir, cache_dir)
            payloads.append(payload)
            source_entries.append(
                {
                    "source": source,
                    "slug": payload["slug"],
                    "title": payload["title"],
                    "artifact_dir": str(output_dir / payload["slug"]),
                }
            )
    except RuntimeError as exc:
        # Missing PDF backend: bootstrap via uv when possible, else re-raise.
        if "No PDF text extractor succeeded" in str(exc):
            exit_code = rerun_with_uv_pypdf()
            if exit_code is not None:
                return exit_code
        raise

    (output_dir / "sources.json").write_text(
        json.dumps({"sources": source_entries}, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    build_section_templates(output_dir, payloads)
    build_visual_templates(output_dir, payloads)
    write_aggregate_playbook(output_dir, payloads)
    print(f"reference template patterns written to {output_dir}")
    return 0
907
+
908
+
909
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit code.
    sys.exit(main())