superlab 0.1.64 → 0.1.66

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1200 @@
1
+ #!/usr/bin/env python3
2
+ """Extract reusable writing templates from reference papers.
3
+
4
+ This script is intentionally lightweight. It extracts structure, paragraph
5
+ roles, and visual/table roles for `/lab:write`; it does not summarize or copy
6
+ paper content.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import argparse
12
+ import hashlib
13
+ import html
14
+ import json
15
+ import os
16
+ import re
17
+ import shutil
18
+ import subprocess
19
+ import sys
20
+ import tempfile
21
+ import urllib.parse
22
+ import urllib.request
23
+ from dataclasses import asdict, dataclass
24
+ from pathlib import Path
25
+
26
+
27
# Canonical section family -> lowercase substrings (English and Chinese) that
# identify it in a heading. section_type_for_title matches these as substrings
# of the lowered heading, so aliases are kept short and specific.
SECTION_ALIASES = {
    "abstract": ("abstract", "摘要"),
    "introduction": ("introduction", "intro", "引言", "绪论"),
    "related-work": ("related work", "background", "literature", "相关工作", "文献综述"),
    "method": (
        "method",
        "methods",
        "methodology",
        "approach",
        "model",
        "algorithm",
        "方法",
        "模型",
        "算法",
    ),
    "experiments": (
        "experiment",
        "experiments",
        "evaluation",
        "empirical",
        "results",
        "main results",
        "ablation",
        "sensitivity",
        "coverage",
        "实验",
        "评估",
        "结果",
    ),
    "discussion": ("discussion", "analysis", "讨论", "分析"),
    "conclusion": ("conclusion", "conclusions", "future work", "结论", "总结"),
    "references": ("references", "bibliography", "参考文献"),
    "appendix": ("appendix", "supplement", "附录"),
}

# Bare heading titles that is_heading_line accepts as headings even without
# markdown hashes or section numbering (compared after number/appendix-prefix
# stripping and lowercasing).
CANONICAL_HEADING_TITLES = {
    "abstract",
    "introduction",
    "related work",
    "background",
    "method",
    "methods",
    "methodology",
    "approach",
    "model",
    "algorithm",
    "experiments",
    "experiment",
    "experimental setup",
    "experimental setups",
    "evaluation",
    "results",
    "main results",
    "ablation study",
    "sensitivity analysis",
    "discussion",
    "conclusion",
    "conclusions",
    "references",
    "bibliography",
    "appendix",
}

# Per-slot writing guidance keyed by the slot names that
# experiment_slots_from_signal emits: the question a reader expects the slot
# to answer, and where the slot should sit relative to other material.
EXPERIMENT_PROTOCOL_SLOT_GUIDANCE = {
    "dataset_description": {
        "reader_question": "Which datasets define the evaluation scope, and why are they relevant?",
        "placement_guidance": "place before baselines, metrics, and main results",
    },
    "dataset_statistics": {
        "reader_question": "What dataset scale, feature, treatment, or split facts constrain interpretation?",
        "placement_guidance": "place near dataset descriptions or move detailed statistics to appendix with a main-text pointer",
    },
    "split_protocol": {
        "reader_question": "How are train, validation, test, seed, or sampling decisions made?",
        "placement_guidance": "place before metrics and results so comparisons have a fixed protocol",
    },
    "baseline_setup": {
        "reader_question": "Which comparator families are included, and what role does each comparator play?",
        "placement_guidance": "place after datasets and before the main comparison table",
    },
    "metric_definition": {
        "reader_question": "Which metrics decide ranking, what do they measure, and which direction is better?",
        "placement_guidance": "place before the first result table and repeat local definitions in table notes when needed",
    },
    "implementation_details": {
        "reader_question": "Which tuning, validation, training, or hardware details are needed for reproducibility?",
        "placement_guidance": "place after metrics or in appendix when details are long",
    },
    "main_results": {
        "reader_question": "What is the primary comparison result under the declared protocol?",
        "placement_guidance": "place after setup, baselines, and metrics",
    },
    "ablation": {
        "reader_question": "Which component or design choice accounts for the claimed effect?",
        "placement_guidance": "place after the main results",
    },
    "sensitivity": {
        "reader_question": "How stable is the result under relevant protocol or hyperparameter changes?",
        "placement_guidance": "place after ablations or in an analysis subsection",
    },
    "appendix_dataset_statistics": {
        "reader_question": "Which detailed dataset facts support the compact dataset setup in the main experiments?",
        "placement_guidance": "link from the main experimental setup and keep detailed tables in appendix",
    },
    "appendix_baseline_metric_details": {
        "reader_question": "Which baseline, metric, or implementation details are too long for the main setup?",
        "placement_guidance": "link from the main setup and keep long comparator or metric definitions in appendix",
    },
}
136
+
137
+
138
@dataclass
class SectionRecord:
    """One detected section of an extracted paper: heading, span, and body."""

    title: str  # heading text with markdown hashes stripped
    section_type: str  # canonical family (e.g. "method", "experiments") or "other"
    level: int  # heading depth; 1 = top-level
    start_line: int  # 1-based line index of the heading in the extracted text
    end_line: int  # 1-based line index of the section's last line
    start_page: int | None  # page of the heading when [Page N] markers exist
    end_page: int | None  # page of the section's last line, when known
    content: str  # section body with [Page N] marker lines removed
148
+
149
+
150
@dataclass
class AssetRecord:
    """One table/figure caption plus the reuse guidance derived from it."""

    asset_type: str  # "table" or "figure"
    asset_id: str  # caption identifier, e.g. "2" or "A.1"
    caption: str  # caption text, truncated by short_excerpt
    appears_in_section: str  # section_type of the hosting section
    appears_in_title: str  # title of the hosting section
    evidence_role: str  # classify_asset_role output, e.g. "main_results"
    local_role: str  # what the asset does for the surrounding argument
    placement_logic: str  # where to place an equivalent asset
    text_bridge_before: str  # what prose should precede the asset
    text_bridge_after: str  # what prose should follow the asset
    reuse_guidance: str  # boundary note: reuse structure, never wording
163
+
164
+
165
def parse_args() -> argparse.Namespace:
    """Parse CLI arguments: one or more paper sources plus an output directory."""
    arg_parser = argparse.ArgumentParser(
        description="Extract reference-paper structure templates for /lab:write."
    )
    arg_parser.add_argument(
        "sources", nargs="+", help="Local paper paths or http(s) PDF/HTML URLs"
    )
    arg_parser.add_argument(
        "--output-dir",
        required=True,
        help="Directory to write reference-pattern artifacts",
    )
    return arg_parser.parse_args()
176
+
177
+
178
def slugify(value: str) -> str:
    """Turn a filename or URL fragment into a short filesystem-safe slug.

    Strips a trailing file extension, collapses non-alphanumeric runs into
    single hyphens, lowercases, and truncates to 80 characters. The cut can
    land just after a hyphen, so hyphens are stripped again after truncation
    (the original returned slugs with a dangling "-"). Falls back to
    "reference-paper" when nothing usable remains.
    """
    value = re.sub(r"\.[A-Za-z0-9]+$", "", value)
    slug = re.sub(r"[^A-Za-z0-9]+", "-", value).strip("-").lower()
    return slug[:80].strip("-") or "reference-paper"
182
+
183
+
184
def short_excerpt(text: str, limit: int = 240) -> str:
    """Collapse all whitespace runs and cap the result at *limit* characters.

    Truncated text ends with "..." so the total length never exceeds *limit*.
    """
    flattened = " ".join(text.split())
    if len(flattened) > limit:
        flattened = flattened[: limit - 3].rstrip() + "..."
    return flattened
189
+
190
+
191
def materialize_source(source: str, cache_dir: Path) -> tuple[Path, str]:
    """Resolve *source* to a local file path.

    http(s) URLs are downloaded into *cache_dir* under a slugified name plus a
    short hash of the URL; because the name is content-addressed by URL, an
    already-downloaded file is reused instead of being fetched again (the
    original re-downloaded on every run). Anything else is treated as a local
    path and resolved.

    Returns (path, kind) where kind is "url" or "local".
    """
    parsed = urllib.parse.urlparse(source)
    if parsed.scheme in {"http", "https"}:
        cache_dir.mkdir(parents=True, exist_ok=True)
        suffix = Path(parsed.path).suffix or ".pdf"
        name = slugify(Path(parsed.path).name or parsed.netloc)
        digest = hashlib.sha256(source.encode("utf-8")).hexdigest()[:10]
        target = cache_dir / f"{name}-{digest}{suffix}"
        # Digest-named target: safe to reuse a previous download of this URL.
        if not target.exists():
            urllib.request.urlretrieve(source, target)
        return target, "url"
    return Path(source).expanduser().resolve(), "local"
202
+
203
+
204
def extract_pdf_text(path: Path) -> tuple[str, str, int | None]:
    """Extract text from a PDF, trying several backends in preference order.

    Order: pypdf, pdfplumber, PyMuPDF (fitz), then the `pdftotext` CLI.
    Returns (text, extractor_name, page_count); the library backends prefix
    each page with a "[Page N]" marker so downstream parsing can track page
    spans, while the pdftotext fallback has no markers and an unknown page
    count (None). Raises RuntimeError listing every backend's error when all
    of them fail.
    """
    errors: list[str] = []

    try:
        from pypdf import PdfReader  # type: ignore

        reader = PdfReader(str(path))
        page_texts = []
        for index, page in enumerate(reader.pages, start=1):
            # extract_text() may return None for image-only pages; keep the marker.
            page_texts.append(f"[Page {index}]\n{page.extract_text() or ''}")
        return "\n\n".join(page_texts), "pypdf", len(reader.pages)
    except Exception as exc:  # broad: ImportError and parse failures both fall through
        errors.append(f"pypdf: {exc}")

    try:
        import pdfplumber  # type: ignore

        page_texts = []
        with pdfplumber.open(str(path)) as pdf:
            for index, page in enumerate(pdf.pages, start=1):
                page_texts.append(f"[Page {index}]\n{page.extract_text() or ''}")
        return "\n\n".join(page_texts), "pdfplumber", len(pdf.pages)
    except Exception as exc:
        errors.append(f"pdfplumber: {exc}")

    try:
        import fitz  # type: ignore

        doc = fitz.open(str(path))
        page_texts = [f"[Page {i + 1}]\n{page.get_text()}" for i, page in enumerate(doc)]
        # Capture the count before close(); the document is unusable afterwards.
        page_count = len(doc)
        doc.close()
        return "\n\n".join(page_texts), "pymupdf", page_count
    except Exception as exc:
        errors.append(f"pymupdf: {exc}")

    # Last resort: the poppler CLI. No page markers, page count unknown.
    if shutil.which("pdftotext"):
        with tempfile.TemporaryDirectory() as tmp_dir:
            out_path = Path(tmp_dir) / "paper.txt"
            command = ["pdftotext", "-layout", str(path), str(out_path)]
            completed = subprocess.run(command, capture_output=True, text=True, check=False)
            if completed.returncode == 0:
                return out_path.read_text(encoding="utf-8", errors="ignore"), "pdftotext", None
            errors.append(f"pdftotext: {completed.stderr.strip()}")

    raise RuntimeError(
        "No PDF text extractor succeeded. Install pypdf, pdfplumber, PyMuPDF, or pdftotext. "
        + " | ".join(errors)
    )
253
+
254
+
255
def find_uv() -> str | None:
    """Locate the `uv` binary: PATH first, then ~/.local/bin; None if absent."""
    on_path = shutil.which("uv")
    if on_path:
        return on_path
    fallback = Path.home() / ".local" / "bin" / "uv"
    return str(fallback) if fallback.exists() else None
263
+
264
+
265
def rerun_with_uv_pypdf() -> int | None:
    """Re-execute this script under `uv run --with pypdf`.

    Returns the child's exit code, or None when the re-run is not possible
    (no `uv` found) or not needed. The LAB_REFERENCE_PDF_BOOTSTRAPPED
    environment flag guards against spawning ourselves recursively.
    """
    if os.environ.get("LAB_REFERENCE_PDF_BOOTSTRAPPED"):
        return None
    uv_binary = find_uv()
    if uv_binary is None:
        return None
    child_env = os.environ.copy()
    child_env["LAB_REFERENCE_PDF_BOOTSTRAPPED"] = "1"
    child_argv = [
        uv_binary,
        "run",
        "--with",
        "pypdf",
        "python3",
        str(Path(__file__).resolve()),
        *sys.argv[1:],
    ]
    return subprocess.run(child_argv, env=child_env, check=False).returncode
283
+
284
+
285
def strip_html(text: str) -> str:
    """Reduce an HTML document to plain text, keeping rough block boundaries."""
    # Script/style bodies are never prose; drop them wholesale.
    text = re.sub(r"(?is)<(script|style).*?</\1>", " ", text)
    # <br> and block-closing tags become newlines so paragraph breaks survive.
    text = re.sub(r"(?i)<br\s*/?>", "\n", text)
    text = re.sub(r"(?i)</(p|div|section|article|h[1-6]|li|tr)>", "\n", text)
    # Every remaining tag becomes a space; then squeeze blank-line runs and
    # decode entities.
    tagless = re.sub(r"<[^>]+>", " ", text)
    squeezed = re.sub(r"\n{3,}", "\n\n", tagless)
    return html.unescape(squeezed)
291
+
292
+
293
def read_source_text(path: Path) -> tuple[str, str, int | None]:
    """Load text from a PDF, HTML, or plain-text file.

    Returns (text, extraction_method, page_count); page_count is only known
    for PDFs.
    """
    extension = path.suffix.lower()
    if extension == ".pdf":
        return extract_pdf_text(path)
    raw = path.read_text(encoding="utf-8", errors="ignore")
    if extension in {".htm", ".html"}:
        return strip_html(raw), "html-text", None
    return raw, "plain-text", None
301
+
302
+
303
def detect_title(text: str, fallback: str) -> str:
    """Return the first plausible title line, or *fallback* if none is found.

    Skips blank lines, "[Page N]" markers, caption lines, and anything 180+
    characters long (more likely a body paragraph than a title).
    """
    for raw_line in text.splitlines():
        candidate = re.sub(r"^#+\s*", "", raw_line).strip()
        if not candidate or candidate.startswith("[Page ") or is_caption_line(candidate):
            continue
        if len(candidate) < 180:
            return candidate
    return fallback
313
+
314
+
315
def normalize_heading(line: str) -> str:
    """Strip markdown hash markers and surrounding whitespace from a heading."""
    without_hashes = re.sub(r"^#+\s*", "", line.strip())
    return without_hashes.strip()
318
+
319
+
320
def section_type_for_title(title: str) -> str:
    """Map a heading to a canonical section family via SECTION_ALIASES.

    Appendix-style prefixes ("Appendix ...", "a.1 ...") win before alias
    matching; numbering is stripped before the substring search; unmatched
    headings fall back to "other".
    """
    lowered = title.lower()
    if re.match(r"^(appendix\b|[a-z]\.\d+(?:\.\d+)*\.?\s+)", lowered.strip()):
        return "appendix"
    lowered = re.sub(r"^\d+(?:\.\d+)*\.?\s+", "", lowered).strip()
    for family, alias_group in SECTION_ALIASES.items():
        if any(alias in lowered for alias in alias_group):
            return family
    return "other"
329
+
330
+
331
def heading_level(title: str) -> int:
    """Infer heading depth from markdown hashes or dotted numbering.

    "## X" gives 2; "3.1 X" gives 2 (1 + dot count, capped at 4); anything
    else defaults to 1.
    """
    hash_prefix = re.match(r"^(#{1,6})\s+", title)
    if hash_prefix:
        return len(hash_prefix.group(1))
    dotted = re.match(r"^(\d+(?:\.\d+)*)\.?\s+", title.strip())
    if dotted:
        depth = 1 + dotted.group(1).count(".")
        return depth if depth < 4 else 4
    return 1
339
+
340
+
341
def is_caption_line(line: str) -> bool:
    """True when the line starts like a table/figure caption (e.g. "Table 2")."""
    caption_prefix = r"^(?:Table|TABLE|Figure|FIGURE|Fig\.?)\s+[A-Za-z0-9.]+"
    return re.match(caption_prefix, line.strip()) is not None
343
+
344
+
345
def is_heading_line(line: str) -> bool:
    """Heuristically decide whether a text line is a section heading."""
    candidate = line.strip()
    # Blank lines, page markers, and captions can never be headings.
    if not candidate or candidate.startswith("[Page ") or is_caption_line(candidate):
        return False
    if len(candidate) > 140:
        return False
    # Sentence-style trailing punctuation rules a line out unless it carries
    # explicit section numbering like "3.1 ...".
    if candidate.endswith((".", ",", ";", ":")) and not re.match(r"^\d+(?:\.\d+)*\.?\s+", candidate):
        return False
    if re.match(r"^#{1,4}\s+\S", candidate):
        return True
    # Compare against the canonical title set after removing numbering,
    # roman-numeral, and appendix prefixes.
    bare = re.sub(r"^\d+(?:\.\d+)*\.?\s+", "", candidate.lower()).strip()
    bare = re.sub(r"^[ivx]+\.\s+", "", bare).strip()
    bare = re.sub(r"^appendix\s+[a-z](?:\.\d+)*\.?\s*", "", bare).strip()
    if bare in CANONICAL_HEADING_TITLES:
        return True
    # Numbered / appendix / roman-numeral headings with Title-case text.
    numbered_heading_patterns = (
        r"^\d+(?:\.\d+)*\.?\s+[A-Z][A-Za-z0-9 ,:;()/-]{2,}$",
        r"^[A-Z]\.\d+(?:\.\d+)*\.?\s+[A-Z][A-Za-z0-9 ,:;()/-]{2,}$",
        r"^Appendix\s+[A-Z](?:\.\d+)*\.?\s*[:\-]?\s*[A-Z][A-Za-z0-9 ,:;()/-]{2,}$",
        r"^[IVX]+\.\s+[A-Z][A-Za-z0-9 ,:;()/-]{2,}$",
    )
    return any(re.match(pattern, candidate) for pattern in numbered_heading_patterns)
369
+
370
+
371
def split_sections(text: str) -> list[SectionRecord]:
    """Split extracted text into SectionRecords at detected heading lines.

    Tracks "[Page N]" markers so each section gets a start/end page when the
    extractor provided them. If no heading is found at all, the whole text is
    returned as a single "Full Text" section. After splitting, section types
    are refined in place via infer_section_types.
    """
    lines = text.splitlines()
    headings: list[tuple[int, str, int | None]] = []
    current_page: int | None = None
    # line index -> page the line falls on (None before the first marker).
    line_pages: dict[int, int | None] = {}

    for index, line in enumerate(lines):
        page_match = re.match(r"^\[Page\s+(\d+)\]$", line.strip())
        if page_match:
            current_page = int(page_match.group(1))
        line_pages[index] = current_page
        if is_heading_line(line):
            headings.append((index, normalize_heading(line), current_page))

    if not headings:
        # No structure detected: wrap everything (minus page markers) in one record.
        content = "\n".join(line for line in lines if not line.startswith("[Page "))
        return [
            SectionRecord(
                title="Full Text",
                section_type="other",
                level=1,
                start_line=1,
                end_line=len(lines),
                start_page=None,
                end_page=None,
                content=content,
            )
        ]

    sections: list[SectionRecord] = []
    for pos, (start_index, title, start_page) in enumerate(headings):
        # A section runs until the line before the next heading (or EOF).
        end_index = headings[pos + 1][0] - 1 if pos + 1 < len(headings) else len(lines) - 1
        content_lines = [
            line for line in lines[start_index + 1 : end_index + 1] if not line.startswith("[Page ")
        ]
        end_page = line_pages.get(end_index)
        sections.append(
            SectionRecord(
                title=title,
                section_type=section_type_for_title(title),
                level=heading_level(title),
                start_line=start_index + 1,  # 1-based, heading line itself
                end_line=end_index + 1,
                start_page=start_page,
                end_page=end_page,
                content="\n".join(content_lines).strip(),
            )
        )
    # Mutates the records: unknown subsections inherit their parent's family.
    infer_section_types(sections)
    return sections
421
+
422
+
423
def infer_section_types(sections: list[SectionRecord]) -> None:
    """Recover section families when PDF text only preserves numbered headings.

    Many academic PDF extractors keep subsection titles such as "4.2 Feature
    Selection" but lose the visual hierarchy that tells us they belong to the
    Method section. For template extraction, section family is more useful than
    the literal heading string, so unknown subsections inherit from their nearest
    known parent. Unknown top-level technical sections before Experiments are
    treated as method-style sections, which matches common ML paper structure.
    """

    first_experiment_index = next(
        (index for index, section in enumerate(sections) if section.section_type == "experiments"),
        None,
    )
    first_intro_index = next(
        (index for index, section in enumerate(sections) if section.section_type == "introduction"),
        None,
    )
    # Most recent section type seen at each heading level, used for inheritance.
    parent_by_level: dict[int, str] = {}
    inheritable = {"related-work", "method", "experiments", "discussion", "conclusion", "appendix"}

    for index, section in enumerate(sections):
        # A new heading at level L closes every open section at level >= L.
        for level in list(parent_by_level):
            if level >= section.level:
                parent_by_level.pop(level, None)

        if section.section_type == "other":
            # Inherit from the nearest enclosing level that has a known family.
            inherited = None
            for level in range(section.level - 1, 0, -1):
                parent_type = parent_by_level.get(level)
                if parent_type in inheritable:
                    inherited = parent_type
                    break
            if inherited:
                section.section_type = inherited
            elif (
                # Top-level unknown section strictly between Introduction and
                # Experiments: assume it is method material.
                section.level == 1
                and first_experiment_index is not None
                and index < first_experiment_index
                and (first_intro_index is None or index > first_intro_index)
            ):
                section.section_type = "method"

        parent_by_level[section.level] = section.section_type
468
+
469
+
470
def paragraph_role(section_type: str, paragraph: str) -> str:
    """Classify a paragraph's rhetorical role from keyword heuristics.

    Checks are ordered from most to least specific and the first hit wins, so
    e.g. "dataset statistics" is tested before the generic "dataset" bucket.
    Falls back to a per-section exposition role when nothing matches.
    """
    lowered = paragraph.lower()
    if is_caption_line(paragraph):
        return "visual_or_table_anchor"
    if section_type == "abstract":
        return "abstract_summary"
    if any(token in lowered for token in ("dataset statistics", "statistics of the dataset", "sample count", "feature dimension")):
        return "dataset_statistics"
    # "Datasets:" / "Dataset." lead-ins or explicit dataset-kind phrases.
    if re.search(r"\bdatasets?\s*[:.]", lowered) or any(
        token in lowered for token in ("benchmark dataset", "public dataset", "semi-synthetic dataset")
    ):
        return "dataset_description"
    if re.search(r"\bbaselines?\s*[:.]", lowered) or any(
        token in lowered for token in ("baseline family", "comparator", "compare with")
    ):
        return "baseline_setup"
    if re.search(r"\bmetrics?\s*[:.]", lowered) or any(
        token in lowered
        for token in (
            "metric definition",
            "we report",
            "primary metric",
            "secondary metric",
            "higher is better",
            "lower is better",
            "auuc",
            "qini",
        )
    ):
        return "metric_definition"
    if any(
        token in lowered
        for token in (
            "train/test",
            "train, validation",
            "training split",
            "validation split",
            "test split",
            "random split",
            "repeated split",
            "sampling",
            "seed",
            "protocol",
        )
    ):
        return "split_protocol"
    if any(
        token in lowered
        for token in (
            "implementation detail",
            "hyperparameter",
            "tuning",
            "learning rate",
            "epoch",
            "batch size",
            "hardware",
            "gpu",
        )
    ):
        return "implementation_details"
    if any(token in lowered for token in ("limitation", "future work", "caveat", "drift", "局限")):
        return "limitation_boundary"
    if any(token in lowered for token in ("ablation", "component", "without", "remove")):
        return "ablation_interpretation"
    if any(token in lowered for token in ("result", "outperform", "improve", "coverage", "interval length", "gain")):
        return "result_interpretation"
    if any(token in lowered for token in ("baseline", "compare", "comparator")):
        return "comparator_setup"
    if any(token in lowered for token in ("dataset", "benchmark", "metric", "protocol", "split", "auuc", "qini")):
        return "experimental_protocol"
    if any(token in lowered for token in ("contribution", "we propose", "our method", "framework", "model")):
        return "contribution_or_method_claim"
    if any(token in lowered for token in ("existing", "requires", "problem", "gap", "lack", "overfit")):
        return "problem_or_gap"
    # Keyword-free paragraphs: fall back to a generic role per section family.
    if section_type == "method":
        return "method_exposition"
    if section_type == "experiments":
        return "experiment_exposition"
    return "section_exposition"
549
+
550
+
551
def split_paragraphs(text: str) -> list[str]:
    """Split text on blank lines into trimmed, non-empty paragraphs."""
    paragraphs: list[str] = []
    for block in re.split(r"\n\s*\n", text):
        kept = [piece.strip() for piece in block.splitlines() if piece.strip()]
        if kept:
            paragraphs.append("\n".join(kept))
    return paragraphs
559
+
560
+
561
def caption_match(line: str) -> re.Match[str] | None:
    """Match a caption line, exposing `kind`, `id`, and `caption` named groups."""
    caption_re = (
        r"^(?P<kind>Table|TABLE|Figure|FIGURE|Fig\.?)\s+"
        r"(?P<id>[A-Za-z0-9.]+)\s*[:.\-]?\s*(?P<caption>.*)$"
    )
    return re.match(caption_re, line.strip())
566
+
567
+
568
def classify_asset_role(asset_type: str, caption: str) -> str:
    """Assign an evidence role to a table/figure from caption keywords.

    Role checks run from most to least specific; the first match wins. With no
    match, tables default to "supporting_evidence" and figures to
    "conceptual_visual".
    """
    lowered = caption.lower()
    ordered_keyword_roles = (
        ("ablation", ("ablation", "component")),
        ("uncertainty_or_tradeoff", ("coverage", "interval", "trade-off", "tradeoff", "sensitivity", "shift")),
        ("main_results", ("main", "result", "performance", "ranking", "auuc", "qini")),
        ("dataset_or_protocol", ("dataset", "statistics", "summary", "treatment assignment")),
        ("method_overview", ("overview", "framework", "pipeline", "architecture", "model")),
        ("case_analysis", ("case", "example")),
    )
    for role, keywords in ordered_keyword_roles:
        if any(keyword in lowered for keyword in keywords):
            return role
    return "supporting_evidence" if asset_type == "table" else "conceptual_visual"
583
+
584
+
585
def caption_needs_continuation(caption: str) -> bool:
    """True when a caption looks truncated: short and missing end punctuation."""
    trimmed = caption.strip()
    if not trimmed or len(trimmed) >= 140:
        return False
    return re.search(r"[.!?)]$", trimmed) is None
588
+
589
+
590
def extend_caption(caption: str, lines: list[str], start_index: int) -> str:
    """Append up to three follow-up lines while the caption still looks cut off.

    Extension stops at page markers, new captions, headings, or very long
    lines (body text); blank lines are skipped. The joined result has its
    whitespace collapsed.
    """
    pieces = [caption.strip()]
    cursor = start_index + 1
    while caption_needs_continuation(" ".join(pieces)) and cursor < len(lines) and len(pieces) < 4:
        follow_up = lines[cursor].strip()
        cursor += 1
        if not follow_up:
            continue
        if follow_up.startswith("[Page ") or is_caption_line(follow_up) or is_heading_line(follow_up):
            break
        if len(follow_up) > 180:
            break
        pieces.append(follow_up)
    return re.sub(r"\s+", " ", " ".join(pieces)).strip()
604
+
605
+
606
def asset_role_text(evidence_role: str) -> tuple[str, str, str, str]:
    """Return (local_role, bridge_before, bridge_after, placement) for a role.

    Unknown roles get a generic "supports a local claim" tuple.
    """
    generic = (
        "supports a local subsection claim",
        "state the local question before the asset",
        "explain the takeaway after the asset",
        "place next to the subsection claim it supports",
    )
    role_texts = {
        "main_results": (
            "answers the primary comparison question",
            "state the comparison question and ranking metric before the table",
            "interpret ranking, margin, and evidence strength after the table",
            "place after protocol, metrics, and comparator definitions",
        ),
        "ablation": (
            "isolates which component supports the claimed gain",
            "name the design choice being tested before the asset",
            "explain the component-level implication after the asset",
            "place after main results so the reader first sees the headline effect",
        ),
        "dataset_or_protocol": (
            "defines the evaluation substrate and comparison scope",
            "tell the reader why the dataset or protocol matters",
            "connect the protocol to the later comparison table",
            "place before baseline and result interpretation",
        ),
        "method_overview": (
            "orients the reader to the method pipeline",
            "introduce the mechanism or module sequence before the figure",
            "map visual elements to later method prose after the figure",
            "place near the beginning of the method section",
        ),
        "uncertainty_or_tradeoff": (
            "explains a robustness, uncertainty, or trade-off pattern",
            "state the trade-off or diagnostic question before the asset",
            "explain the mechanism or limitation revealed by the asset",
            "place after the main result table or in analysis/sensitivity subsections",
        ),
    }
    return role_texts.get(evidence_role, generic)
648
+
649
+
650
def extract_assets(sections: list[SectionRecord]) -> list[AssetRecord]:
    """Collect a table/figure AssetRecord for every caption line in *sections*."""
    records: list[AssetRecord] = []
    for section in sections:
        section_lines = section.content.splitlines()
        for line_index, raw_line in enumerate(section_lines):
            matched = caption_match(raw_line)
            if matched is None:
                continue
            kind_token = matched.group("kind").lower()
            asset_type = "table" if kind_token.startswith("table") else "figure"
            # Fall back to the full line when the caption body is empty, then
            # pull in continuation lines for wrapped captions.
            caption_text = matched.group("caption").strip() or raw_line.strip()
            caption_text = extend_caption(caption_text, section_lines, line_index)
            evidence_role = classify_asset_role(asset_type, caption_text)
            local_role, bridge_before, bridge_after, placement = asset_role_text(evidence_role)
            records.append(
                AssetRecord(
                    asset_type=asset_type,
                    asset_id=matched.group("id"),
                    caption=short_excerpt(caption_text, 180),
                    appears_in_section=section.section_type,
                    appears_in_title=section.title,
                    evidence_role=evidence_role,
                    local_role=local_role,
                    placement_logic=placement,
                    text_bridge_before=bridge_before,
                    text_bridge_after=bridge_after,
                    reuse_guidance="Reuse the asset function and placement logic only; do not copy caption wording or claims.",
                )
            )
    return records
680
+
681
+
682
def build_paragraph_roles(sections: list[SectionRecord]) -> list[dict]:
    """Tag every paragraph with its rhetorical role plus a short excerpt."""
    tagged: list[dict] = []
    for section in sections:
        for position, paragraph in enumerate(split_paragraphs(section.content), start=1):
            tagged.append(
                {
                    "section": section.section_type,
                    "section_title": section.title,
                    "paragraph_index": position,
                    "role": paragraph_role(section.section_type, paragraph),
                    "excerpt": short_excerpt(paragraph),
                    "reuse_guidance": "Reuse the paragraph function, not the source wording.",
                }
            )
    return tagged
697
+
698
+
699
def section_slots(section: SectionRecord, paragraph_roles: list[dict], assets: list[AssetRecord]) -> list[str]:
    """Build the ordered slot list for one section.

    Order: the section's own (de-numbered, lowercased) title unless it is a
    generic experiments/results title, then each distinct paragraph role with
    underscores spaced out, then one entry per asset in the section.

    Bug fix: the original deduplicated on the underscored role name while
    storing the space-separated form, so any role containing an underscore was
    appended once per occurrence. Deduplication now compares the stored form.
    """
    slots: list[str] = []
    title = re.sub(r"^\d+(?:\.\d+)*\.?\s+", "", section.title).strip()
    if title and title.lower() not in {"experiments", "experiment", "results"}:
        slots.append(title.lower())
    for role in paragraph_roles:
        if role["section_title"] != section.title:
            continue
        slot_name = role["role"].replace("_", " ")
        if slot_name not in slots:
            slots.append(slot_name)
    for asset in assets:
        if asset.appears_in_title == section.title:
            slots.append(f"{asset.asset_type}: {asset.evidence_role}")
    return slots
711
+
712
+
713
def write_paper_artifacts(
    paper_dir: Path,
    source: str,
    materialized_path: Path,
    source_kind: str,
    title: str,
    method: str,
    page_count: int | None,
    sections: list[SectionRecord],
    roles: list[dict],
    assets: list[AssetRecord],
) -> dict:
    """Write every per-paper artifact (JSON + markdown) into *paper_dir*.

    Emits metadata.json, section-map.json (section records minus raw content),
    paragraph-roles.json, visual-assets.json, section-logic.md,
    writing-patterns.md, and extraction-report.md. Returns the metadata dict
    so callers can aggregate an index across papers.
    """
    paper_dir.mkdir(parents=True, exist_ok=True)
    metadata = {
        "source": source,
        "source_kind": source_kind,
        "materialized_path": str(materialized_path),
        "title": title,
        "extraction_method": method,
        "page_count": page_count,
        "section_count": len(sections),
        "visual_asset_count": len(assets),
    }
    (paper_dir / "metadata.json").write_text(json.dumps(metadata, indent=2, ensure_ascii=False), encoding="utf-8")
    # The section map is structural only: drop each record's "content" field.
    (paper_dir / "section-map.json").write_text(
        json.dumps(
            [
                {
                    key: value
                    for key, value in asdict(section).items()
                    if key != "content"
                }
                for section in sections
            ],
            indent=2,
            ensure_ascii=False,
        ),
        encoding="utf-8",
    )
    (paper_dir / "paragraph-roles.json").write_text(json.dumps(roles, indent=2, ensure_ascii=False), encoding="utf-8")
    (paper_dir / "visual-assets.json").write_text(
        json.dumps([asdict(asset) for asset in assets], indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    write_section_logic(paper_dir / "section-logic.md", title, sections, roles, assets)
    write_writing_patterns(paper_dir / "writing-patterns.md", title, sections, roles, assets)
    (paper_dir / "extraction-report.md").write_text(
        "\n".join(
            [
                f"# Extraction Report: {title}",
                "",
                f"- Source: `{source}`",
                f"- Extraction method: `{method}`",
                f"- Sections detected: {len(sections)}",
                f"- Visual/table assets detected: {len(assets)}",
                "- Boundary: this artifact extracts reusable structure only; it must not be used as evidence for the current paper.",
            ]
        )
        + "\n",
        encoding="utf-8",
    )
    return metadata
775
+
776
+
777
def write_section_logic(path: Path, title: str, sections: list[SectionRecord], roles: list[dict], assets: list[AssetRecord]) -> None:
    """Write a per-section markdown summary of paragraph and asset roles."""
    output = [f"# Section Logic: {title}", ""]
    for section in sections:
        matching_roles = [entry for entry in roles if entry["section_title"] == section.title]
        matching_assets = [entry for entry in assets if entry.appears_in_title == section.title]
        output.append(f"## {section.title}")
        output.append("")
        output.append(f"- Section type: `{section.section_type}`")
        if matching_roles:
            # dict.fromkeys keeps first-seen order while deduplicating.
            unique_roles = dict.fromkeys(entry["role"] for entry in matching_roles)
            output.append(f"- Paragraph roles: {', '.join(unique_roles)}")
        if matching_assets:
            asset_summary = ", ".join(f"{entry.asset_type}:{entry.evidence_role}" for entry in matching_assets)
            output.append(f"- Asset roles: {asset_summary}")
        output.append("- Reuse: preserve the slot order and rhetorical function, not the wording.")
        output.append("")
    path.write_text("\n".join(output).strip() + "\n", encoding="utf-8")
794
+
795
+
796
def write_writing_patterns(path: Path, title: str, sections: list[SectionRecord], roles: list[dict], assets: list[AssetRecord]) -> None:
    """Write the reuse-oriented writing-pattern summary for one paper.

    Note: *roles* is accepted for signature symmetry with write_section_logic
    but is not used here.
    """
    output = [
        f"# Writing Patterns: {title}",
        "",
        "## Template Use",
        "",
        "- Use this paper as one structural template among several references.",
        "- Reproduce section slots, paragraph functions, and asset placement logic only.",
        "- Do not copy wording, claims, metrics, or conclusions.",
        "",
        "## Observed Section Slots",
        "",
    ]
    for section in sections:
        output.append(f"- {section.section_type}: {section.title}")
    output.extend(["", "## Visual/Table Pattern", ""])
    if assets:
        output.extend(
            f"- {entry.asset_type} {entry.asset_id}: {entry.evidence_role} -> {entry.local_role}"
            for entry in assets
        )
    else:
        output.append("- No table or figure caption was detected.")
    path.write_text("\n".join(output).strip() + "\n", encoding="utf-8")
817
+
818
+
819
def merge_unique(values: list[str]) -> list[str]:
    """Normalize whitespace and deduplicate case-insensitively, keeping order."""
    result: list[str] = []
    seen_keys: set[str] = set()
    for raw in values:
        cleaned = re.sub(r"\s+", " ", raw.strip())
        if not cleaned:
            continue
        fingerprint = cleaned.lower()
        if fingerprint in seen_keys:
            continue
        seen_keys.add(fingerprint)
        result.append(cleaned)
    return result
830
+
831
+
832
def normalized_section_title(title: str) -> str:
    """Drop numbering and appendix prefixes, collapse spaces, and lowercase."""
    stripped = title.strip()
    stripped = re.sub(r"^\d+(?:\.\d+)*\.?\s+", "", stripped)
    stripped = re.sub(r"^[A-Z]\.\d+(?:\.\d+)*\.?\s+", "", stripped)
    stripped = re.sub(r"^Appendix\s+[A-Z](?:\.\d+)*\.?\s*[:\-]?\s*", "", stripped, flags=re.IGNORECASE)
    return re.sub(r"\s+", " ", stripped).strip().lower()
837
+
838
+
839
def experiment_slots_from_signal(section: SectionRecord, role: str, text: str) -> list[str]:
    """Derive experiment-protocol slot names from a section/paragraph signal.

    Combines the normalized section title with the paragraph text, then tests
    keyword and role conditions; several slots can fire for one signal. Slot
    names match the keys of EXPERIMENT_PROTOCOL_SLOT_GUIDANCE. The result is
    deduplicated while preserving first-seen order.
    """
    title = normalized_section_title(section.title)
    # One lowered haystack so checks can hit either the title or the text.
    lowered = f"{title} {text.lower()}"
    is_appendix = section.section_type == "appendix"
    slots: list[str] = []

    # Appendix-specific slots are tested first so appendix material also maps
    # to its main-text counterparts below.
    if is_appendix and "dataset" in lowered and any(
        token in lowered for token in ("statistics", "statistic", "summary", "sample", "feature")
    ):
        slots.append("appendix_dataset_statistics")
    if is_appendix and any(token in lowered for token in ("baseline", "metric", "implementation", "hyperparameter")):
        slots.append("appendix_baseline_metric_details")
    if role == "dataset_statistics" or (
        "dataset" in lowered and any(token in lowered for token in ("statistics", "summary", "sample", "feature"))
    ):
        slots.append("dataset_statistics")
    if role == "dataset_description" or "datasets" in title or "dataset" in title:
        slots.append("dataset_description")
    if role == "split_protocol" or any(
        token in lowered
        for token in ("split", "seed", "sampling", "train/test", "validation", "protocol")
    ):
        slots.append("split_protocol")
    if role == "baseline_setup" or "baseline" in lowered or "comparator" in lowered:
        slots.append("baseline_setup")
    if role == "metric_definition" or "metric" in lowered or "auuc" in lowered or "qini" in lowered:
        slots.append("metric_definition")
    if role == "implementation_details" or any(
        token in lowered
        for token in ("implementation", "hyperparameter", "tuning", "epoch", "learning rate", "hardware")
    ):
        slots.append("implementation_details")
    if "ablation" in lowered:
        slots.append("ablation")
    if "sensitivity" in lowered or "shift" in lowered or "trade-off" in lowered or "tradeoff" in lowered:
        slots.append("sensitivity")
    if role == "result_interpretation" or any(
        token in lowered for token in ("main result", "overall performance", "performance", "results and discussion")
    ):
        slots.append("main_results")
    # dict.fromkeys deduplicates while preserving insertion order.
    return list(dict.fromkeys(slots))
880
+
881
+
882
def is_experiment_protocol_section(section: SectionRecord) -> bool:
    """Return True when a section may carry experiment-protocol slots.

    Any experiments/discussion section qualifies; an appendix qualifies only
    when its normalized title mentions an experiment-related topic.
    """
    kind = section.section_type
    if kind in ("experiments", "discussion"):
        return True
    if kind != "appendix":
        return False
    topics = (
        "dataset",
        "baseline",
        "metric",
        "experiment",
        "experimental",
        "setup",
        "result",
        "ablation",
        "sensitivity",
        "complexity",
        "online",
    )
    title = normalized_section_title(section.title)
    return any(topic in title for topic in topics)
904
+
905
+
906
def slot_payload(
    *,
    source_paper: str,
    slot: str,
    section: SectionRecord,
    evidence_excerpt: str,
    paragraph_index: int | None = None,
    asset_type: str | None = None,
    asset_id: str | None = None,
) -> dict:
    """Build one experiment-protocol slot record.

    Looks up the reader question and placement guidance for *slot* and
    attaches provenance: paper slug, section heading/type, paragraph index,
    and a shortened evidence excerpt. Asset identifiers are included only
    when both asset_type and asset_id are provided.
    """
    guidance = EXPERIMENT_PROTOCOL_SLOT_GUIDANCE[slot]
    record = {
        "slot": slot,
        "source_paper": source_paper,
        "source_heading": section.title,
        "source_section_type": section.section_type,
        "paragraph_index": paragraph_index,
        "evidence_excerpt": short_excerpt(evidence_excerpt),
        "reader_question": guidance["reader_question"],
        "placement_guidance": guidance["placement_guidance"],
        # Appendix slots point back to the main experiments section.
        "linked_main_section": "experiments" if slot.startswith("appendix_") else "",
        "reuse_guidance": "Reuse this protocol role and placement logic only; do not copy wording, claims, metrics, data, or conclusions.",
    }
    if asset_type and asset_id:
        record["asset_type"] = asset_type
        record["asset_id"] = asset_id
    return record
933
+
934
+
935
def build_experiment_protocol_slots_for_payload(payload: dict) -> list[dict]:
    """Collect deduplicated experiment-protocol slot records for one paper.

    Scans the paper's paragraph roles and its table/figure assets, keeps only
    signals coming from protocol-bearing sections, and maps each signal to
    slot payloads. Records that share (paper, slot, heading, excerpt) —
    compared case-insensitively on heading and excerpt — keep only their
    first occurrence.
    """
    by_title = {section.title: section for section in payload["sections"]}
    collected: list[dict] = []

    # Paragraph-role signals.
    for role in payload["roles"]:
        section = by_title.get(role["section_title"])
        if section is None or not is_experiment_protocol_section(section):
            continue
        for slot in experiment_slots_from_signal(section, role["role"], role["excerpt"]):
            collected.append(
                slot_payload(
                    source_paper=payload["slug"],
                    slot=slot,
                    section=section,
                    paragraph_index=role["paragraph_index"],
                    evidence_excerpt=role["excerpt"],
                )
            )

    # Table/figure caption signals.
    for asset in payload["assets"]:
        section = by_title.get(asset.appears_in_title)
        if section is None or not is_experiment_protocol_section(section):
            continue
        asset_slots = experiment_slots_from_signal(section, asset.evidence_role, asset.caption)
        if asset.evidence_role == "dataset_or_protocol" and section.section_type == "appendix":
            # Appendix dataset/protocol assets always map to the dedicated slot.
            asset_slots = ["appendix_dataset_statistics"]
        for slot in asset_slots:
            collected.append(
                slot_payload(
                    source_paper=payload["slug"],
                    slot=slot,
                    section=section,
                    evidence_excerpt=asset.caption,
                    asset_type=asset.asset_type,
                    asset_id=asset.asset_id,
                )
            )

    # First-wins dedupe; dict preserves insertion order.
    deduped: dict[tuple[str, str, str, str], dict] = {}
    for record in collected:
        key = (
            record["source_paper"],
            record["slot"],
            record["source_heading"].lower(),
            record["evidence_excerpt"].lower(),
        )
        deduped.setdefault(key, record)
    return list(deduped.values())
996
+
997
+
998
def build_section_templates(output_dir: Path, paper_payloads: list[dict]) -> None:
    """Write one JSON template per canonical section type to
    ``<output_dir>/section-templates/``.

    Each template merges observed titles, section slots, paragraph roles, and
    asset roles across all reference papers. The "experiments" template also
    folds in "discussion" sections and gains experiment-protocol slots, which
    are additionally mirrored to a standalone experiments-protocol.json.
    Section types with no matching sections in any paper produce no file.
    """
    target = output_dir / "section-templates"
    target.mkdir(parents=True, exist_ok=True)
    section_types = ["abstract", "introduction", "related-work", "method", "experiments", "discussion", "conclusion"]
    for section_type in section_types:
        source_papers: list[str] = []
        observed_titles: list[str] = []
        slots: list[str] = []
        paragraph_roles: list[str] = []
        asset_roles: list[dict] = []
        for payload in paper_payloads:
            matching_sections = [section for section in payload["sections"] if section.section_type == section_type]
            if section_type == "experiments":
                # Discussion sections often carry result analysis, so they are
                # merged into the experiments template as well.
                matching_sections.extend(
                    section for section in payload["sections"] if section.section_type in {"discussion"}
                )
            if not matching_sections:
                continue
            source_papers.append(payload["slug"])
            for section in matching_sections:
                observed_titles.append(section.title)
                slots.extend(section_slots(section, payload["roles"], payload["assets"]))
                # NOTE(review): roles are filtered on the canonical type only,
                # so discussion paragraph roles are not merged into
                # "experiments" here even though its titles/slots are — confirm
                # this asymmetry is intended.
                paragraph_roles.extend(
                    role["role"] for role in payload["roles"] if role["section"] == section_type
                )
            asset_roles.extend(
                {
                    "asset_type": asset.asset_type,
                    "evidence_role": asset.evidence_role,
                    "local_role": asset.local_role,
                    "source_paper": payload["slug"],
                }
                for asset in payload["assets"]
                if asset.appears_in_section == section_type
                or (section_type == "experiments" and asset.appears_in_section in {"experiments", "discussion"})
            )
        if not source_papers:
            continue
        template = {
            "section": section_type,
            "template_id": f"{section_type}-multi-reference-template",
            "source_papers": merge_unique(source_papers),
            "observed_titles": merge_unique(observed_titles),
            "section_slots": merge_unique(slots),
            "paragraph_roles": merge_unique(paragraph_roles),
            "asset_roles": asset_roles,
            "reuse_rule": "Reuse structure only; do not copy wording, claims, metrics, or conclusions from reference papers.",
        }
        if section_type == "experiments":
            # Protocol slots span every payload, not just those that had a
            # matching experiments/discussion section above.
            protocol_slots: list[dict] = []
            for payload in paper_payloads:
                protocol_slots.extend(build_experiment_protocol_slots_for_payload(payload))
            template["experiment_protocol_slots"] = protocol_slots
        (target / f"{section_type}.json").write_text(
            json.dumps(template, indent=2, ensure_ascii=False),
            encoding="utf-8",
        )
        if section_type == "experiments":
            # Mirror the protocol slots into a dedicated file for consumers
            # that only need the experiment-setup topology.
            (target / "experiments-protocol.json").write_text(
                json.dumps(
                    {
                        "section": "experiments",
                        "template_id": "experiments-protocol-slots",
                        "source_papers": template["source_papers"],
                        "experiment_protocol_slots": template["experiment_protocol_slots"],
                        "reuse_rule": "Reuse experiment setup topology only: dataset, split, baseline, metric, implementation, result, ablation, sensitivity, and appendix-link roles.",
                    },
                    indent=2,
                    ensure_ascii=False,
                ),
                encoding="utf-8",
            )
1070
+
1071
+
1072
def build_visual_templates(output_dir: Path, paper_payloads: list[dict]) -> None:
    """Write ``visual-templates/experiment-assets.json``: table/figure roles
    drawn from the method, experiments, and discussion sections of every
    reference paper, tagged with the source paper slug."""
    target = output_dir / "visual-templates"
    target.mkdir(parents=True, exist_ok=True)
    relevant = {"experiments", "discussion", "method"}
    asset_roles: list[dict] = []
    for payload in paper_payloads:
        for asset in payload["assets"]:
            if asset.appears_in_section not in relevant:
                continue
            entry = asdict(asset)
            entry["source_paper"] = payload["slug"]
            asset_roles.append(entry)
    template = {
        "template_id": "experiment-visual-and-table-template",
        "asset_roles": asset_roles,
        "reuse_rule": "Use table and figure functions, placement, and bridge logic to plan current paper assets; do not copy captions or data.",
    }
    out_path = target / "experiment-assets.json"
    out_path.write_text(
        json.dumps(template, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
1091
+
1092
+
1093
def write_aggregate_playbook(output_dir: Path, paper_payloads: list[dict]) -> None:
    """Write ``aggregate-template-playbook.md``: the cross-paper reuse guide
    that `/lab:write` follows when combining section templates.

    The document lists every source paper, the reuse boundary, the
    multi-template write procedure, and the table/figure planning rule.
    """
    header = [
        "# Aggregate Template Playbook",
        "",
        "Purpose: help `/lab:write` reproduce mature paper structure from multiple reference templates.",
        "",
        "## Sources",
        "",
    ]
    source_lines = [f"- `{payload['slug']}`: {payload['title']}" for payload in paper_payloads]
    body = [
        "",
        "## Reuse Boundary",
        "",
        "- Reproduce section order, paragraph roles, table/figure function, and bridge logic.",
        "- Do not copy wording, claims, experimental conclusions, metrics, or terminology names.",
        "- If only one reference supports a structure, treat it as a single-template pattern, not a universal rule.",
        "",
        "## Multi-Template Write Procedure",
        "",
        "1. Pick 2-3 closest section templates for the current paper section.",
        "2. For experiment sections, preserve protocol slots when present: datasets, splits, baselines, metrics, implementation details, main results, ablations, sensitivity analysis, and appendix links.",
        "3. Build a mini-outline from common slots and current-paper evidence.",
        "4. Add required table/figure assets with local before/after bridge functions.",
        "5. Draft with current-paper terminology and evidence only.",
        "",
        "## Table/Figure Planning Rule",
        "",
        "Every major table or figure should answer a reader question, appear near the subsection claim it supports, and have one bridge sentence before and one interpretation sentence after it.",
    ]
    content = "\n".join(header + source_lines + body) + "\n"
    (output_dir / "aggregate-template-playbook.md").write_text(content, encoding="utf-8")
1127
+
1128
+
1129
def process_source(source: str, output_dir: Path, cache_dir: Path) -> dict:
    """Run the full extraction pipeline for a single reference source.

    Materializes the source into a local file (presumably caching remote
    sources under *cache_dir* — see materialize_source), extracts its text,
    splits it into sections, derives paragraph roles and table/figure assets,
    and writes the per-paper artifacts under ``<output_dir>/<slug>/``.

    Returns:
        A payload dict with keys ``slug``, ``title``, ``metadata``,
        ``sections``, ``roles``, and ``assets`` — the shape consumed by the
        template builders.

    Raises:
        FileNotFoundError: if the materialized source path does not exist.
    """
    materialized_path, source_kind = materialize_source(source, cache_dir)
    if not materialized_path.exists():
        raise FileNotFoundError(f"source not found: {source}")
    # method names the extraction strategy that succeeded; page_count is
    # recorded alongside it in the paper metadata.
    text, method, page_count = read_source_text(materialized_path)
    # Fall back to the file stem when no title is detected.
    title = detect_title(text, materialized_path.stem)
    slug = slugify(title or materialized_path.stem)
    sections = split_sections(text)
    roles = build_paragraph_roles(sections)
    assets = extract_assets(sections)
    metadata = write_paper_artifacts(
        output_dir / slug,
        source,
        materialized_path,
        source_kind,
        title,
        method,
        page_count,
        sections,
        roles,
        assets,
    )
    return {
        "slug": slug,
        "title": title,
        "metadata": metadata,
        "sections": sections,
        "roles": roles,
        "assets": assets,
    }
1159
+
1160
+
1161
def main() -> int:
    """CLI entry point: process every source, then build aggregate templates.

    Processes each source into per-paper artifacts, records a sources.json
    index, and builds section templates, visual templates, and the aggregate
    playbook under the output directory.

    Returns:
        Process exit code: 0 on success, or the exit code of a bootstrapped
        re-run when PDF extraction had to be retried via uv/pypdf.
    """
    args = parse_args()
    output_dir = Path(args.output_dir).expanduser().resolve()
    output_dir.mkdir(parents=True, exist_ok=True)
    cache_dir = output_dir / ".cache"

    payloads: list[dict] = []
    source_entries: list[dict] = []
    try:
        for source in args.sources:
            payload = process_source(source, output_dir, cache_dir)
            payloads.append(payload)
            source_entries.append(
                {
                    "source": source,
                    "slug": payload["slug"],
                    "title": payload["title"],
                    "artifact_dir": str(output_dir / payload["slug"]),
                }
            )
    except RuntimeError as exc:
        # Only the "no extractor" failure is recoverable: retry once via
        # rerun_with_uv_pypdf (presumably re-executing this script with pypdf
        # available). Any other RuntimeError, or a failed bootstrap (None),
        # re-raises the original error.
        if "No PDF text extractor succeeded" in str(exc):
            bootstrapped = rerun_with_uv_pypdf()
            if bootstrapped is not None:
                return bootstrapped
        raise

    (output_dir / "sources.json").write_text(
        json.dumps({"sources": source_entries}, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    build_section_templates(output_dir, payloads)
    build_visual_templates(output_dir, payloads)
    write_aggregate_playbook(output_dir, payloads)
    print(f"reference template patterns written to {output_dir}")
    return 0
1197
+
1198
+
1199
+ if __name__ == "__main__":
1200
+ raise SystemExit(main())