superlab 0.1.71 → 0.1.72
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/i18n.cjs +89 -6
- package/lib/lab_idea_contract.json +4 -4
- package/lib/lab_write_contract.json +1 -1
- package/package-assets/claude/commands/lab/idea.md +1 -1
- package/package-assets/claude/commands/lab/report.md +1 -0
- package/package-assets/claude/commands/lab/write.md +1 -0
- package/package-assets/claude/commands/lab-idea.md +1 -1
- package/package-assets/claude/commands/lab-report.md +1 -0
- package/package-assets/claude/commands/lab-write.md +1 -0
- package/package-assets/claude/commands/lab:idea.md +1 -1
- package/package-assets/claude/commands/lab:report.md +1 -0
- package/package-assets/claude/commands/lab:write.md +1 -0
- package/package-assets/claude/commands/lab/357/274/232idea.md +1 -1
- package/package-assets/claude/commands/lab/357/274/232report.md +1 -0
- package/package-assets/claude/commands/lab/357/274/232write.md +1 -0
- package/package-assets/codex/prompts/lab/idea.md +1 -1
- package/package-assets/codex/prompts/lab/report.md +1 -0
- package/package-assets/codex/prompts/lab/write.md +1 -1
- package/package-assets/codex/prompts/lab-idea.md +1 -1
- package/package-assets/codex/prompts/lab-report.md +1 -0
- package/package-assets/codex/prompts/lab-write.md +1 -1
- package/package-assets/codex/prompts/lab:idea.md +1 -1
- package/package-assets/codex/prompts/lab:report.md +1 -0
- package/package-assets/codex/prompts/lab:write.md +1 -1
- package/package-assets/codex/prompts/lab/357/274/232idea.md +1 -1
- package/package-assets/codex/prompts/lab/357/274/232report.md +1 -0
- package/package-assets/codex/prompts/lab/357/274/232write.md +1 -1
- package/package-assets/shared/lab/.managed/scripts/validate_collaborator_report.py +55 -1
- package/package-assets/shared/lab/.managed/scripts/validate_idea_artifact.py +75 -0
- package/package-assets/shared/lab/.managed/scripts/validate_section_draft.py +119 -0
- package/package-assets/shared/lab/.managed/scripts/validate_stage_report.py +246 -0
- package/package-assets/shared/lab/.managed/templates/final-report.md +11 -0
- package/package-assets/shared/lab/.managed/templates/idea.md +18 -0
- package/package-assets/shared/lab/.managed/templates/main-tables.md +6 -0
- package/package-assets/shared/lab/.managed/templates/paper-plan.md +9 -0
- package/package-assets/shared/lab/.managed/templates/stage-report.md +19 -0
- package/package-assets/shared/lab/.managed/templates/write-iteration.md +13 -0
- package/package-assets/shared/skills/lab/SKILL.md +18 -0
- package/package-assets/shared/skills/lab/references/paper-writing/abstract.md +14 -0
- package/package-assets/shared/skills/lab/references/paper-writing/conclusion.md +13 -0
- package/package-assets/shared/skills/lab/references/paper-writing/experiments.md +19 -0
- package/package-assets/shared/skills/lab/references/paper-writing/introduction.md +17 -2
- package/package-assets/shared/skills/lab/references/paper-writing/method.md +10 -0
- package/package-assets/shared/skills/lab/references/paper-writing/section-style-policies.md +10 -1
- package/package-assets/shared/skills/lab/stages/auto.md +20 -0
- package/package-assets/shared/skills/lab/stages/data.md +3 -0
- package/package-assets/shared/skills/lab/stages/framing.md +3 -0
- package/package-assets/shared/skills/lab/stages/idea.md +33 -19
- package/package-assets/shared/skills/lab/stages/iterate.md +3 -0
- package/package-assets/shared/skills/lab/stages/report.md +11 -0
- package/package-assets/shared/skills/lab/stages/review.md +3 -0
- package/package-assets/shared/skills/lab/stages/run.md +3 -0
- package/package-assets/shared/skills/lab/stages/spec.md +3 -0
- package/package-assets/shared/skills/lab/stages/write.md +12 -0
- package/package.json +1 -1
|
@@ -261,10 +261,102 @@ INTERNAL_CONFIG_LABEL_PATTERN = re.compile(
|
|
|
261
261
|
r"\b[a-z]{1,4}\d+(?:[-_][a-z]?\d+(?:\.\d+)?){1,4}\b",
|
|
262
262
|
flags=re.IGNORECASE,
|
|
263
263
|
)
|
|
264
|
+
# Matches a LaTeX sectioning command whose entire title is "Insight(s)" or
# "Our Insight(s)" (first alternative) or "洞见" / "核心洞见" (second alternative).
# `(?:sub)*` accepts \section, \subsection, \subsubsection, ...; `\*?` also
# accepts the starred form of each command.
ISOLATED_INSIGHT_HEADING_PATTERN = re.compile(
    r"\\(?:sub)*section\*?\s*\{\s*(?:our\s+)?insights?\s*\}|"
    r"\\(?:sub)*section\*?\s*\{\s*(?:核心)?洞见\s*\}",
    flags=re.IGNORECASE,
)
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
def has_insight_contrast(text: str) -> bool:
    """Report whether *text* contains a cue phrase for an insight contrast.

    The cues are English and Chinese phrases such as "counterintuitively",
    "common assumption" or "我们发现". Matching is delegated to
    ``contains_any``, so its matching rules apply unchanged.
    """
    contrast_cues = (
        "counterintuitively",
        "we observe",
        "we find",
        "a closer inspection",
        "while it might seem",
        "existing work assumes",
        "common assumption",
        "suggesting that",
        "reveals that",
        "反直觉",
        "我们观察到",
        "我们发现",
        "进一步检查",
        "通常假设",
        "常规认知",
        "这表明",
        "揭示",
    )
    return contains_any(text, contrast_cues)
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
def has_mechanism_explanation(text: str) -> bool:
    """Report whether *text* contains a cue phrase for a mechanism or why-explanation.

    The cues are English and Chinese causal/explanatory phrases such as
    "because", "attributed to" or "因为". Matching is delegated to
    ``contains_any``, so its matching rules apply unchanged.
    """
    mechanism_cues = (
        "because",
        "mechanism",
        "attributed to",
        "explains",
        "to capture",
        "to model",
        "based on",
        "therefore",
        "causal",
        "causes",
        "amplifies",
        "propagates",
        "因为",
        "机制",
        "归因于",
        "解释",
        "为了刻画",
        "为了建模",
        "基于",
        "因此",
        "因果",
        "放大",
        "传播",
    )
    return contains_any(text, mechanism_cues)
|
|
325
|
+
|
|
326
|
+
|
|
327
|
+
def has_diagnostic_interpretation(text: str) -> bool:
    """Report whether *text* contains a cue phrase for diagnostic interpretation.

    The cues are English and Chinese phrases that tie a result to a
    hypothesis or mechanism, such as "ablation shows", "rather than" or
    "消融表明". Matching is delegated to ``contains_any``, so its matching
    rules apply unchanged.
    """
    diagnostic_cues = (
        "supports our hypothesis",
        "supports the hypothesis",
        "diagnostic",
        "mechanism",
        "ablation shows",
        "ablation indicates",
        "this suggests",
        "rather than",
        "not merely",
        "we attribute",
        "验证了",
        "支持假设",
        "诊断",
        "机制",
        "消融表明",
        "这表明",
        "并非只是",
        "不是单纯",
        "归因于",
    )
    return contains_any(text, diagnostic_cues)
|
|
264
352
|
|
|
265
353
|
|
|
266
354
|
def check_common_section_gate_risks(text: str, issues: list[str]):
|
|
267
355
|
prose_text = strip_latex_commands(text)
|
|
356
|
+
if ISOLATED_INSIGHT_HEADING_PATTERN.search(text):
|
|
357
|
+
issues.append(
|
|
358
|
+
"paper prose should not isolate insight under an 'Our Insights' style heading; weave the insight into the section's motivation, mechanism, evidence, and boundary"
|
|
359
|
+
)
|
|
268
360
|
if contains_any(prose_text, SERVICE_STYLE_PHRASES):
|
|
269
361
|
issues.append(
|
|
270
362
|
"service-style or AI-assistant meta language appears in reader-facing prose; rewrite it as academic manuscript text"
|
|
@@ -530,6 +622,8 @@ def check_abstract(text: str, issues: list[str]):
|
|
|
530
622
|
issues.append("abstract should state the core challenge or gap")
|
|
531
623
|
if not contains_any(text, ("boundary", "bounded", "limitation", "however", "but", "局限", "边界", "限制")):
|
|
532
624
|
issues.append("abstract should include a bounded result or explicit limitation")
|
|
625
|
+
if not (has_insight_contrast(text) or has_mechanism_explanation(text)):
|
|
626
|
+
issues.append("abstract should include a one-sentence insight or mechanism, not only task, method, and metric results")
|
|
533
627
|
|
|
534
628
|
|
|
535
629
|
def check_introduction(text: str, issues: list[str]):
|
|
@@ -568,6 +662,10 @@ def check_introduction(text: str, issues: list[str]):
|
|
|
568
662
|
),
|
|
569
663
|
):
|
|
570
664
|
issues.append("introduction should explain what is missing in prior work")
|
|
665
|
+
if not has_insight_contrast(text):
|
|
666
|
+
issues.append("introduction should create an insight contrast, such as common assumption versus observed mechanism")
|
|
667
|
+
if not has_mechanism_explanation(text):
|
|
668
|
+
issues.append("introduction should state the root mechanism or explanation behind the gap")
|
|
571
669
|
|
|
572
670
|
|
|
573
671
|
def check_related_work(text: str, issues: list[str]):
|
|
@@ -592,6 +690,8 @@ def check_method(text: str, issues: list[str]):
|
|
|
592
690
|
issues.append("method should explain the concrete design")
|
|
593
691
|
if not contains_any(text, ("advantage", "benefit", "improves", "优势", "收益")):
|
|
594
692
|
issues.append("method should explain the technical advantage")
|
|
693
|
+
if not has_mechanism_explanation(text):
|
|
694
|
+
issues.append("method should present design choices as consequences of the core insight or mechanism, not as an API-style module list")
|
|
595
695
|
|
|
596
696
|
|
|
597
697
|
def check_experiments(text: str, issues: list[str]):
|
|
@@ -621,6 +721,8 @@ def check_experiments(text: str, issues: list[str]):
|
|
|
621
721
|
),
|
|
622
722
|
):
|
|
623
723
|
issues.append("experiments should include benchmark scene notes")
|
|
724
|
+
if not has_diagnostic_interpretation(text):
|
|
725
|
+
issues.append("experiments should interpret results as diagnostic evidence for the paper's mechanism or insight, not only restate table values")
|
|
624
726
|
|
|
625
727
|
|
|
626
728
|
def check_conclusion(text: str, issues: list[str]):
|
|
@@ -628,6 +730,23 @@ def check_conclusion(text: str, issues: list[str]):
|
|
|
628
730
|
issues.append("conclusion should state at least one limitation or boundary")
|
|
629
731
|
if not contains_any(text, ("future work", "next step", "future direction", "下一步", "未来工作")):
|
|
630
732
|
issues.append("conclusion should state one next step or future direction")
|
|
733
|
+
if not contains_any(
|
|
734
|
+
text,
|
|
735
|
+
(
|
|
736
|
+
"broader principle",
|
|
737
|
+
"suggests",
|
|
738
|
+
"indicates",
|
|
739
|
+
"implies",
|
|
740
|
+
"takeaway",
|
|
741
|
+
"principle",
|
|
742
|
+
"更广泛",
|
|
743
|
+
"表明",
|
|
744
|
+
"意味着",
|
|
745
|
+
"启示",
|
|
746
|
+
"原则",
|
|
747
|
+
),
|
|
748
|
+
):
|
|
749
|
+
issues.append("conclusion should state the broader principle or takeaway implied by the evidence")
|
|
631
750
|
|
|
632
751
|
|
|
633
752
|
SECTION_CHECKS = {
|
|
@@ -8,11 +8,14 @@ from pathlib import Path
|
|
|
8
8
|
# Sections every stage report must contain. Each value lists the accepted
# heading regexes for that section: the English heading first, then the
# Chinese one.
REQUIRED_SECTIONS = {
    "Rule Preflight": [r"^##\s+Rule Preflight\s*$", r"^##\s+规则预检\s*$"],
    "Stage Identity": [r"^##\s+Stage Identity\s*$", r"^##\s+阶段身份\s*$"],
    "Requested Outcome Mapping": [r"^##\s+Requested Outcome Mapping\s*$", r"^##\s+请求结果映射\s*$"],
    "Core Explanation Table": [r"^##\s+Core Explanation Table\s*$", r"^##\s+核心说明表\s*$"],
    "Evidence And Artifacts": [r"^##\s+Evidence And Artifacts\s*$", r"^##\s+证据与工件\s*$"],
    "Next Action": [r"^##\s+Next Action\s*$", r"^##\s+下一步动作\s*$"],
}

# Heading regexes for the Repair Control section. It is kept outside
# REQUIRED_SECTIONS: validate_repair_control only reports it as missing when
# the expected stage is "auto".
REPAIR_CONTROL_SECTION = [r"^##\s+Repair Control\s*$", r"^##\s+修复控制\s*$"]
|
|
18
|
+
|
|
16
19
|
REQUIRED_CORE_ROWS = {
|
|
17
20
|
"stage": ("这是什么阶段", "what stage is this", "stage"),
|
|
18
21
|
"background": ("背景是什么", "background"),
|
|
@@ -86,6 +89,148 @@ WHY_MARKERS = (
|
|
|
86
89
|
"避免",
|
|
87
90
|
)
|
|
88
91
|
|
|
92
|
+
# Substrings that improvement_is_needed treats as "this stage still needs work".
# They are compared against normalize() output (presumably lowercased — confirm
# against normalize).
IMPROVEMENT_NEEDED_MARKERS = (
    "need improvement",
    "needs improvement",
    "needs revision",
    "must improve",
    "should improve",
    "needs repair",
    "needs rerun",
    "需要改进",
    "需要修复",
    "需要重跑",
    "需要继续",
    "仍需",
)

# Negations checked BEFORE IMPROVEMENT_NEEDED_MARKERS in improvement_is_needed.
# Several of them contain a positive marker as a substring (for example
# "does not need improvement" contains "need improvement", and "不需要改进"
# contains "需要改进"), so they must win.
NO_IMPROVEMENT_MARKERS = (
    "no improvement needed",
    "does not need improvement",
    "not need improvement",
    "无需改进",
    "不需要改进",
    "不需改进",
)

# Literal "decision: stop" spellings (English/Chinese key and value, with and
# without a space after the colon) that next_action_is_stop looks for.
STOP_DECISION_MARKERS = (
    "decision: stop",
    "decision:stop",
    "decision: 停止",
    "decision:停止",
    "决策: stop",
    "决策:stop",
    "决策: 停止",
    "决策:停止",
)

# Substrings that has_terminal_boundary accepts as an explicit reason why
# stopping is legitimate (budget, safety, frozen core, approval, ...).
TERMINAL_BOUNDARY_MARKERS = (
    "budget exhausted",
    "budget boundary",
    "exceeded budget",
    "fatal",
    "safety",
    "invalid artifact",
    "invalid metric",
    "frozen core",
    "outside the envelope",
    "outside approved envelope",
    "user requested stop",
    "approval required",
    "requires approval",
    "escalation boundary",
    "impossible",
    "not allowed",
    "terminal boundary",
    "integrity",
    "ethics",
    "预算耗尽",
    "预算边界",
    "超过预算",
    "致命",
    "安全",
    "无效工件",
    "无效指标",
    "冻结核心",
    "超出边界",
    "超出已批准",
    "用户要求停止",
    "需要批准",
    "升级边界",
    "不可能",
    "不允许",
    "终止边界",
    "诚信",
    "伦理",
)

# Field labels validate_repair_control requires inside a Repair Control
# section. Each entry is (English label, Chinese label); either spelling
# satisfies the check, and the English one is used in issue messages.
REPAIR_CONTROL_FIELDS = (
    ("Repair budget:", "修复预算:"),
    ("Repair attempts used:", "已用修复次数:"),
    ("Current failure class:", "当前失败类型:"),
    ("Repair hypothesis:", "修复假设:"),
    ("Evidence-changing knobs changed:", "改变证据解释的旋钮:"),
    ("Ordinary engineering fixes allowed:", "允许的普通工程修复:"),
    ("Frozen core unchanged:", "冻结核心不变:"),
    ("Forbidden repairs avoided:", "已避免的禁用修复:"),
    ("Confirmation check:", "确认验证:"),
)

# Regexes for repairs that are never acceptable (changing metrics, targets,
# labels, test split, claims, ... or dropping hard cases). They are searched
# case-insensitively in the Repair Control section body by
# validate_repair_control.
FORBIDDEN_REPAIR_PATTERNS = (
    r"\b(changed|modified|relaxed|lowered|loosened|rewrote)\s+(the\s+)?(primary\s+)?metric\b",
    r"\b(changed|modified|relaxed|lowered|loosened)\s+(the\s+)?(target|threshold|target\s+range)\b",
    r"\b(drop|dropped|delete|deleted|remove|removed|exclude|excluded)\s+(hard|failed|bad)\s+(cases|examples|samples)\b",
    r"\b(changed|modified|rewrote)\s+(labels?|ground\s+truth)\b",
    r"\b(changed|modified|swapped|replaced)\s+(the\s+)?(final\s+)?test\s+split\b",
    r"\b(changed|modified|expanded)\s+(the\s+)?(paper-facing\s+)?claim\b",
    r"\b(changed|modified|switched)\s+(the\s+)?(threat\s+model|reviewer\s+profile|dataset\s+scope)\b",
    r"修改(主)?指标",
    r"放宽(目标|阈值|目标区间)",
    r"(删除|移除|剔除)(困难|失败|坏)(样本|案例)",
    r"修改(标签|真值|测试集|主张|威胁模型|数据集范围)",
)

# Substrings that, found anywhere in the normalized report text, make
# validate_repair_control treat the report as claiming a successful repair or
# a planned promotion.
REPAIR_SUCCESS_MARKERS = (
    "repair passed",
    "repair succeeded",
    "passed after repair",
    "promotion is planned",
    "promote",
    "promotion",
    "修复通过",
    "修复成功",
    "准备推广",
    "推广",
)

# Substrings in the "Confirmation check" value that mean no confirmation was
# done. Matching is by substring, so short entries such as "none" and "无"
# also match inside longer words.
NO_CONFIRMATION_MARKERS = (
    "not needed",
    "not required",
    "none",
    "n/a",
    "无需",
    "不需要",
    "无",
)

# Substrings in the "Confirmation check" value that count as naming an actual
# confirmation (holdout, control, new seed, rerun, ...).
CONFIRMATION_MARKERS = (
    "confirmation",
    "confirm",
    "holdout",
    "control",
    "new seed",
    "seed",
    "batch",
    "rerun",
    "复验",
    "确认",
    "留出",
    "对照",
    "新 seed",
    "批次",
    "重跑",
)
|
|
233
|
+
|
|
89
234
|
|
|
90
235
|
def parse_args():
|
|
91
236
|
parser = argparse.ArgumentParser(description="Validate a plain-language lab stage report.")
|
|
@@ -162,6 +307,16 @@ def has_marker_with_value(body: str, markers: tuple[str, ...]) -> bool:
|
|
|
162
307
|
return False
|
|
163
308
|
|
|
164
309
|
|
|
310
|
+
def marker_value(body: str, markers: tuple[str, ...]) -> str:
    """Return the text that follows the first marker found in *body*.

    Lines are scanned top to bottom and, within a line, markers are tried in
    the order given. The first hit wins even when nothing follows the marker.

    Args:
        body: Section text to scan, one field per line.
        markers: Alternative labels for the same field.

    Returns:
        The stripped remainder of the matching line after the marker, or ""
        when no line contains any marker.
    """
    for raw_line in body.splitlines():
        candidate = raw_line.strip()
        for label in markers:
            # partition() splits on the first occurrence only.
            _, found, remainder = candidate.partition(label)
            if found:
                return remainder.strip()
    return ""
|
|
318
|
+
|
|
319
|
+
|
|
165
320
|
def is_shallow(value: str | None) -> bool:
|
|
166
321
|
if value is None:
|
|
167
322
|
return True
|
|
@@ -174,6 +329,32 @@ def has_why(value: str) -> bool:
|
|
|
174
329
|
return any(marker in lowered for marker in WHY_MARKERS)
|
|
175
330
|
|
|
176
331
|
|
|
332
|
+
def improvement_is_needed(value: str | None) -> bool:
    """Decide whether a core-table answer says the stage still needs improvement.

    Negations are checked first because several of them contain a positive
    marker as a substring (e.g. "does not need improvement").

    Args:
        value: The answer text; ``None`` is treated as an empty string.

    Returns:
        ``False`` when any NO_IMPROVEMENT_MARKERS entry occurs in the
        normalized text, otherwise whether any IMPROVEMENT_NEEDED_MARKERS
        entry occurs in it.
    """
    haystack = normalize(value or "")
    for negation in NO_IMPROVEMENT_MARKERS:
        if negation in haystack:
            return False
    for cue in IMPROVEMENT_NEEDED_MARKERS:
        if cue in haystack:
            return True
    return False
|
|
337
|
+
|
|
338
|
+
|
|
339
|
+
# Bullet line of the form "- Decision: stop" / "- 决策：停止", tolerant of extra
# whitespace around the key and the colon.
#   * `[:：]` accepts both the ASCII and the full-width colon.
#   * `\b` is applied to "stop" only: CJK characters are word characters, so a
#     boundary after "停止" would reject "停止并升级" or "停止因为...".
STOP_DECISION_LINE_PATTERN = re.compile(
    r"^\s*-\s*(decision|决策)\s*[:：]\s*(stop\b|停止)",
    flags=re.IGNORECASE | re.MULTILINE,
)


def next_action_is_stop(body: str) -> bool:
    """Report whether a Next Action section chooses the "stop" decision.

    Two detectors are combined: a substring search for the literal spellings
    in STOP_DECISION_MARKERS on the normalized text, and a regex for a bullet
    line ("- Decision: stop") on the raw text that tolerates flexible spacing
    and the full-width colon.

    Args:
        body: Text of the Next Action section.

    Returns:
        ``True`` when either detector finds a stop decision.
    """
    lowered = normalize(body)
    if any(marker in lowered for marker in STOP_DECISION_MARKERS):
        return True
    return STOP_DECISION_LINE_PATTERN.search(body) is not None
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
def has_terminal_boundary(value: str) -> bool:
    """Report whether *value* names a terminal boundary that justifies stopping.

    Returns ``True`` when the normalized text contains at least one entry of
    TERMINAL_BOUNDARY_MARKERS as a substring.
    """
    haystack = normalize(value)
    for boundary in TERMINAL_BOUNDARY_MARKERS:
        if boundary in haystack:
            return True
    return False
|
|
349
|
+
|
|
350
|
+
|
|
351
|
+
def parse_repair_attempts(value: str) -> int | None:
|
|
352
|
+
match = re.search(r"\d+", value or "")
|
|
353
|
+
if not match:
|
|
354
|
+
return None
|
|
355
|
+
return int(match.group(0))
|
|
356
|
+
|
|
357
|
+
|
|
177
358
|
def validate_core_table(text: str) -> list[str]:
|
|
178
359
|
issues = []
|
|
179
360
|
rows = parse_core_table_rows(text)
|
|
@@ -217,6 +398,63 @@ def validate_evidence_section(text: str) -> list[str]:
|
|
|
217
398
|
return issues
|
|
218
399
|
|
|
219
400
|
|
|
401
|
+
def validate_requested_outcome_mapping(text: str) -> list[str]:
    """Check the Requested Outcome Mapping section of a stage report.

    The section must be non-empty and must contain each of the four fields
    (English or Chinese label) with a value, as judged by
    ``has_marker_with_value``.

    Args:
        text: Full stage-report text.

    Returns:
        A list of issue messages; empty when the section is complete.
    """
    section_body = extract_section(text, REQUIRED_SECTIONS["Requested Outcome Mapping"])
    if not section_body:
        return ["Requested Outcome Mapping section is empty"]

    # (English label, Chinese label) per required field.
    required_fields = (
        ("Original request:", "原始请求:"),
        ("Requested deliverables:", "请求交付物:"),
        ("Completion mapping:", "完成映射:"),
        ("Response shape:", "回答形态:"),
    )
    problems = []
    for labels in required_fields:
        if all(label not in section_body for label in labels):
            problems.append(f"Requested Outcome Mapping is missing '{labels[0]}'")
        elif not has_marker_with_value(section_body, labels):
            problems.append(f"Requested Outcome Mapping field '{labels[0]}' must have a non-empty value")
    return problems
|
|
419
|
+
|
|
420
|
+
|
|
421
|
+
def validate_repair_control(text: str, expected_stage: str) -> list[str]:
    """Check the Repair Control section of a stage report.

    A missing or empty section is an error only for the "auto" stage. When the
    section is present, three checks run regardless of stage:

    1. every REPAIR_CONTROL_FIELDS entry is present and has a value;
    2. no FORBIDDEN_REPAIR_PATTERNS regex matches the section body;
    3. if at least one repair attempt was used and the report text contains a
       REPAIR_SUCCESS_MARKERS entry, the "Confirmation check" value must name
       a confirmation and must not decline one.

    Args:
        text: Full stage-report text.
        expected_stage: Stage name the report is validated against.

    Returns:
        A list of issue messages; empty when nothing is wrong.
    """
    section_body = extract_section(text, REPAIR_CONTROL_SECTION)
    auto_stage = expected_stage.lower() == "auto"
    if not section_body:
        if auto_stage:
            return ["Repair Control section is required for auto stage reports"]
        return []

    problems = []
    for labels in REPAIR_CONTROL_FIELDS:
        if all(label not in section_body for label in labels):
            problems.append(f"Repair Control is missing '{labels[0]}'")
        elif not has_marker_with_value(section_body, labels):
            problems.append(f"Repair Control field '{labels[0]}' must have a non-empty value")

    problems.extend(
        f"Repair Control contains forbidden repair: {pattern}"
        for pattern in FORBIDDEN_REPAIR_PATTERNS
        if re.search(pattern, section_body, flags=re.IGNORECASE)
    )

    attempts_used = parse_repair_attempts(
        marker_value(section_body, ("Repair attempts used:", "已用修复次数:"))
    )
    confirmation_value = marker_value(section_body, ("Confirmation check:", "确认验证:"))
    # Success claims are looked for in the whole report, not only this section.
    report_text = normalize(text)
    success_claimed = any(marker in report_text for marker in REPAIR_SUCCESS_MARKERS)
    if attempts_used is not None and attempts_used > 0 and success_claimed:
        confirmation_text = normalize(confirmation_value)
        declines_check = any(marker in confirmation_text for marker in NO_CONFIRMATION_MARKERS)
        names_check = any(marker in confirmation_text for marker in CONFIRMATION_MARKERS)
        if declines_check or not names_check:
            problems.append(
                "Repair Control requires a confirmation check after a successful repair before promotion or final success"
            )

    return problems
|
|
456
|
+
|
|
457
|
+
|
|
220
458
|
def validate_rule_preflight(text: str) -> list[str]:
|
|
221
459
|
body = extract_section(text, REQUIRED_SECTIONS["Rule Preflight"])
|
|
222
460
|
marker_groups = (
|
|
@@ -248,6 +486,12 @@ def validate_next_action(text: str) -> list[str]:
|
|
|
248
486
|
return ["Next Action must choose continue, stop, revise, rerun, escalate, or handoff"]
|
|
249
487
|
if not has_why(body):
|
|
250
488
|
return ["Next Action must include why the next step is appropriate"]
|
|
489
|
+
rows = parse_core_table_rows(text)
|
|
490
|
+
improve_value = find_row_value(rows, REQUIRED_CORE_ROWS["improve_why"]) or ""
|
|
491
|
+
if improvement_is_needed(improve_value) and next_action_is_stop(body) and not has_terminal_boundary(body):
|
|
492
|
+
return [
|
|
493
|
+
"Next Action cannot stop after a recoverable improvement need without an explicit terminal boundary; choose continue, revise, rerun, or escalate, or state the budget/frozen-core/safety boundary"
|
|
494
|
+
]
|
|
251
495
|
return []
|
|
252
496
|
|
|
253
497
|
|
|
@@ -279,6 +523,8 @@ def validate(path: Path, expected_stage: str = "") -> list[str]:
|
|
|
279
523
|
if not missing_sections:
|
|
280
524
|
issues.extend(validate_rule_preflight(text))
|
|
281
525
|
issues.extend(validate_stage_identity(text, expected_stage))
|
|
526
|
+
issues.extend(validate_requested_outcome_mapping(text))
|
|
527
|
+
issues.extend(validate_repair_control(text, expected_stage))
|
|
282
528
|
issues.extend(validate_core_table(text))
|
|
283
529
|
issues.extend(validate_evidence_section(text))
|
|
284
530
|
issues.extend(validate_next_action(text))
|
|
@@ -9,6 +9,9 @@
|
|
|
9
9
|
## Reader Summary
|
|
10
10
|
|
|
11
11
|
- One-sentence conclusion:
|
|
12
|
+
- Core insight:
|
|
13
|
+
- Evidence that supports the insight:
|
|
14
|
+
- Decision or action implication:
|
|
12
15
|
- What is validated:
|
|
13
16
|
- What is still unproven:
|
|
14
17
|
- Biggest reporting risk:
|
|
@@ -35,6 +38,8 @@
|
|
|
35
38
|
|
|
36
39
|
- Approved method name:
|
|
37
40
|
- Plain-language method summary:
|
|
41
|
+
- Mechanism tested or explained:
|
|
42
|
+
- Why the design follows from the insight:
|
|
38
43
|
- What this method changes relative to prior work:
|
|
39
44
|
- Most relevant prior work or baseline anchors:
|
|
40
45
|
- What those prior methods do:
|
|
@@ -115,10 +120,16 @@
|
|
|
115
120
|
|
|
116
121
|
Summarize validated iteration outcomes.
|
|
117
122
|
|
|
123
|
+
- Diagnostic interpretation:
|
|
124
|
+
- What this teaches beyond the raw numbers:
|
|
125
|
+
|
|
118
126
|
## Ablations
|
|
119
127
|
|
|
120
128
|
Describe meaningful ablations and what they showed.
|
|
121
129
|
|
|
130
|
+
- Mechanism tested:
|
|
131
|
+
- What the ablation teaches beyond the delta:
|
|
132
|
+
|
|
122
133
|
## Failures
|
|
123
134
|
|
|
124
135
|
Preserve failed runs and rejected ideas.
|
|
@@ -159,6 +159,24 @@ Suggested levels:
|
|
|
159
159
|
- Expected advantage:
|
|
160
160
|
- Evidence needed to prove the advantage:
|
|
161
161
|
|
|
162
|
+
## Contribution vs Insight
|
|
163
|
+
|
|
164
|
+
- Contribution:
|
|
165
|
+
- Insight:
|
|
166
|
+
- Core insight anchor sentence:
|
|
167
|
+
- Why the insight matters beyond the artifact:
|
|
168
|
+
- Action or community value:
|
|
169
|
+
|
|
170
|
+
## Insight Evidence Chain
|
|
171
|
+
|
|
172
|
+
- Observation:
|
|
173
|
+
- Why existing explanations fail:
|
|
174
|
+
- Core insight:
|
|
175
|
+
- Mechanism:
|
|
176
|
+
- Validation tests:
|
|
177
|
+
- Generalization or action implication:
|
|
178
|
+
- Prediction:
|
|
179
|
+
|
|
162
180
|
## Rough Approach
|
|
163
181
|
|
|
164
182
|
- Plain-language description of how this would work:
|
|
@@ -36,27 +36,33 @@
|
|
|
36
36
|
- Table 2 is for:
|
|
37
37
|
- Table 3 is for:
|
|
38
38
|
- Table 4 is for:
|
|
39
|
+
- Diagnostic takeaway:
|
|
40
|
+
- What the tables do not prove:
|
|
39
41
|
|
|
40
42
|
## Table 1
|
|
41
43
|
|
|
42
44
|
- Purpose:
|
|
43
45
|
- Metrics used:
|
|
44
46
|
- Strongest supported claim:
|
|
47
|
+
- Mechanism or insight tested:
|
|
45
48
|
|
|
46
49
|
## Table 2
|
|
47
50
|
|
|
48
51
|
- Purpose:
|
|
49
52
|
- Metrics used:
|
|
50
53
|
- Strongest supported claim:
|
|
54
|
+
- Mechanism or insight tested:
|
|
51
55
|
|
|
52
56
|
## Table 3
|
|
53
57
|
|
|
54
58
|
- Purpose:
|
|
55
59
|
- Metrics used:
|
|
56
60
|
- Strongest supported claim:
|
|
61
|
+
- Mechanism or insight tested:
|
|
57
62
|
|
|
58
63
|
## Table 4
|
|
59
64
|
|
|
60
65
|
- Purpose:
|
|
61
66
|
- Metrics used:
|
|
62
67
|
- Strongest supported claim:
|
|
68
|
+
- Mechanism or insight tested:
|
|
@@ -5,6 +5,7 @@
|
|
|
5
5
|
- Venue or audience:
|
|
6
6
|
- Paper status:
|
|
7
7
|
- Core story in one sentence:
|
|
8
|
+
- Core insight anchor:
|
|
8
9
|
- Approved framing artifact:
|
|
9
10
|
- Terminology lock:
|
|
10
11
|
|
|
@@ -24,6 +25,14 @@
|
|
|
24
25
|
- Limitation sources:
|
|
25
26
|
- Claims that still need more evidence:
|
|
26
27
|
|
|
28
|
+
## Insight Integration Map
|
|
29
|
+
|
|
30
|
+
- Introduction contrast:
|
|
31
|
+
- Method design consequence:
|
|
32
|
+
- Experiments diagnostic evidence:
|
|
33
|
+
- Conclusion principle or action implication:
|
|
34
|
+
- Alternative explanation to address:
|
|
35
|
+
|
|
27
36
|
## Asset Coverage Targets
|
|
28
37
|
|
|
29
38
|
- Core asset floor:
|
|
@@ -20,6 +20,25 @@
|
|
|
20
20
|
- Primary artifact:
|
|
21
21
|
- Next owner:
|
|
22
22
|
|
|
23
|
+
## Requested Outcome Mapping
|
|
24
|
+
|
|
25
|
+
- Original request:
|
|
26
|
+
- Requested deliverables:
|
|
27
|
+
- Completion mapping:
|
|
28
|
+
- Response shape:
|
|
29
|
+
|
|
30
|
+
## Repair Control
|
|
31
|
+
|
|
32
|
+
- Repair budget:
|
|
33
|
+
- Repair attempts used:
|
|
34
|
+
- Current failure class:
|
|
35
|
+
- Repair hypothesis:
|
|
36
|
+
- Evidence-changing knobs changed:
|
|
37
|
+
- Ordinary engineering fixes allowed:
|
|
38
|
+
- Frozen core unchanged:
|
|
39
|
+
- Forbidden repairs avoided:
|
|
40
|
+
- Confirmation check:
|
|
41
|
+
|
|
23
42
|
## Core Explanation Table
|
|
24
43
|
|
|
25
44
|
| Question | Plain Answer |
|
|
@@ -38,6 +38,19 @@
|
|
|
38
38
|
- Terminology consistency:
|
|
39
39
|
- Five-dimension self-review outcome:
|
|
40
40
|
|
|
41
|
+
## Insight Integration
|
|
42
|
+
|
|
43
|
+
- Core insight anchor used:
|
|
44
|
+
- Section role in the insight chain:
|
|
45
|
+
- Common assumption or surface explanation challenged:
|
|
46
|
+
- Mechanism or why-explanation added:
|
|
47
|
+
- Evidence or diagnostic result tied to the insight:
|
|
48
|
+
- Did the prose avoid an isolated `Our Insights`-style section:
|
|
49
|
+
- If the section is Introduction, what cognitive contrast was established:
|
|
50
|
+
- If the section is Method, which design choice follows from the insight:
|
|
51
|
+
- If the section is Experiments, which mechanism did the result or ablation diagnose:
|
|
52
|
+
- If the section is Conclusion, what broader principle or action implication was stated:
|
|
53
|
+
|
|
41
54
|
## Terminology Clarity
|
|
42
55
|
|
|
43
56
|
- Key terms introduced or revised this round:
|
|
@@ -43,7 +43,10 @@ Use this skill when the user invokes `/lab:*` or asks for the structured researc
|
|
|
43
43
|
- Treat missing, stale, or contradictory `Rule Preflight` data as a stage-contract failure.
|
|
44
44
|
- Project-installed rules take priority over model memory. If remembered patterns conflict with the installed rule source, follow the installed source recorded in `.lab/.managed/rule-manifest.json`.
|
|
45
45
|
- Before a `/lab:*` stage reaches a final handoff, write or update one plain-language stage report under `.lab/stage-reports/` from `.lab/.managed/templates/stage-report.md`.
|
|
46
|
+
- The stage report must include `Requested Outcome Mapping`: the user's original request, requested deliverables, completion status for each requested deliverable, and the response shape the user should see.
|
|
47
|
+
- The stage report must include `Repair Control`. For non-repair stages, keep every field of the section and give each one an explicit not-applicable value, because the validator checks all fields whenever the section is present; for auto repair, record budget, attempts used, failure class, repair hypothesis, evidence-changing knobs, ordinary engineering fixes that remain allowed, unchanged frozen core, forbidden repairs avoided, and confirmation check.
|
|
46
48
|
- The stage report must include a filled `Core Explanation Table` that answers, in workflow language and plain language: background, why now, what was done, how it was done, what worked, what did not work, what was verified, what remains unverified, whether improvement is needed and why, how to improve and why, key evidence, and the continue/stop/revise/rerun/escalate/handoff decision.
|
|
49
|
+
- If the stage says improvement is needed, do not choose `stop` unless the next action states a concrete terminal boundary such as budget exhaustion, frozen-core risk, safety or integrity failure, impossible target, or a required approval boundary. Otherwise choose `continue`, `revise`, `rerun`, or `escalate`.
|
|
47
50
|
- Stage reports are closeout and handoff artifacts, not a new user command and not a replacement for stage-specific artifacts such as idea memos, iteration reports, final reports, or write-iteration records.
|
|
48
51
|
- Run `.lab/.managed/scripts/validate_stage_report.py --stage-report <stage-report> --stage <stage>` before claiming the stage is complete, and include the stage-report path plus validation result in the final user-facing summary.
|
|
49
52
|
- Final paper output should default to LaTeX, and its manuscript language should be decided separately from the workflow language.
|
|
@@ -73,6 +76,9 @@ Use this skill when the user invokes `/lab:*` or asks for the structured researc
|
|
|
73
76
|
- Ask one clarifying question at a time when critical ambiguity remains.
|
|
74
77
|
- State the scenario, the problem, the failure case, and why the problem matters before proposing solutions.
|
|
75
78
|
- Classify the idea by contribution category and breakthrough level.
|
|
79
|
+
- Separate contribution from insight. Contribution is what the work adds; insight is what the work teaches and why it should matter beyond the artifact.
|
|
80
|
+
- Write one reusable core insight anchor sentence so later write and report stages can keep a stable story.
|
|
81
|
+
- Require an insight evidence chain before final recommendation: observation, why existing explanations fail, core insight, mechanism, validation tests, generalization or action implication, and prediction.
|
|
76
82
|
- Compare against existing methods explicitly and state why the idea should be better.
|
|
77
83
|
- Include a closest-prior-work comparison and a plain-language description of how the proposed direction would work.
|
|
78
84
|
- In the final user-facing summary, state what current methods do, why they still fall short, how the proposed direction differs, the rough approach, the main risk, and where to read `.lab/writing/idea.md` plus `.lab/writing/idea-source-log.md`.
|
|
@@ -151,6 +157,14 @@ Use this skill when the user invokes `/lab:*` or asks for the structured researc
|
|
|
151
157
|
- Poll long-running commands until they complete, time out, or hit a stop condition.
|
|
152
158
|
- Update `.lab/context/auto-status.md`, `.lab/context/workflow-state.md`, `.lab/context/decisions.md`, `.lab/context/data-decisions.md`, and `.lab/context/evidence-index.md` as the campaign advances, then refresh the derived handoff files.
|
|
153
159
|
- Keep an explicit approval gate when a proposed action would leave the frozen core defined by the auto-mode contract.
|
|
160
|
+
- A failed metric gate is not by itself a terminal stop in `L2` or `L3` when `iterate` is allowed and loop budget remains. First classify the miss as recoverable or terminal.
|
|
161
|
+
- Treat ordinary target misses, weak effects, overly strong effects, low coverage, placement or extraction mismatch, threshold mismatch, candidate-generation weakness, and no-op deltas as recoverable until a bounded repair attempt rules them out.
|
|
162
|
+
- For recoverable misses, run at least one bounded repair iteration inside the approved envelope before stopping. Generic repair knobs include intervention strength, delivery or placement, detector or scoring threshold, candidate generation, sampling, baseline alignment, extraction/parser fixes, calibration, and control checks.
|
|
163
|
+
- Ordinary engineering fixes do not count against the repair budget when they do not change evidence interpretation. Path fixes, dependency fixes, parser bugs, data-loading bugs, runner retries, logging, cache invalidation, and result serialization can be fixed directly inside the current envelope.
|
|
164
|
+
- Count evidence-changing repairs against the repair budget: changes to intervention strength, delivery semantics, scoring thresholds, sampling, candidate generation, baseline alignment, calibration, extraction behavior that changes observed evidence, or controls that change the evaluated set.
|
|
165
|
+
- Forbidden repair moves require explicit approval and cannot be used to claim success: changing the primary metric definition, relaxing target thresholds, deleting hard cases, changing labels or ground truth, switching the final test split, changing paper-facing claims, or changing threat model, reviewer profile, dataset scope, or frozen core.
|
|
166
|
+
- A repair pilot that passes is not enough for promotion or final success. Require a confirmation check such as a new seed, holdout, control batch, repeated run, or anomaly check before promotion.
|
|
167
|
+
- Stop without repair only when the report names the terminal boundary: exhausted budget, frozen-core change, approval-required scope change, safety or integrity risk, invalid metric, impossible target, or repeated failed repair attempts.
|
|
154
168
|
|
|
155
169
|
### `/lab:spec`
|
|
156
170
|
|
|
@@ -218,6 +232,8 @@ Use this skill when the user invokes `/lab:*` or asks for the structured researc
|
|
|
218
232
|
- Aggregate them with `.lab/.managed/scripts/summarize_iterations.py`.
|
|
219
233
|
- Write the final document with `.lab/.managed/templates/final-report.md`, the managed table summary with `.lab/.managed/templates/main-tables.md`, and the internal handoff with `.lab/.managed/templates/artifact-status.md`.
|
|
220
234
|
- Keep failed attempts and limitations visible.
|
|
235
|
+
- Put the report-level insight near the top: what was learned beyond the produced artifact, what evidence supports it, what action or design implication follows, and what boundary still applies.
|
|
236
|
+
- Use main tables and ablations as diagnostic evidence for the insight rather than only as containers for metric values.
|
|
221
237
|
- Update `.lab/context/mission.md`, `.lab/context/eval-protocol.md`, `.lab/context/workflow-state.md`, and `.lab/context/evidence-index.md` with report-level handoff notes, then refresh derived views.
|
|
222
238
|
- If canonical context is still skeletal, hydrate the smallest trustworthy version from frozen artifacts before finalizing the report.
|
|
223
239
|
- If collaborator-critical fields remain missing after hydration, downgrade to an `artifact-anchored interim report` instead of presenting a final collaborator-ready report.
|
|
@@ -250,6 +266,8 @@ Use this skill when the user invokes `/lab:*` or asks for the structured researc
|
|
|
250
266
|
- If a section uses canonical short names or variant labels before the section that formally defines them has been drafted, add a local naming bridge in that section and then keep those labels stable.
|
|
251
267
|
- Keep one canonical natural-language paper-facing name per concept.
|
|
252
268
|
- Once a paper-facing model or ablation label is chosen, reuse the canonical label instead of replacing it with a narrative alias in later prose, tables, or captions.
|
|
269
|
+
- Carry the same core insight anchor through the paper: Introduction creates the contrast, Method turns it into design motivation, Experiments diagnose it with evidence, and Conclusion states the broader principle and boundary.
|
|
270
|
+
- Do not create a standalone `Our Insights` section just to satisfy the core-insight requirement; dissolve the insight into motivation, mechanism, evidence, and limitations.
|
|
253
271
|
- Before drafting or polishing, check the current section block in `skills/lab/references/paper-writing/section-style-policies.md` and follow its encouraged, discouraged, and banned expression lists.
|
|
254
272
|
- When the user provides reference PDFs, paper URLs, local reference-paper paths, or asks to write by reference, stay within `/lab:write` but switch to reference-guided deep writing: extract structure, map section/subsection slots, paragraph roles, and table/figure roles to the current paper, record the mapping, and only then draft prose.
|
|
255
273
|
- The reference-consumption plan is not sufficient by itself. The current section must visibly realize the adopted structure slots through subsection or paragraph anchors, table/figure placement, local bridges, and reader-facing prose.
|