superlab 0.1.69 → 0.1.71
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/i18n.cjs +89 -0
- package/lib/install.cjs +1 -0
- package/package-assets/shared/lab/.managed/scripts/validate_collaborator_report.py +45 -1
- package/package-assets/shared/lab/.managed/scripts/validate_manuscript_delivery.py +116 -0
- package/package-assets/shared/lab/.managed/scripts/validate_section_draft.py +20 -0
- package/package-assets/shared/lab/.managed/scripts/validate_stage_report.py +301 -0
- package/package-assets/shared/lab/.managed/templates/final-report.md +6 -0
- package/package-assets/shared/lab/.managed/templates/main-tables.md +6 -0
- package/package-assets/shared/lab/.managed/templates/paper-table.tex +3 -3
- package/package-assets/shared/lab/.managed/templates/paper.tex +2 -0
- package/package-assets/shared/lab/.managed/templates/stage-report.md +52 -0
- package/package-assets/shared/lab/.managed/templates/write-iteration.md +4 -0
- package/package-assets/shared/skills/lab/SKILL.md +9 -1
- package/package-assets/shared/skills/lab/references/paper-writing/section-style-policies.md +1 -0
- package/package-assets/shared/skills/lab/stages/auto.md +6 -0
- package/package-assets/shared/skills/lab/stages/data.md +6 -0
- package/package-assets/shared/skills/lab/stages/framing.md +6 -0
- package/package-assets/shared/skills/lab/stages/idea.md +6 -0
- package/package-assets/shared/skills/lab/stages/iterate.md +6 -0
- package/package-assets/shared/skills/lab/stages/report.md +9 -0
- package/package-assets/shared/skills/lab/stages/review.md +6 -0
- package/package-assets/shared/skills/lab/stages/run.md +6 -0
- package/package-assets/shared/skills/lab/stages/spec.md +6 -0
- package/package-assets/shared/skills/lab/stages/write.md +12 -1
- package/package.json +1 -1
package/lib/i18n.cjs
CHANGED
|
@@ -887,6 +887,60 @@ const ZH_SKILL_FILES = {
|
|
|
887
887
|
- [ ] 标准化并验证评估摘要。
|
|
888
888
|
- [ ] 执行有边界的 iteration rounds。
|
|
889
889
|
- [ ] 产出 final report。
|
|
890
|
+
`,
|
|
891
|
+
[path.join(".lab", ".managed", "templates", "stage-report.md")]:
|
|
892
|
+
`# 阶段报告
|
|
893
|
+
|
|
894
|
+
## 规则预检
|
|
895
|
+
|
|
896
|
+
- Rule source file:
|
|
897
|
+
- Rule source revision:
|
|
898
|
+
- Project version:
|
|
899
|
+
- Resolved stage:
|
|
900
|
+
- Resolved mode:
|
|
901
|
+
- Resolved target:
|
|
902
|
+
- Preflight stamp:
|
|
903
|
+
- Override reason, if any:
|
|
904
|
+
|
|
905
|
+
## 阶段身份
|
|
906
|
+
|
|
907
|
+
- Stage:
|
|
908
|
+
- Target:
|
|
909
|
+
- Date:
|
|
910
|
+
- Status:
|
|
911
|
+
- Primary artifact:
|
|
912
|
+
- Next owner:
|
|
913
|
+
|
|
914
|
+
## 核心说明表
|
|
915
|
+
|
|
916
|
+
| 问题 | 白话回答 |
|
|
917
|
+
|---|---|
|
|
918
|
+
| 这是什么阶段? | |
|
|
919
|
+
| 背景是什么? | |
|
|
920
|
+
| 为什么现在要做? | |
|
|
921
|
+
| 这轮具体做了什么? | |
|
|
922
|
+
| 怎么做的? | |
|
|
923
|
+
| 结果好的地方是什么? | |
|
|
924
|
+
| 结果坏的地方是什么? | |
|
|
925
|
+
| 这验证了什么? | |
|
|
926
|
+
| 还没有验证什么? | |
|
|
927
|
+
| 是否需要改进?为什么? | |
|
|
928
|
+
| 下一步怎么改?为什么这样改? | |
|
|
929
|
+
| 关键证据在哪里? | |
|
|
930
|
+
| 现在应该继续、停止、重做还是升级? | |
|
|
931
|
+
|
|
932
|
+
## 证据与工件
|
|
933
|
+
|
|
934
|
+
- 主工件:
|
|
935
|
+
- 支撑工件:
|
|
936
|
+
- 验证命令:
|
|
937
|
+
- 已知缺口:
|
|
938
|
+
|
|
939
|
+
## 下一步动作
|
|
940
|
+
|
|
941
|
+
- 决策:continue / stop / revise / rerun / escalate / handoff
|
|
942
|
+
- 具体下一步:
|
|
943
|
+
- 为什么这样做:
|
|
890
944
|
`,
|
|
891
945
|
[path.join(".lab", ".managed", "templates", "iteration-report.md")]:
|
|
892
946
|
`# 迭代报告
|
|
@@ -3020,6 +3074,41 @@ ZH_CONTENT[path.join(".claude", "skills", "lab", "stages", "auto.md")] =
|
|
|
3020
3074
|
ZH_CONTENT[path.join(".claude", "skills", "lab", "stages", "report.md")] =
|
|
3021
3075
|
ZH_CONTENT[path.join(".codex", "skills", "lab", "stages", "report.md")];
|
|
3022
3076
|
|
|
3077
|
+
const zhStageReportCloseout = `
|
|
3078
|
+
|
|
3079
|
+
## 阶段报告收尾
|
|
3080
|
+
|
|
3081
|
+
- 阶段进入最终交接、停止、失败或升级前,必须用 \`.lab/.managed/templates/stage-report.md\` 写或更新一个 \`.lab/stage-reports/<date>--<stage>--<target>.md\`。
|
|
3082
|
+
- \`核心说明表\` 必须用白话写清背景、为什么现在做、做了什么、怎么做、好结果、坏结果、验证了什么、还没验证什么、是否需要改进及原因、下一步怎么改及原因、关键证据,以及 continue/stop/revise/rerun/escalate/handoff 决策。
|
|
3083
|
+
- 阶段报告是收尾和交接工件,不是新命令,也不能替代 idea、iteration report、final report 或 write iteration 等阶段专属工件。
|
|
3084
|
+
- 收尾前运行 \`.lab/.managed/scripts/validate_stage_report.py --stage-report <stage-report> --stage <stage>\`,并在最终给用户的总结里给出阶段报告路径和校验结果。
|
|
3085
|
+
`;
|
|
3086
|
+
|
|
3087
|
+
// Append the shared stage-report closeout section to each localized stage guide.
// A guide is left untouched when it has no localized content or when it already
// mentions the stage-report validator script.
[".codex", ".claude"].forEach((platformRoot) => {
  const stageNames = ["idea", "data", "auto", "framing", "spec", "run", "iterate", "review", "report", "write"];
  stageNames.forEach((stageName) => {
    const stageKey = path.join(platformRoot, "skills", "lab", "stages", `${stageName}.md`);
    const existingContent = ZH_CONTENT[stageKey];
    if (!existingContent || existingContent.includes("validate_stage_report.py")) {
      return;
    }
    ZH_CONTENT[stageKey] = existingContent + zhStageReportCloseout;
  });
});
|
|
3095
|
+
|
|
3096
|
+
const zhStageReportGlobalRules =
|
|
3097
|
+
"- 每个 `/lab:*` 阶段进入最终交接前,都必须在 `.lab/stage-reports/` 下写或更新一个白话阶段报告,并通过 `.lab/.managed/scripts/validate_stage_report.py` 校验。\n" +
|
|
3098
|
+
"- 阶段报告必须包含核心说明表,讲清背景、为什么做、做了什么、怎么做、好坏结果、验证与未验证边界、是否需要改进及原因、下一步怎么改及原因、关键证据和继续/停止/修订/重跑/升级/交接决策。\n" +
|
|
3099
|
+
"- 阶段报告只是统一收尾和交接层,不新增用户命令,也不替代各阶段自己的受管工件。\n";
|
|
3100
|
+
|
|
3101
|
+
// Insert the global stage-report rules into each localized SKILL.md, directly after the
// "installed rules take precedence over model memory" bullet. Files without localized
// content, or that already mention the stage-report validator, are skipped.
[".codex", ".claude"].forEach((platformRoot) => {
  const skillKey = path.join(platformRoot, "skills", "lab", "SKILL.md");
  const skillContent = ZH_CONTENT[skillKey];
  if (!skillContent || skillContent.includes("validate_stage_report.py")) {
    return;
  }
  // Anchor bullet that the new rules are placed after; when it is absent the
  // content comes back from replace() unchanged.
  const anchorBullet =
    "- 项目里已安装的规则优先于模型记忆;如果记忆里的旧做法和 `.lab/.managed/rule-manifest.json` 记录的规则冲突,以项目里安装的规则为准。\n";
  ZH_CONTENT[skillKey] = skillContent.replace(anchorBullet, anchorBullet + zhStageReportGlobalRules);
});
|
|
3111
|
+
|
|
3023
3112
|
function getLocalizedContent(relativePath, lang) {
|
|
3024
3113
|
if (lang !== "zh") {
|
|
3025
3114
|
return null;
|
package/lib/install.cjs
CHANGED
|
@@ -674,6 +674,7 @@ function localizeInstalledAssets(targetDir, lang, { newlyCreatedProjectOwnedPath
|
|
|
674
674
|
path.join(".lab", ".managed", "templates", "design.md"),
|
|
675
675
|
path.join(".lab", ".managed", "templates", "spec.md"),
|
|
676
676
|
path.join(".lab", ".managed", "templates", "tasks.md"),
|
|
677
|
+
path.join(".lab", ".managed", "templates", "stage-report.md"),
|
|
677
678
|
path.join(".lab", ".managed", "templates", "iteration-report.md"),
|
|
678
679
|
path.join(".lab", ".managed", "templates", "review-checklist.md"),
|
|
679
680
|
path.join(".lab", ".managed", "templates", "final-report.md"),
|
|
@@ -51,6 +51,21 @@ SOURCE_SECTION_PATH_MARKERS = (
|
|
|
51
51
|
SOURCE_SECTION_CITATION_MARKERS = ("Citation:", "引用:")
|
|
52
52
|
SOURCE_SECTION_ROLE_MARKERS = ("What it established:", "What it does:", "What it measures:", "做了什么:", "衡量什么:")
|
|
53
53
|
SOURCE_SECTION_LIMITATION_MARKERS = ("Limitation", "局限")
|
|
54
|
+
METRIC_GUIDE_DETAIL_MARKERS = {
|
|
55
|
+
"evaluation target": ("Evaluation target:", "What is evaluated:", "评估对象:", "评估什么:"),
|
|
56
|
+
"test-set prediction": ("Test-set prediction used:", "Prediction used:", "测试集预测:", "预测量:"),
|
|
57
|
+
"ranking or grouping": ("Ranking or grouping step:", "Ranking step:", "Grouping step:", "排序或分组:", "排序步骤:", "分组步骤:"),
|
|
58
|
+
"calculation sketch": (
|
|
59
|
+
"Aggregation / calculation sketch:",
|
|
60
|
+
"Calculation sketch:",
|
|
61
|
+
"Approximate calculation:",
|
|
62
|
+
"大致计算:",
|
|
63
|
+
"近似公式:",
|
|
64
|
+
"聚合方式:",
|
|
65
|
+
),
|
|
66
|
+
"direction and scale": ("Direction and scale:", "Metric direction:", "方向与尺度:", "方向:", "越高/越低:"),
|
|
67
|
+
"comparability boundary": ("Comparability boundary:", "What not to compare:", "可比性边界:", "不能比较:"),
|
|
68
|
+
}
|
|
54
69
|
|
|
55
70
|
|
|
56
71
|
def parse_args():
|
|
@@ -99,6 +114,35 @@ def validate_source_sections(text: str, label: str) -> list[str]:
|
|
|
99
114
|
return issues
|
|
100
115
|
|
|
101
116
|
|
|
117
|
+
def has_marker_with_value(body: str, markers: tuple[str, ...]) -> bool:
    """Report whether any line of ``body`` carries one of ``markers`` followed by real content.

    Each line is stripped, and the text after the first occurrence of a marker is
    taken as that field's value. A value counts only when it is non-empty and is
    not one of the known placeholder tokens (compared case-sensitively).

    Args:
        body: Section text to scan line by line.
        markers: Accepted field labels, e.g. English and Chinese spellings.

    Returns:
        True as soon as one marker with a usable value is found, otherwise False.
    """
    placeholder_values = {"-", "—", "TODO", "TBD", "待补", "待定"}
    for raw_line in body.splitlines():
        candidate = raw_line.strip()
        for marker in markers:
            _, found, remainder = candidate.partition(marker)
            if not found:
                continue
            field_value = remainder.strip()
            if field_value and field_value not in placeholder_values:
                return True
    return False
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def validate_metric_guide_detail(text: str, label: str) -> list[str]:
    """Check that the 'Metric Guide' section fills in every metric-computation detail.

    Args:
        text: Full report text.
        label: Name of the report, used as the prefix of the issue message.

    Returns:
        An empty list when the section body is empty or every detail in
        ``METRIC_GUIDE_DETAIL_MARKERS`` has a marker with a value; otherwise a
        single-item list naming the details that are missing.
    """
    guide_body = extract_section_body(text, REPORT_REQUIRED_SECTIONS["Metric Guide"])
    if not guide_body:
        return []

    absent_details = []
    for detail_name, markers in METRIC_GUIDE_DETAIL_MARKERS.items():
        if has_marker_with_value(guide_body, markers):
            continue
        absent_details.append(detail_name)

    if absent_details:
        detail_list = ", ".join(absent_details)
        return [f"{label} section 'Metric Guide' must explain metric computation details: {detail_list}"]
    return []
|
|
144
|
+
|
|
145
|
+
|
|
102
146
|
def validate(path_str: str, required_sections: dict[str, list[str]], label: str) -> list[str]:
|
|
103
147
|
path = Path(path_str)
|
|
104
148
|
if not path.exists():
|
|
@@ -108,7 +152,7 @@ def validate(path_str: str, required_sections: dict[str, list[str]], label: str)
|
|
|
108
152
|
if missing:
|
|
109
153
|
return [f"{label} is missing required sections: {', '.join(missing)}"]
|
|
110
154
|
if label == "report.md":
|
|
111
|
-
return validate_source_sections(text, label)
|
|
155
|
+
return validate_source_sections(text, label) + validate_metric_guide_detail(text, label)
|
|
112
156
|
return []
|
|
113
157
|
|
|
114
158
|
|
|
@@ -38,6 +38,7 @@ REQUIRED_TABLE_NOTE_MARKERS = (
|
|
|
38
38
|
"% Important caveat:",
|
|
39
39
|
)
|
|
40
40
|
WIDTH_CONTROL_NOTE_MARKER = "% Width control:"
|
|
41
|
+
WIDE_PLAIN_TABULAR_COLUMN_LIMIT = 7
|
|
41
42
|
TABLE_ABBREVIATION_EXCEPTIONS = {"TODO", "TBD"}
|
|
42
43
|
PLACEHOLDER_TABLE_NOTE_PREFIXES = (
|
|
43
44
|
"explain ",
|
|
@@ -97,6 +98,109 @@ def contains_any(text: str, needles: tuple[str, ...]) -> bool:
|
|
|
97
98
|
return any(needle.lower() in lowered for needle in needles)
|
|
98
99
|
|
|
99
100
|
|
|
101
|
+
def read_braced_group(text: str, start: int) -> tuple[str, int] | None:
|
|
102
|
+
if start >= len(text) or text[start] != "{":
|
|
103
|
+
return None
|
|
104
|
+
depth = 0
|
|
105
|
+
content_start = start + 1
|
|
106
|
+
for index in range(start, len(text)):
|
|
107
|
+
char = text[index]
|
|
108
|
+
if char == "{":
|
|
109
|
+
depth += 1
|
|
110
|
+
elif char == "}":
|
|
111
|
+
depth -= 1
|
|
112
|
+
if depth == 0:
|
|
113
|
+
return text[content_start:index], index + 1
|
|
114
|
+
return None
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def skip_whitespace(text: str, index: int) -> int:
    """Return the first position at or after ``index`` that is not whitespace.

    Args:
        text: Text being scanned.
        index: Position to start from.

    Returns:
        The index of the first non-whitespace character, ``len(text)`` when only
        whitespace remains, or ``index`` itself when it is already at or past
        the end of ``text``.
    """
    limit = len(text)
    cursor = index
    while cursor < limit:
        if not text[cursor].isspace():
            break
        cursor += 1
    return cursor
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def extract_plain_tabular_specs(text: str) -> list[str]:
    """Collect the column specification of every plain ``tabular`` environment in ``text``.

    The search needle includes the closing brace of ``\\begin{tabular}``, so
    ``tabularx`` and ``tabular*`` environments are not matched. An optional
    vertical-position argument such as ``[t]`` between ``\\begin{tabular}`` and
    the column specification is skipped, so those tables are inspected too.

    Args:
        text: LaTeX source to scan.

    Returns:
        The raw contents of each column-specification group, in document order.
        Occurrences that are not followed by a balanced braced group are ignored.
    """
    specs: list[str] = []
    needle = r"\begin{tabular}"
    search_from = 0
    while True:
        index = text.find(needle, search_from)
        if index == -1:
            return specs
        spec_start = skip_whitespace(text, index + len(needle))
        if spec_start < len(text) and text[spec_start] == "[":
            closing = text.find("]", spec_start)
            # Accept only a short, single-line [pos] argument; anything that spans
            # a line break or contains a brace is not treated as the optional argument.
            if closing != -1 and not any(ch in text[spec_start:closing] for ch in "{}\n"):
                spec_start = skip_whitespace(text, closing + 1)
        group = read_braced_group(text, spec_start)
        if group is not None:
            specs.append(group[0])
            search_from = group[1]
        else:
            search_from = index + len(needle)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def count_column_spec(spec: str) -> tuple[int, bool]:
    """Count the columns declared by a tabular column specification.

    Handling per specifier:
      * ``l``, ``c``, ``r`` each count as one column.
      * ``X`` counts as one column and marks the spec as width-aware.
      * ``p``, ``m``, ``b`` count as one column, mark the spec as width-aware,
        and their braced argument is skipped.
      * ``*{n}{inner}`` adds ``n`` times the column count of ``inner``; a
        non-integer ``n`` is treated as 1.
      * ``@``, ``!``, ``<``, ``>`` contribute no column; their braced argument
        is skipped.
      * Every other character is ignored.

    Args:
        spec: Contents of the column-specification group.

    Returns:
        ``(column_count, has_width_aware_column)``.
    """
    total = 0
    width_aware = False
    cursor = 0
    end = len(spec)

    def consume_argument(position: int) -> int:
        # Step over optional whitespace and, if present, one braced argument.
        position = skip_whitespace(spec, position)
        if position < end and spec[position] == "{":
            parsed = read_braced_group(spec, position)
            return parsed[1] if parsed is not None else position + 1
        return position

    while cursor < end:
        token = spec[cursor]
        if token in ("l", "c", "r", "X"):
            total += 1
            if token == "X":
                width_aware = True
            cursor += 1
        elif token in ("p", "m", "b"):
            total += 1
            width_aware = True
            cursor = consume_argument(cursor + 1)
        elif token == "*":
            cursor = skip_whitespace(spec, cursor + 1)
            repeat_part = read_braced_group(spec, cursor)
            if repeat_part is not None:
                repeat_text, cursor = repeat_part
                cursor = skip_whitespace(spec, cursor)
                inner_part = read_braced_group(spec, cursor)
                if inner_part is not None:
                    inner_spec, cursor = inner_part
                    try:
                        repetitions = int(repeat_text.strip())
                    except ValueError:
                        repetitions = 1
                    inner_total, inner_width_aware = count_column_spec(inner_spec)
                    total += repetitions * inner_total
                    width_aware = width_aware or inner_width_aware
        elif token in ("@", "!", "<", ">"):
            cursor = consume_argument(cursor + 1)
        else:
            cursor += 1
    return total, width_aware
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def has_width_control_command(text: str) -> bool:
    """Report whether ``text`` contains any recognised table width-control construct.

    Args:
        text: LaTeX source to inspect.

    Returns:
        True when the source contains a ``tabularx`` or ``tabular*`` environment,
        a ``\\resizebox{`` call, or a ``\\setlength{\\tabcolsep}`` adjustment.
    """
    width_control_tokens = (
        r"\begin{tabularx}",
        r"\begin{tabular*}",
        r"\resizebox{",
        r"\setlength{\tabcolsep}",
    )
    for width_token in width_control_tokens:
        if width_token in text:
            return True
    return False
|
|
202
|
+
|
|
203
|
+
|
|
100
204
|
def find_workflow_config(start_path: Path) -> Path | None:
|
|
101
205
|
search_roots = [start_path, *start_path.parents]
|
|
102
206
|
for root in search_roots:
|
|
@@ -315,6 +419,18 @@ def check_table_file(path: Path, issues: list[str], label: str):
|
|
|
315
419
|
continue
|
|
316
420
|
if value < 3.0:
|
|
317
421
|
issues.append(f"{label} sets \\tabcolsep below the safe range for paper-facing main tables")
|
|
422
|
+
for spec in extract_plain_tabular_specs(text):
|
|
423
|
+
column_count, has_width_aware_column = count_column_spec(spec)
|
|
424
|
+
if (
|
|
425
|
+
column_count >= WIDE_PLAIN_TABULAR_COLUMN_LIMIT
|
|
426
|
+
and not has_width_aware_column
|
|
427
|
+
and not has_width_control_command(text)
|
|
428
|
+
):
|
|
429
|
+
issues.append(
|
|
430
|
+
f"{label} uses a wide plain tabular layout ({column_count} columns) without a width-aware strategy; "
|
|
431
|
+
"use tabularx or p columns, split the table, move secondary metrics to appendix, "
|
|
432
|
+
"or document last-resort width control"
|
|
433
|
+
)
|
|
318
434
|
|
|
319
435
|
|
|
320
436
|
def check_figure_file(path: Path, issues: list[str], label: str):
|
|
@@ -241,6 +241,22 @@ INTERNAL_EXPERIMENT_PROVENANCE_PHRASES = (
|
|
|
241
241
|
"调参运行",
|
|
242
242
|
"调参轮次",
|
|
243
243
|
)
|
|
244
|
+
# Regular expressions that indicate internal experiment-planning language
# (holdout expansion, gating, payload freezing, API budgeting) in reader-facing prose.
# The English "gate" and "api" alternatives are anchored against neighbouring ASCII
# letters so that ordinary words such as "mitigate" or "rapid" do not trigger them.
INTERNAL_EXPERIMENT_PLANNING_PATTERNS = (
    r"current\s+[\d.]+\s+only\s+shows?.*need(?:s|ed)?\s+(?:a\s+)?(?:new\s+)?holdout",
    r"(?:new|additional)\s+holdout\s+(?:and|or)\s+(?:more\s+)?natural(?:ized)?\s+(?:payload|attack|statement)",
    # Left boundary only: "gate", "gates", "gated", "gating" still match.
    r"(?:small[- ]batch|pilot[- ]batch).*(?<![A-Za-z])(?:gate|gating)",
    r"(?:freeze|freezing).*(?:payload|attack statement|trigger)",
    # "API" / "APIs" as a standalone token, not as letters inside another word.
    r"(?<![A-Za-z])apis?(?![A-Za-z]).*(?:budget|cost|scale)",
    r"新增\s*(?:holdout|外部|样本|实验).*验证",
    r"还需要\s*新增.*验证",
    r"后文.*边界",
    r"当前\s*[\d.]+\s*只能说明.*不能外推.*(?:还需要|需要)",
    r"小批量.*(?:门控|gate)",
    r"(?:冻结|固定).*(?:payload|载荷|攻击语句|触发语句)",
    r"(?:不能|不得).*边跑边调",
    r"API\s*(?:规模|预算|成本)",
    r"(?:按设计|设计上).*(?:失败|不通过).*(?:过拟合|调参)",
)
|
|
244
260
|
INTERNAL_CONFIG_LABEL_PATTERN = re.compile(
|
|
245
261
|
r"\b[a-z]{1,4}\d+(?:[-_][a-z]?\d+(?:\.\d+)?){1,4}\b",
|
|
246
262
|
flags=re.IGNORECASE,
|
|
@@ -265,6 +281,10 @@ def check_common_section_gate_risks(text: str, issues: list[str]):
|
|
|
265
281
|
issues.append(
|
|
266
282
|
"reader-facing prose appears to contain internal experiment provenance or tuning/config labels; move run provenance to workflow notes or map it to paper-facing diagnostic terminology"
|
|
267
283
|
)
|
|
284
|
+
if any(re.search(pattern, prose_text, flags=re.IGNORECASE) for pattern in INTERNAL_EXPERIMENT_PLANNING_PATTERNS):
|
|
285
|
+
issues.append(
|
|
286
|
+
"reader-facing prose appears to contain internal experiment planning or holdout-expansion rationale; keep plans, gates, payload-freezing notes, and future validation logistics in workflow artifacts instead of the manuscript"
|
|
287
|
+
)
|
|
268
288
|
if contains_any(
|
|
269
289
|
prose_text,
|
|
270
290
|
(
|
|
@@ -0,0 +1,301 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
import argparse
|
|
3
|
+
import re
|
|
4
|
+
import sys
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
REQUIRED_SECTIONS = {
|
|
9
|
+
"Rule Preflight": [r"^##\s+Rule Preflight\s*$", r"^##\s+规则预检\s*$"],
|
|
10
|
+
"Stage Identity": [r"^##\s+Stage Identity\s*$", r"^##\s+阶段身份\s*$"],
|
|
11
|
+
"Core Explanation Table": [r"^##\s+Core Explanation Table\s*$", r"^##\s+核心说明表\s*$"],
|
|
12
|
+
"Evidence And Artifacts": [r"^##\s+Evidence And Artifacts\s*$", r"^##\s+证据与工件\s*$"],
|
|
13
|
+
"Next Action": [r"^##\s+Next Action\s*$", r"^##\s+下一步动作\s*$"],
|
|
14
|
+
}
|
|
15
|
+
|
|
16
|
+
REQUIRED_CORE_ROWS = {
|
|
17
|
+
"stage": ("这是什么阶段", "what stage is this", "stage"),
|
|
18
|
+
"background": ("背景是什么", "background"),
|
|
19
|
+
"why_now": ("为什么现在要做", "why now", "why this stage ran"),
|
|
20
|
+
"what_done": ("这轮具体做了什么", "what this stage did", "what did this stage do"),
|
|
21
|
+
"how_done": ("怎么做的", "how it was done", "how was it done"),
|
|
22
|
+
"worked": ("结果好的地方是什么", "what worked"),
|
|
23
|
+
"did_not_work": ("结果坏的地方是什么", "what did not work", "negative result"),
|
|
24
|
+
"verifies": ("这验证了什么", "what this verifies", "what was verified"),
|
|
25
|
+
"unverified": ("还没有验证什么", "what remains unverified", "not yet verified"),
|
|
26
|
+
"improve_why": ("是否需要改进", "need improvement", "what needs improvement"),
|
|
27
|
+
"how_improve": ("下一步怎么改", "how to improve"),
|
|
28
|
+
"evidence": ("关键证据在哪里", "key evidence", "evidence"),
|
|
29
|
+
"decision": ("现在应该继续", "continue, stop", "decision"),
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
PLACEHOLDER_VALUES = {
|
|
33
|
+
"",
|
|
34
|
+
"-",
|
|
35
|
+
"--",
|
|
36
|
+
"—",
|
|
37
|
+
"todo",
|
|
38
|
+
"tbd",
|
|
39
|
+
"n/a",
|
|
40
|
+
"na",
|
|
41
|
+
"none",
|
|
42
|
+
"待补",
|
|
43
|
+
"待定",
|
|
44
|
+
"无",
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
SHALLOW_VALUES = {
|
|
48
|
+
"done",
|
|
49
|
+
"ok",
|
|
50
|
+
"pass",
|
|
51
|
+
"passed",
|
|
52
|
+
"符合预期",
|
|
53
|
+
"已完成",
|
|
54
|
+
"继续优化",
|
|
55
|
+
"继续推进",
|
|
56
|
+
"没有问题",
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
INTERNAL_META_PATTERNS = [
|
|
60
|
+
r"用户说",
|
|
61
|
+
r"我来解释",
|
|
62
|
+
r"我会",
|
|
63
|
+
r"我已经",
|
|
64
|
+
r"你要求",
|
|
65
|
+
r"\bagent\b",
|
|
66
|
+
r"\bsubagent\b",
|
|
67
|
+
r"\bprompt\b",
|
|
68
|
+
r"提示词",
|
|
69
|
+
r"按.*技能",
|
|
70
|
+
r"service-style",
|
|
71
|
+
r"AI-assistant",
|
|
72
|
+
]
|
|
73
|
+
|
|
74
|
+
WHY_MARKERS = (
|
|
75
|
+
"because",
|
|
76
|
+
"so that",
|
|
77
|
+
"therefore",
|
|
78
|
+
"reason",
|
|
79
|
+
"why",
|
|
80
|
+
"因为",
|
|
81
|
+
"所以",
|
|
82
|
+
"因此",
|
|
83
|
+
"原因",
|
|
84
|
+
"以便",
|
|
85
|
+
"用于",
|
|
86
|
+
"避免",
|
|
87
|
+
)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def parse_args():
    """Parse the command-line options of the stage-report validator.

    Returns:
        The parsed namespace with ``stage_report`` (required path) and ``stage``
        (expected stage name, empty string when not given).
    """
    cli = argparse.ArgumentParser(description="Validate a plain-language lab stage report.")
    option_specs = (
        ("--stage-report", {"required": True, "help": "Path to the stage report markdown file."}),
        ("--stage", {"default": "", "help": "Expected lab stage name, such as run, auto, or write."}),
    )
    for flag, options in option_specs:
        cli.add_argument(flag, **options)
    return cli.parse_args()
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def normalize(text: str) -> str:
    """Lower-case ``text``, trim it, and collapse each whitespace run to one space."""
    trimmed_lower = text.strip().lower()
    whitespace_run = re.compile(r"\s+")
    return whitespace_run.sub(" ", trimmed_lower)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def extract_section(text: str, patterns: list[str]) -> str:
    """Return the body of the first section whose heading matches one of ``patterns``.

    Patterns are tried in order with ``re.MULTILINE``. The body runs from the end
    of the matched heading up to the next line that starts with ``##`` followed by
    whitespace, or to the end of ``text`` when there is no such line.

    Args:
        text: Markdown document.
        patterns: Heading regexes, typically one per supported language.

    Returns:
        The stripped section body, or an empty string when no heading matches.
    """
    heading = None
    for pattern in patterns:
        heading = re.search(pattern, text, flags=re.MULTILINE)
        if heading:
            break
    if heading is None:
        return ""

    remainder = text[heading.end():]
    following_heading = re.search(r"^##\s+", remainder, flags=re.MULTILINE)
    if following_heading is None:
        section_body = remainder
    else:
        section_body = remainder[:following_heading.start()]
    return section_body.strip()
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def find_missing_sections(text: str) -> list[str]:
    """List the required sections whose heading does not appear in ``text``.

    A section counts as present when at least one of its heading patterns in
    ``REQUIRED_SECTIONS`` matches with ``re.MULTILINE``.

    Returns:
        Names of the missing sections, in ``REQUIRED_SECTIONS`` order.
    """

    def heading_present(heading_patterns: list[str]) -> bool:
        return any(re.search(candidate, text, flags=re.MULTILINE) for candidate in heading_patterns)

    return [
        section_name
        for section_name, heading_patterns in REQUIRED_SECTIONS.items()
        if not heading_present(heading_patterns)
    ]
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def parse_core_table_rows(text: str) -> dict[str, str]:
    """Parse the Core Explanation Table into a ``{normalized question: answer}`` mapping.

    Only lines that start with ``|`` and contain at least three pipes are read.
    The first cell is the question (normalized) and the second cell is the answer.
    Header rows (``Question`` / ``问题``), Markdown separator rows such as
    ``---`` or ``:---``, and rows with an empty question are skipped.

    Args:
        text: Full stage-report text.

    Returns:
        Mapping from normalized question text to the raw, stripped answer cell.
        When a question repeats, the last row wins.
    """
    section = extract_section(text, REQUIRED_SECTIONS["Core Explanation Table"])
    header_questions = {"question", "问题"}
    separator_chars = set("-: ")
    rows = {}
    for raw_line in section.splitlines():
        line = raw_line.strip()
        if not line.startswith("|") or line.count("|") < 3:
            continue
        cells = [cell.strip() for cell in line.strip("|").split("|")]
        if len(cells) < 2:
            continue
        question = normalize(cells[0])
        answer = cells[1].strip()
        if not question or question in header_questions:
            continue
        if set(question) <= separator_chars:
            # Alignment/separator row of the Markdown table, not a question.
            continue
        rows[question] = answer
    return rows
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def find_row_value(rows: dict[str, str], markers: tuple[str, ...]) -> str | None:
    """Return the answer of the first row whose question contains any marker.

    Markers are normalized before the substring comparison; rows are visited in
    the mapping's iteration order.

    Returns:
        The matching row's answer, or ``None`` when no question matches.
    """
    needles = tuple(normalize(marker) for marker in markers)
    matching_answers = (
        answer
        for question, answer in rows.items()
        if any(needle in question for needle in needles)
    )
    return next(matching_answers, None)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def is_blank_or_placeholder(value: str | None) -> bool:
    """Report whether ``value`` is missing, empty, or a known placeholder token.

    The value is normalized and surrounding spaces and punctuation are removed
    before it is compared against ``PLACEHOLDER_VALUES``.
    """
    if value is not None:
        trimmed = normalize(value).strip(" .:;,。;:")
        return trimmed in PLACEHOLDER_VALUES
    return True
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def has_marker_with_value(body: str, markers: tuple[str, ...]) -> bool:
    """Report whether any line of ``body`` carries one of ``markers`` with a real value.

    Each line is stripped and the text after the first occurrence of a marker is
    taken as the field value. Scanning continues past occurrences whose value is
    blank or a placeholder, so a later filled-in occurrence, or an alternate
    (localized) marker, is still recognised.

    Args:
        body: Section text to scan line by line.
        markers: Accepted spellings of the field label.

    Returns:
        True when at least one marker occurrence has a non-placeholder value.
    """
    for line in body.splitlines():
        stripped = line.strip()
        for marker in markers:
            if marker not in stripped:
                continue
            value = stripped.split(marker, 1)[1].strip()
            if not is_blank_or_placeholder(value):
                return True
    return False
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def is_shallow(value: str | None) -> bool:
    """Report whether ``value`` is missing or too thin to count as an explanation.

    After normalization and trimming of surrounding spaces and punctuation, a
    value is shallow when it is one of ``SHALLOW_VALUES`` or shorter than eight
    characters.
    """
    if value is None:
        return True
    trimmed = normalize(value).strip(" .:;,。;:")
    if trimmed in SHALLOW_VALUES:
        return True
    return len(trimmed) < 8
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def has_why(value: str) -> bool:
    """Report whether the normalized ``value`` contains any of ``WHY_MARKERS``."""
    haystack = normalize(value)
    for reason_marker in WHY_MARKERS:
        if reason_marker in haystack:
            return True
    return False
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def validate_core_table(text: str) -> list[str]:
    """Validate the answers in the Core Explanation Table.

    Three checks run in order:
      1. every row in ``REQUIRED_CORE_ROWS`` has a non-placeholder answer
         (reported as one combined issue);
      2. the rows that carry judgement are not shallow;
      3. the two improvement rows include a reason marker when they are non-empty.

    Returns:
        Issue messages in the order the checks are listed above.
    """
    rows = parse_core_table_rows(text)

    def answer_for(row_name: str) -> str | None:
        return find_row_value(rows, REQUIRED_CORE_ROWS[row_name])

    issues = []

    unanswered = [
        row_name
        for row_name, markers in REQUIRED_CORE_ROWS.items()
        if is_blank_or_placeholder(find_row_value(rows, markers))
    ]
    if unanswered:
        issues.append(f"Core Explanation Table is missing non-empty answers for: {', '.join(unanswered)}")

    depth_checked_rows = ("did_not_work", "verifies", "improve_why", "how_improve", "decision")
    issues.extend(
        f"Core Explanation Table row '{row_name}' is too shallow"
        for row_name in depth_checked_rows
        if is_shallow(answer_for(row_name))
    )

    for row_name in ("improve_why", "how_improve"):
        answer = answer_for(row_name)
        if not answer:
            continue
        if not has_why(answer):
            issues.append(f"Core Explanation Table row '{row_name}' must include a reason, not only an action")
    return issues
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def validate_evidence_section(text: str) -> list[str]:
    """Validate the fields of the 'Evidence And Artifacts' section.

    Each field may be labelled in English or Chinese. A field whose label is
    absent is reported as missing; a field whose label is present without a
    usable value is reported as empty.

    Returns:
        Issue messages, or a single "section is empty" message when the section
        has no body.
    """
    evidence_body = extract_section(text, REQUIRED_SECTIONS["Evidence And Artifacts"])
    if not evidence_body:
        return ["Evidence And Artifacts section is empty"]

    field_labels = (
        ("Primary artifact:", "主工件:"),
        ("Supporting artifacts:", "支撑工件:"),
        ("Validation commands:", "验证命令:"),
        ("Known gaps:", "已知缺口:"),
    )
    issues = []
    for labels in field_labels:
        english_label = labels[0]
        label_present = any(label in evidence_body for label in labels)
        if not label_present:
            issues.append(f"Evidence And Artifacts is missing '{english_label}'")
        elif not has_marker_with_value(evidence_body, labels):
            issues.append(f"Evidence And Artifacts field '{english_label}' must have a non-empty value")
    return issues
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def validate_rule_preflight(text: str) -> list[str]:
    """Validate that every mandatory 'Rule Preflight' field is present and filled in.

    Returns:
        One issue per field that is either absent from the section or present
        without a usable value.
    """
    preflight_body = extract_section(text, REQUIRED_SECTIONS["Rule Preflight"])
    required_labels = (
        "Rule source file:",
        "Rule source revision:",
        "Project version:",
        "Resolved stage:",
        "Resolved mode:",
        "Resolved target:",
        "Preflight stamp:",
    )
    issues = []
    for label in required_labels:
        if label not in preflight_body:
            issues.append(f"Rule Preflight is missing '{label}'")
        elif not has_marker_with_value(preflight_body, (label,)):
            issues.append(f"Rule Preflight field '{label}' must have a non-empty value")
    return issues
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
def validate_next_action(text: str) -> list[str]:
    """Validate the 'Next Action' section.

    The checks look at the whole section body, field labels included, and stop
    at the first failure: the body must not be shallow, must contain one of the
    allowed decision words (English or Chinese), and must contain a reason marker.

    Returns:
        A single-item list describing the first failed check, or an empty list.
    """
    action_body = extract_section(text, REQUIRED_SECTIONS["Next Action"])
    decision_words = (
        "continue", "stop", "revise", "rerun", "escalate", "handoff",
        "继续", "停止", "修订", "重跑", "升级", "交接",
    )

    if is_shallow(action_body):
        problem = "Next Action section must state a concrete decision and next step"
    elif not any(word in normalize(action_body) for word in decision_words):
        problem = "Next Action must choose continue, stop, revise, rerun, escalate, or handoff"
    elif not has_why(action_body):
        problem = "Next Action must include why the next step is appropriate"
    else:
        problem = None

    return [] if problem is None else [problem]
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
def validate_stage_identity(text: str, expected_stage: str) -> list[str]:
    """Check that the 'Stage Identity' section mentions ``expected_stage``.

    The comparison is a case-insensitive substring test on the section body and
    is skipped entirely when ``expected_stage`` is empty.

    Returns:
        A single-item issue list when the stage name is absent, otherwise empty.
    """
    if expected_stage:
        identity_body = extract_section(text, REQUIRED_SECTIONS["Stage Identity"])
        stage_mentioned = expected_stage.lower() in identity_body.lower()
        if not stage_mentioned:
            return [f"Stage Identity must mention expected stage '{expected_stage}'"]
    return []
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def validate_internal_meta(text: str) -> list[str]:
    """Flag internal or service-style meta language anywhere in the report.

    Every pattern in ``INTERNAL_META_PATTERNS`` is searched case-insensitively.

    Returns:
        One issue per matching pattern, in pattern order.
    """
    return [
        f"stage report contains internal or service-style meta language: {meta_pattern}"
        for meta_pattern in INTERNAL_META_PATTERNS
        if re.search(meta_pattern, text, flags=re.IGNORECASE)
    ]
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
def validate(path: Path, expected_stage: str = "") -> list[str]:
    """Run every stage-report check against the file at ``path``.

    The per-section validators run only when all required sections are present;
    the internal-meta language check always runs on a readable file.

    Args:
        path: Location of the stage report (read as UTF-8).
        expected_stage: Stage name the report should mention; empty to skip.

    Returns:
        All collected issue messages; empty when the report is valid.
    """
    if not path.exists():
        return [f"stage report does not exist: {path}"]

    text = path.read_text(encoding="utf-8")
    missing_sections = find_missing_sections(text)
    if missing_sections:
        issues = [f"stage report is missing required sections: {', '.join(missing_sections)}"]
    else:
        issues = [
            *validate_rule_preflight(text),
            *validate_stage_identity(text, expected_stage),
            *validate_core_table(text),
            *validate_evidence_section(text),
            *validate_next_action(text),
        ]
    issues.extend(validate_internal_meta(text))
    return issues
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def main():
    """Validate the report named on the command line and return the exit status.

    Returns:
        0 after printing a success message to stdout, or 1 after printing every
        issue to stderr.
    """
    args = parse_args()
    problems = validate(Path(args.stage_report), args.stage)
    if not problems:
        print("stage report is valid")
        return 0
    for problem in problems:
        print(problem, file=sys.stderr)
    return 1
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
if __name__ == "__main__":
|
|
301
|
+
raise SystemExit(main())
|
|
@@ -51,6 +51,12 @@
|
|
|
51
51
|
- Primary metric plain-language explanation:
|
|
52
52
|
- Secondary metric plain-language explanation:
|
|
53
53
|
- Health or support metrics and why they are not the main claim:
|
|
54
|
+
- Evaluation target:
|
|
55
|
+
- Test-set prediction used:
|
|
56
|
+
- Ranking or grouping step:
|
|
57
|
+
- Aggregation / calculation sketch:
|
|
58
|
+
- Direction and scale:
|
|
59
|
+
- Comparability boundary:
|
|
54
60
|
|
|
55
61
|
## Background Sources
|
|
56
62
|
|
|
@@ -17,6 +17,12 @@
|
|
|
17
17
|
- Primary metric plain-language explanation:
|
|
18
18
|
- Secondary metric plain-language explanation:
|
|
19
19
|
- Health or support metrics and how to read them:
|
|
20
|
+
- Evaluation target:
|
|
21
|
+
- Test-set prediction used:
|
|
22
|
+
- Ranking or grouping step:
|
|
23
|
+
- Aggregation / calculation sketch:
|
|
24
|
+
- Direction and scale:
|
|
25
|
+
- Comparability boundary:
|
|
20
26
|
|
|
21
27
|
## Final Performance Summary
|
|
22
28
|
|
|
@@ -2,18 +2,18 @@
|
|
|
2
2
|
\caption{One-sentence message of the table and the evaluation protocol.}
|
|
3
3
|
\label{tab:placeholder}
|
|
4
4
|
\centering
|
|
5
|
-
\begin{
|
|
5
|
+
\begin{tabularx}{\linewidth}{>{\raggedright\arraybackslash}Xcc}
|
|
6
6
|
\toprule
|
|
7
7
|
Method & Metric 1 $\uparrow$ & Metric 2 $\uparrow$ \\
|
|
8
8
|
\midrule
|
|
9
9
|
Ours & 0.0000 & 0.0000 \\
|
|
10
10
|
Baseline & 0.0000 & 0.0000 \\
|
|
11
11
|
\bottomrule
|
|
12
|
-
\end{
|
|
12
|
+
\end{tabularx}
|
|
13
13
|
% Rows: explain what each row represents.
|
|
14
14
|
% Columns: explain what each column represents and its direction.
|
|
15
15
|
% Metric definitions: expand local abbreviations, units, denominators, or event conditions.
|
|
16
16
|
% Comparison scope: explain which setting, split, attack family, or benchmark scope this table covers.
|
|
17
17
|
% Important caveat: state any omitted metrics, zero-valued metrics, or appendix-only reporting decision.
|
|
18
|
-
% Width control: first shorten headers, move secondary metrics out of the main table, and reduce or split columns; only then adjust \setlength{\tabcolsep}{...} conservatively or use \resizebox{\linewidth}{!}{...} as a documented last resort.
|
|
18
|
+
% Width control: default to bounded columns with tabularx or p{...}; first shorten headers, move secondary metrics out of the main table, and reduce or split columns; only then adjust \setlength{\tabcolsep}{...} conservatively or use \resizebox{\linewidth}{!}{...} as a documented last resort.
|
|
19
19
|
\end{table}
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
# Stage Report
|
|
2
|
+
|
|
3
|
+
## Rule Preflight
|
|
4
|
+
|
|
5
|
+
- Rule source file:
|
|
6
|
+
- Rule source revision:
|
|
7
|
+
- Project version:
|
|
8
|
+
- Resolved stage:
|
|
9
|
+
- Resolved mode:
|
|
10
|
+
- Resolved target:
|
|
11
|
+
- Preflight stamp:
|
|
12
|
+
- Override reason, if any:
|
|
13
|
+
|
|
14
|
+
## Stage Identity
|
|
15
|
+
|
|
16
|
+
- Stage:
|
|
17
|
+
- Target:
|
|
18
|
+
- Date:
|
|
19
|
+
- Status:
|
|
20
|
+
- Primary artifact:
|
|
21
|
+
- Next owner:
|
|
22
|
+
|
|
23
|
+
## Core Explanation Table
|
|
24
|
+
|
|
25
|
+
| Question | Plain Answer |
|
|
26
|
+
|---|---|
|
|
27
|
+
| 这是什么阶段? | |
|
|
28
|
+
| 背景是什么? | |
|
|
29
|
+
| 为什么现在要做? | |
|
|
30
|
+
| 这轮具体做了什么? | |
|
|
31
|
+
| 怎么做的? | |
|
|
32
|
+
| 结果好的地方是什么? | |
|
|
33
|
+
| 结果坏的地方是什么? | |
|
|
34
|
+
| 这验证了什么? | |
|
|
35
|
+
| 还没有验证什么? | |
|
|
36
|
+
| 是否需要改进?为什么? | |
|
|
37
|
+
| 下一步怎么改?为什么这样改? | |
|
|
38
|
+
| 关键证据在哪里? | |
|
|
39
|
+
| 现在应该继续、停止、重做还是升级? | |
|
|
40
|
+
|
|
41
|
+
## Evidence And Artifacts
|
|
42
|
+
|
|
43
|
+
- Primary artifact:
|
|
44
|
+
- Supporting artifacts:
|
|
45
|
+
- Validation commands:
|
|
46
|
+
- Known gaps:
|
|
47
|
+
|
|
48
|
+
## Next Action
|
|
49
|
+
|
|
50
|
+
- Decision: continue / stop / revise / rerun / escalate / handoff
|
|
51
|
+
- Concrete next step:
|
|
52
|
+
- Why this next step:
|
|
@@ -86,6 +86,9 @@
|
|
|
86
86
|
- Were all abbreviations expanded at local first mention:
|
|
87
87
|
- Did each main table include a local table note:
|
|
88
88
|
- Can a reader interpret rows and columns without chasing Method:
|
|
89
|
+
- Table width audit:
|
|
90
|
+
- Did any main table use a wide plain `tabular` layout:
|
|
91
|
+
- If width control was needed, was the table first shortened, split, moved partly to appendix, or converted to `tabularx` / bounded columns before using `\tabcolsep` or `\resizebox`:
|
|
89
92
|
- If this section used canonical short names before their defining section, was a local naming bridge added:
|
|
90
93
|
- Did model and ablation labels stay canonical instead of drifting into narrative aliases:
|
|
91
94
|
|
|
@@ -141,6 +144,7 @@
|
|
|
141
144
|
- Did the round avoid copying reference wording, claims, metrics, captions, or conclusions:
|
|
142
145
|
- Did final prose avoid service-style or AI-assistant meta language:
|
|
143
146
|
- Did final prose avoid workflow-only placeholder language:
|
|
147
|
+
- Did final prose avoid internal experiment planning, future-holdout logistics, gates, payload-freezing notes, API-budget notes, and automation triage language:
|
|
144
148
|
- Validator command and result:
|
|
145
149
|
|
|
146
150
|
## Decision
|
|
@@ -42,6 +42,10 @@ Use this skill when the user invokes `/lab:*` or asks for the structured researc
|
|
|
42
42
|
- Generate the `Rule Preflight` block from `.lab/.managed/rule-manifest.json` with the managed preflight renderer instead of handwriting it from memory.
|
|
43
43
|
- Treat missing, stale, or contradictory `Rule Preflight` data as a stage-contract failure.
|
|
44
44
|
- Project-installed rules take priority over model memory. If remembered patterns conflict with the installed rule source, follow the installed source recorded in `.lab/.managed/rule-manifest.json`.
|
|
45
|
+
- Before a `/lab:*` stage reaches a final handoff, write or update one plain-language stage report under `.lab/stage-reports/` from `.lab/.managed/templates/stage-report.md`.
|
|
46
|
+
- The stage report must include a filled `Core Explanation Table` that answers, in workflow language and plain language: background, why now, what was done, how it was done, what worked, what did not work, what was verified, what remains unverified, whether improvement is needed and why, how to improve and why, key evidence, and the continue/stop/revise/rerun/escalate/handoff decision.
|
|
47
|
+
- Stage reports are closeout and handoff artifacts, not a new user command and not a replacement for stage-specific artifacts such as idea memos, iteration reports, final reports, or write-iteration records.
|
|
48
|
+
- Run `.lab/.managed/scripts/validate_stage_report.py --stage-report <stage-report> --stage <stage>` before claiming the stage is complete, and include the stage-report path plus validation result in the final user-facing summary.
|
|
45
49
|
- Final paper output should default to LaTeX, and its manuscript language should be decided separately from the workflow language.
|
|
46
50
|
- Separate sourced facts from model-generated hypotheses.
|
|
47
51
|
- Preserve failed runs, failed ideas, and limitations.
|
|
@@ -210,6 +214,7 @@ Use this skill when the user invokes `/lab:*` or asks for the structured researc
|
|
|
210
214
|
- Read `.lab/context/mission.md`, `.lab/context/state.md`, `.lab/context/workflow-state.md`, `.lab/context/decisions.md`, `.lab/context/evidence-index.md`, and `.lab/context/data-decisions.md` before drafting.
|
|
211
215
|
- Read `.lab/context/eval-protocol.md` before choosing tables, thresholds, or final result framing.
|
|
212
216
|
- Keep metric definitions, comparison semantics, and implementation references anchored to the approved evaluation protocol instead of re-deriving them during reporting.
|
|
217
|
+
- In `report.md`, explain each primary metric with a computation guide: what is evaluated, which test-set predictions or scores are used, whether examples are sorted, grouped, bucketed, or paired, how the value is aggregated or approximately calculated, what direction and scale mean, and what cannot be compared across datasets, splits, or implementations.
|
|
213
218
|
- Aggregate them with `.lab/.managed/scripts/summarize_iterations.py`.
|
|
214
219
|
- Write the final document with `.lab/.managed/templates/final-report.md`, the managed table summary with `.lab/.managed/templates/main-tables.md`, and the internal handoff with `.lab/.managed/templates/artifact-status.md`.
|
|
215
220
|
- Keep failed attempts and limitations visible.
|
|
@@ -272,10 +277,12 @@ Use this skill when the user invokes `/lab:*` or asks for the structured researc
|
|
|
272
277
|
- Use the same metric names across Method, Experiments, captions, table headers, table notes, and result summaries; remove forbidden aliases from reader-facing LaTeX instead of letting legacy metric names drift.
|
|
273
278
|
- Run `.lab/.managed/scripts/validate_metric_glossary.py` in metric-bearing draft, final-draft, or export rounds and record the result in the latest write iteration artifact.
|
|
274
279
|
- Do not treat `\resizebox{\linewidth}{!}{...}` as the default main-table fit strategy.
|
|
275
|
-
-
|
|
280
|
+
- Wide plain `tabular` layouts with many columns are not manuscript-ready by default; prefer `tabularx` or bounded `p{...}` columns for text-heavy or multi-metric tables.
|
|
281
|
+
- Fit paper-facing main tables by redesign first: shorten headers, move secondary metrics out of the main table, reduce or split columns, prefer `tabularx` or bounded columns, then adjust `\tabcolsep` conservatively; only use `\resizebox` as a last resort and document why.
|
|
276
282
|
- Keep `\tabcolsep` adjustments conservative and avoid shrinking below a roughly readable floor for paper-facing main tables.
|
|
277
283
|
- Do not rely on `\scriptsize` or `\tiny` as the default way to make a main table fit.
|
|
278
284
|
- Keep internal identifiers, tuning-run labels, probe names, config strings, rerun ids, and package labels out of prose unless they are mapped once for the reader and then moved back out of prose.
|
|
285
|
+
- Keep internal experiment planning out of manuscript prose: future holdout expansion, small-batch gates, payload freezing, API budgets, automation decisions, and overfitting triage logic belong in lab artifacts, not paper-facing sections.
|
|
279
286
|
- Do not rely on unexplained jargon density as a substitute for academic tone.
|
|
280
287
|
- Bind each claim to evidence from `report`, iteration reports, or normalized summaries.
|
|
281
288
|
- Use the write-stage contract in `.codex/skills/lab/stages/write.md` or `.claude/skills/lab/stages/write.md` as the single source of truth for template choice, paper-plan requirements, section-specific references, validator calls, asset coverage, and final manuscript gates.
|
|
@@ -314,6 +321,7 @@ Use this skill when the user invokes `/lab:*` or asks for the structured researc
|
|
|
314
321
|
- No auto start without an explicit autonomy level and `Approval status: approved`.
|
|
315
322
|
- No final report without validated normalized results.
|
|
316
323
|
- No paper-writing round without stable report artifacts, an approved framing artifact, evidence links, and LaTeX manuscript output.
|
|
324
|
+
- No stage-final handoff without a validated plain-language stage report.
|
|
317
325
|
- No final-draft or export round without passing section-quality, claim-safety, and manuscript-delivery validation.
|
|
318
326
|
- No final-draft or export round with mismatched `workflow_language` and `paper_language` unless the latest write iteration records the language decision audit that justified the final manuscript language and the persisted workflow-language paper-layer path.
|
|
319
327
|
|
|
@@ -122,6 +122,7 @@ These are paper-facing defaults. They are not project-specific branding rules.
|
|
|
122
122
|
- Self-evaluations such as "结果也很清楚", "the defense results are very clear", or "the table is self-explanatory".
|
|
123
123
|
- Layout-process commentary in scientific prose, such as "由于表列较多,这里采用页宽自适应排版" or "we use page-width adaptive layout here".
|
|
124
124
|
- Claims that a table "proves" something when the evidence only supports a bounded empirical result.
|
|
125
|
+
- Internal experiment-planning prose, such as "还需要新增 holdout", "小批量门控", "冻结 payload", "不能边跑边调", "API 规模估计", or "if all scores are 1.0000, treat it as overfitting".
|
|
125
126
|
- Service-style or AI-assistant meta language such as "用户说", "按你的要求", "我来解释", "let me explain", or "as requested by the user".
|
|
126
127
|
- Workflow-only placeholder language such as "图的意图", "资产意图", "占位符", "workflow-language", or "sync this wording".
|
|
127
128
|
|
|
@@ -194,3 +194,9 @@
|
|
|
194
194
|
- If the user chooses to convert, persist `paper_language_finalization_decision: convert-to-paper-language`
|
|
195
195
|
- While the real experiment process is still alive, emit only a progress update and keep waiting. Do not present a terminal summary for that rung until the process exits or the rung hits an explicit stop boundary.
|
|
196
196
|
- While the loop is healthy, do not ask the user to trigger the next poll. Keep polling until a meaningful change, keepalive boundary, stop boundary, escalation boundary, or terminal boundary is reached.
|
|
197
|
+
|
|
198
|
+
## Stage Report Closeout
|
|
199
|
+
|
|
200
|
+
- At every stop, failure, escalation, or final handoff, write or update `.lab/stage-reports/<date>--auto--<target>.md` from `.lab/.managed/templates/stage-report.md`.
|
|
201
|
+
- Fill the `Core Explanation Table` in plain language: background, why now, what ran, how the loop ran, what worked, what did not work, what was verified, what remains unverified, what needs improvement and why, how to improve and why, key evidence, and the continue/stop/revise/rerun/escalate/handoff decision.
|
|
202
|
+
- Run `.lab/.managed/scripts/validate_stage_report.py --stage-report <stage-report> --stage auto` and include the report path plus validation result in the final user-facing summary.
|
|
@@ -66,3 +66,9 @@
|
|
|
66
66
|
6. Recommended approved dataset package
|
|
67
67
|
7. Risks and exclusions
|
|
68
68
|
8. Approval gate
|
|
69
|
+
|
|
70
|
+
## Stage Report Closeout
|
|
71
|
+
|
|
72
|
+
- Before final handoff, write or update `.lab/stage-reports/<date>--data--<target>.md` from `.lab/.managed/templates/stage-report.md`.
|
|
73
|
+
- Fill the `Core Explanation Table` in plain language: background, why now, what changed, how the dataset package was chosen, what worked, what did not work, what was verified, what remains unverified, what needs improvement and why, how to improve and why, key evidence, and the continue/stop/revise/rerun/escalate/handoff decision.
|
|
74
|
+
- Run `.lab/.managed/scripts/validate_stage_report.py --stage-report <stage-report> --stage data` and include the report path plus validation result in the final user-facing summary.
|
|
@@ -69,3 +69,9 @@
|
|
|
69
69
|
5. Recommended framing pack
|
|
70
70
|
6. Forbidden claims and wording
|
|
71
71
|
7. Approval gate
|
|
72
|
+
|
|
73
|
+
## Stage Report Closeout
|
|
74
|
+
|
|
75
|
+
- Before final handoff, write or update `.lab/stage-reports/<date>--framing--<target>.md` from `.lab/.managed/templates/stage-report.md`.
|
|
76
|
+
- Fill the `Core Explanation Table` in plain language: background, why now, what naming or framing changed, how it was checked, what worked, what did not work, what was verified, what remains unverified, what needs improvement and why, how to improve and why, key evidence, and the continue/stop/revise/rerun/escalate/handoff decision.
|
|
77
|
+
- Run `.lab/.managed/scripts/validate_stage_report.py --stage-report <stage-report> --stage framing` and include the report path plus validation result in the final user-facing summary.
|
|
@@ -119,6 +119,12 @@
|
|
|
119
119
|
28. Minimum viable experiment
|
|
120
120
|
29. Idea source log aligned with the two literature sweeps
|
|
121
121
|
|
|
122
|
+
## Stage Report Closeout
|
|
123
|
+
|
|
124
|
+
- Before final handoff, write or update `.lab/stage-reports/<date>--idea--<target>.md` from `.lab/.managed/templates/stage-report.md`.
|
|
125
|
+
- Fill the `Core Explanation Table` in plain language: background, why now, what idea work was done, how sources and brainstorm passes were used, what worked, what did not work, what was verified, what remains unverified, what needs improvement and why, how to improve and why, key evidence, and the continue/stop/revise/rerun/escalate/handoff decision.
|
|
126
|
+
- Run `.lab/.managed/scripts/validate_stage_report.py --stage-report <stage-report> --stage idea` and include the report path plus validation result in the final user-facing summary.
|
|
127
|
+
|
|
122
128
|
## Writing Standard
|
|
123
129
|
|
|
124
130
|
- Keep the problem statement short, concrete, and easy to scan.
|
|
@@ -78,3 +78,9 @@ If the loop stops without success, record:
|
|
|
78
78
|
- If the next move depends on an unresolved assumption, ask one clarifying question at a time.
|
|
79
79
|
- If more than one next hypothesis is credible, present 2-3 approaches with trade-offs and recommend the next bounded experiment before changing the mission state.
|
|
80
80
|
- Keep an approval gate when a proposed change would alter the frozen mission instead of only changing the implementation hypothesis.
|
|
81
|
+
|
|
82
|
+
## Stage Report Closeout
|
|
83
|
+
|
|
84
|
+
- Before final handoff, write or update `.lab/stage-reports/<date>--iterate--<target>.md` from `.lab/.managed/templates/stage-report.md`.
|
|
85
|
+
- Fill the `Core Explanation Table` in plain language: background, why now, what rounds ran, how the loop evaluated them, what worked, what did not work, what was verified, what remains unverified, what needs improvement and why, how to improve and why, key evidence, and the continue/stop/revise/rerun/escalate/handoff decision.
|
|
86
|
+
- Run `.lab/.managed/scripts/validate_stage_report.py --stage-report <stage-report> --stage iterate` and include the report path plus validation result in the final user-facing summary.
|
|
@@ -10,6 +10,7 @@
|
|
|
10
10
|
- method overview
|
|
11
11
|
- selected metrics summary
|
|
12
12
|
- plain-language metric guide
|
|
13
|
+
- metric computation guide that explains what is evaluated, which test-set predictions are used, whether examples are sorted or grouped, how values are aggregated or approximately calculated, metric direction and scale, and comparability boundaries
|
|
13
14
|
- background sources
|
|
14
15
|
- method and baseline sources
|
|
15
16
|
- metric sources
|
|
@@ -52,6 +53,8 @@
|
|
|
52
53
|
- Do not restate metric definitions, baseline behavior, or comparison implementations from memory; use the approved evaluation protocol and its recorded sources.
|
|
53
54
|
- Carry the approved `Primary metrics`, `Secondary metrics`, and `Required terminal evidence` into both the report and the managed main-tables artifact.
|
|
54
55
|
- Explain the selected primary and secondary metrics in plain language for the user: what each metric measures, whether higher or lower is better, and whether it is a main result metric or only a health/support metric.
|
|
56
|
+
- For every primary metric, also explain enough of the computation for a collaborator to reproduce the idea without reading code: what is evaluated, which test-set predictions or scores are used, whether the examples are sorted, bucketed, grouped, or paired, how the resulting values are aggregated or approximately calculated, what direction and scale mean, and which comparisons are invalid across datasets, splits, or metric implementations.
|
|
57
|
+
- If a metric depends on ranking, the report must name the ranking score and the order. If it depends on a contrast, the report must name the compared conditions or groups. If it depends on an average, rate, area, threshold crossing, or recovery amount, the report must give a simple calculation sketch.
|
|
55
58
|
- If coverage, completeness, confidence, or similar health metrics appear, explicitly say that they describe experimental reliability rather than the main scientific effect.
|
|
56
59
|
- Pull the core background references, method or baseline references, and metric references out of the approved evaluation protocol instead of hiding them in `.lab/context/*`.
|
|
57
60
|
- Treat `report.md` as an external-review-ready memo. Source sections must not rely on local file paths or internal provenance notes; they must give a few human-readable anchor references instead.
|
|
@@ -87,3 +90,9 @@
|
|
|
87
90
|
- If a missing assumption would change report interpretation, ask one clarifying question at a time.
|
|
88
91
|
- If there are multiple defensible report framings, present 2-3 approaches with trade-offs and recommend the most evidence-faithful framing before writing.
|
|
89
92
|
- Keep an approval gate when the reporting frame would materially affect what the paper later claims.
|
|
93
|
+
|
|
94
|
+
## Stage Report Closeout
|
|
95
|
+
|
|
96
|
+
- Before final handoff, write or update `.lab/stage-reports/<date>--report--<target>.md` from `.lab/.managed/templates/stage-report.md`.
|
|
97
|
+
- Fill the `Core Explanation Table` in plain language: background, why now, what report artifacts were produced, how evidence was carried forward, what worked, what did not work, what was verified, what remains unverified, what needs improvement and why, how to improve and why, key evidence, and the continue/stop/revise/rerun/escalate/handoff decision.
|
|
98
|
+
- Run `.lab/.managed/scripts/validate_stage_report.py --stage-report <stage-report> --stage report` and include the report path plus validation result in the final user-facing summary.
|
|
@@ -58,3 +58,9 @@
|
|
|
58
58
|
- If there are multiple legitimate review framings, present 2-3 approaches with trade-offs and recommend the strictest useful framing.
|
|
59
59
|
- Do not use brainstorming to soften critique; once scope is clear, stay in reviewer mode and deliver findings directly.
|
|
60
60
|
- Call out the strongest remaining alternative explanation and the strongest boundary risk when either one could materially narrow the claim.
|
|
61
|
+
|
|
62
|
+
## Stage Report Closeout
|
|
63
|
+
|
|
64
|
+
- Before final handoff, write or update `.lab/stage-reports/<date>--review--<target>.md` from `.lab/.managed/templates/stage-report.md`.
|
|
65
|
+
- Fill the `Core Explanation Table` in plain language: background, why now, what was reviewed, how the review was performed, what worked, what did not work, what was verified, what remains unverified, what needs improvement and why, how to improve and why, key evidence, and the continue/stop/revise/rerun/escalate/handoff decision.
|
|
66
|
+
- Run `.lab/.managed/scripts/validate_stage_report.py --stage-report <stage-report> --stage review` and include the report path plus validation result in the final user-facing summary.
|
|
@@ -55,3 +55,9 @@
|
|
|
55
55
|
- If the next run depends on an unresolved assumption, ask one clarifying question at a time.
|
|
56
56
|
- If there are multiple defensible tiny-run options, present 2-3 approaches with trade-offs and recommend the cheapest informative run.
|
|
57
57
|
- Only ask for approval when choosing a run path would materially spend more time or compute than the default smallest experiment.
|
|
58
|
+
|
|
59
|
+
## Stage Report Closeout
|
|
60
|
+
|
|
61
|
+
- Before final handoff, write or update `.lab/stage-reports/<date>--run--<target>.md` from `.lab/.managed/templates/stage-report.md`.
|
|
62
|
+
- Fill the `Core Explanation Table` in plain language: background, why now, what ran, how it ran, what worked, what did not work, what was verified, what remains unverified, what needs improvement and why, how to improve and why, key evidence, and the continue/stop/revise/rerun/escalate/handoff decision.
|
|
63
|
+
- Run `.lab/.managed/scripts/validate_stage_report.py --stage-report <stage-report> --stage run` and include the report path plus validation result in the final user-facing summary.
|
|
@@ -72,3 +72,9 @@
|
|
|
72
72
|
- evaluation normalization
|
|
73
73
|
- bounded iteration
|
|
74
74
|
- final report
|
|
75
|
+
|
|
76
|
+
## Stage Report Closeout
|
|
77
|
+
|
|
78
|
+
- Before final handoff, write or update `.lab/stage-reports/<date>--spec--<target>.md` from `.lab/.managed/templates/stage-report.md`.
|
|
79
|
+
- Fill the `Core Explanation Table` in plain language: background, why now, what change artifacts were created, how the spec was structured, what worked, what did not work, what was verified, what remains unverified, what needs improvement and why, how to improve and why, key evidence, and the continue/stop/revise/rerun/escalate/handoff decision.
|
|
80
|
+
- Run `.lab/.managed/scripts/validate_stage_report.py --stage-report <stage-report> --stage spec` and include the report path plus validation result in the final user-facing summary.
|
|
@@ -165,6 +165,8 @@ Do not enter prose polish until the current section has passed the reference-con
|
|
|
165
165
|
- Do not use labels containing `_` or `-` in reader-facing prose.
|
|
166
166
|
- Keep internal identifiers, config keys, and experiment package labels out of reader-facing prose unless they are mapped once for the reader and then moved back out of prose.
|
|
167
167
|
- Keep run provenance such as tuning-run labels, probe names, internal config strings, rerun ids, and package labels out of reader-facing prose. If the evidence is useful, rewrite it as a bounded paper-facing diagnostic or move the raw provenance to workflow notes or appendix metadata.
|
|
168
|
+
- Keep internal experiment planning out of reader-facing prose. Do not write paper sentences that explain future holdout expansion, small-batch gates, payload freezing, API budget, "if all scores are 1.0000 then treat as overfitting", or why a next automation round is needed.
|
|
169
|
+
- When an experiment boundary matters, report only the scientific scope already supported by the evidence. Put the operational plan for collecting new attacks, new papers, new markers, or additional holdout cases into `.lab/changes/`, `.lab/iterations/`, or report artifacts, not into manuscript sections.
|
|
168
170
|
- Do not use unexplained terminology density as a substitute for academic tone.
|
|
169
171
|
- Keep service-style or AI-assistant meta language out of manuscript prose. Phrases such as "用户说", "按你的要求", "我来解释", "下面我", "this version", or "as requested by the user" belong in workflow notes, not in paper-facing sections, captions, table notes, or analysis assets.
|
|
170
172
|
- Keep workflow-only placeholder language out of manuscript prose. Phrases such as "图的意图", "资产意图", "占位符", "workflow-language", "translation layer", or "sync this wording" belong in authoring artifacts, not in reader-facing LaTeX.
|
|
@@ -178,10 +180,12 @@ Do not enter prose polish until the current section has passed the reference-con
|
|
|
178
180
|
- If a metric's denominator, event condition, score scale, or comparison scope differs by setting, define a separate entry or explicitly scope the metric in `.lab/writing/metric-glossary.md`.
|
|
179
181
|
- Deprecated or forbidden metric aliases must be removed from reader-facing LaTeX instead of explained away locally.
|
|
180
182
|
- Do not treat `\resizebox{\linewidth}{!}{...}` as the default way to fit a main table.
|
|
181
|
-
-
|
|
183
|
+
- Wide plain `tabular` layouts with many columns are not manuscript-ready by default; final/export validation should force a width-aware table design instead of silently accepting likely overfull tables.
|
|
184
|
+
- Main-table width control should follow this order: shorten headers while preserving local explanations, move secondary metrics to appendix-only, reduce or split columns, prefer `tabularx` or bounded `p{...}` columns, adjust `\tabcolsep` conservatively, and only then consider `\resizebox` as a last resort.
|
|
182
185
|
- When `\tabcolsep` is adjusted for a paper-facing main table, keep it in a safe range and avoid shrinking below roughly `3pt`; prefer `4pt` or `5pt` when a small reduction is enough.
|
|
183
186
|
- Do not use `\scriptsize` or `\tiny` as the default main-table fit strategy. If a table only fits after aggressive font shrinking, redesign the table instead of forcing it into the page.
|
|
184
187
|
- If a paper-facing main table uses `\resizebox` or non-default width control, explain the width-control rationale in the same table note.
|
|
188
|
+
- Prefer `tabularx` for paper-facing main tables whose first column or text-heavy columns need bounded line wrapping; use plain `tabular` only for compact tables with a small column count.
|
|
185
189
|
- Every main table should have a short table-introduction sentence before it and a short interpretation sentence after it so the reader knows what question the table answers and how to read the result.
|
|
186
190
|
- Build the paper asset plan before prose when the section carries introduction, experimental, method, related-work, or conclusion claims:
|
|
187
191
|
- record the asset coverage targets and gaps for the current paper
|
|
@@ -221,6 +225,7 @@ Do not enter prose polish until the current section has passed the reference-con
|
|
|
221
225
|
- Table assets must also include a local table note that explains row meaning, column meaning, metric definitions, comparison scope, and any important caveat.
|
|
222
226
|
- The local table note must contain real reader-facing explanations, not the default template phrases such as "explain what each row represents" or "expand local abbreviations".
|
|
223
227
|
- Table assets must not rely on aggressive width hacks by default; if width control is still needed after table redesign, document it locally and keep it readable.
|
|
228
|
+
- Table assets with seven or more columns should be split, moved partly to appendix, or written with width-aware columns such as `tabularx` or `p{...}` instead of a plain `tabular` layout.
|
|
224
229
|
- Figure placeholders may record what the final figure should show and why the reader needs it in authoring comments, the paper plan, or the write-iteration artifact, but the caption itself must remain paper-facing and must not contain "Figure intent", "图的意图", "asset intent", "占位符", or similar workflow language.
|
|
225
230
|
- Core asset coverage for a paper-facing final draft should include a problem-setting or teaser figure, a method overview figure, a results overview figure, a main-results table, an ablation table, and one additional analysis asset.
|
|
226
231
|
- Keep `.lab/writing/plan.md` synchronized with the current table plan, figure plan, citation plan, and section-to-asset map whenever manuscript assets change.
|
|
@@ -298,3 +303,9 @@ Do not enter prose polish until the current section has passed the reference-con
|
|
|
298
303
|
- If the user asks to continue tightening the same section, default to a section-level acceptance review first instead of another immediate prose-polish pass.
|
|
299
304
|
- Only recommend another tighten/compress/polish pass after the current section has passed the section-level acceptance gate.
|
|
300
305
|
- If the round introduces or revises key terms, abbreviations, metrics, or mechanism names, include a short terminology note in the final user-facing response that says the full form, approved short form if any, what each term is, and why it matters here, and point to `.lab/writing/terminology-glossary.md` plus the write iteration artifact for the full terminology audit.
|
|
306
|
+
|
|
307
|
+
## Stage Report Closeout
|
|
308
|
+
|
|
309
|
+
- Before final handoff, write or update `.lab/stage-reports/<date>--write--<target>.md` from `.lab/.managed/templates/stage-report.md`.
|
|
310
|
+
- Fill the `Core Explanation Table` in plain language: background, why now, what section or asset changed, how evidence and writing rules were applied, what worked, what did not work, what was verified, what remains unverified, what needs improvement and why, how to improve and why, key evidence, and the continue/stop/revise/rerun/escalate/handoff decision.
|
|
311
|
+
- Run `.lab/.managed/scripts/validate_stage_report.py --stage-report <stage-report> --stage write` and include the report path plus validation result in the final user-facing summary.
|