superlab 0.1.23 → 0.1.25
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -2
- package/README.zh-CN.md +3 -2
- package/lib/auto_contracts.cjs +4 -2
- package/lib/auto_runner.cjs +30 -0
- package/lib/auto_state.cjs +30 -0
- package/lib/context.cjs +437 -14
- package/lib/eval_protocol.cjs +75 -0
- package/lib/i18n.cjs +140 -24
- package/lib/install.cjs +2 -0
- package/package-assets/claude/commands/lab.md +2 -2
- package/package-assets/codex/prompts/lab.md +2 -2
- package/package-assets/shared/lab/.managed/scripts/validate_collaborator_report.py +53 -0
- package/package-assets/shared/lab/.managed/templates/artifact-status.md +28 -0
- package/package-assets/shared/lab/.managed/templates/final-report.md +24 -19
- package/package-assets/shared/lab/.managed/templates/review-checklist.md +4 -0
- package/package-assets/shared/lab/context/auto-mode.md +3 -3
- package/package-assets/shared/lab/context/auto-outcome.md +15 -0
- package/package-assets/shared/lab/context/eval-protocol.md +21 -0
- package/package-assets/shared/lab/context/session-brief.md +1 -1
- package/package-assets/shared/lab/context/state.md +19 -13
- package/package-assets/shared/lab/context/workflow-state.md +19 -0
- package/package-assets/shared/lab/system/core.md +4 -2
- package/package-assets/shared/skills/lab/SKILL.md +10 -10
- package/package-assets/shared/skills/lab/stages/auto.md +5 -1
- package/package-assets/shared/skills/lab/stages/iterate.md +4 -0
- package/package-assets/shared/skills/lab/stages/report.md +11 -1
- package/package-assets/shared/skills/lab/stages/review.md +4 -0
- package/package-assets/shared/skills/lab/stages/run.md +4 -0
- package/package.json +1 -1
package/lib/eval_protocol.cjs
CHANGED
|
@@ -79,6 +79,81 @@ const EVAL_PROTOCOL_FIELDS = [
|
|
|
79
79
|
key: "deviationFromOriginalImplementation",
|
|
80
80
|
labels: ["Deviation from original implementation", "与原始实现的偏差"],
|
|
81
81
|
},
|
|
82
|
+
{
|
|
83
|
+
name: "Evaluation setting semantics",
|
|
84
|
+
key: "evaluationSettingSemantics",
|
|
85
|
+
labels: ["Evaluation setting semantics", "评测设定语义"],
|
|
86
|
+
},
|
|
87
|
+
{
|
|
88
|
+
name: "Visibility and leakage risks",
|
|
89
|
+
key: "visibilityAndLeakageRisks",
|
|
90
|
+
labels: ["Visibility and leakage risks", "可见性与泄漏风险"],
|
|
91
|
+
},
|
|
92
|
+
{
|
|
93
|
+
name: "Anchor and label policy",
|
|
94
|
+
key: "anchorAndLabelPolicy",
|
|
95
|
+
labels: ["Anchor and label policy", "锚点与标签策略"],
|
|
96
|
+
},
|
|
97
|
+
{
|
|
98
|
+
name: "Scale and comparability policy",
|
|
99
|
+
key: "scaleAndComparabilityPolicy",
|
|
100
|
+
labels: ["Scale and comparability policy", "尺度与可比性策略"],
|
|
101
|
+
},
|
|
102
|
+
{
|
|
103
|
+
name: "Metric validity checks",
|
|
104
|
+
key: "metricValidityChecks",
|
|
105
|
+
labels: ["Metric validity checks", "指标有效性检查"],
|
|
106
|
+
},
|
|
107
|
+
{
|
|
108
|
+
name: "Comparison validity checks",
|
|
109
|
+
key: "comparisonValidityChecks",
|
|
110
|
+
labels: ["Comparison validity checks", "对比方法有效性检查"],
|
|
111
|
+
},
|
|
112
|
+
{
|
|
113
|
+
name: "Statistical validity checks",
|
|
114
|
+
key: "statisticalValidityChecks",
|
|
115
|
+
labels: ["Statistical validity checks", "统计有效性检查"],
|
|
116
|
+
},
|
|
117
|
+
{
|
|
118
|
+
name: "Claim boundary",
|
|
119
|
+
key: "claimBoundary",
|
|
120
|
+
labels: ["Claim boundary", "结论边界"],
|
|
121
|
+
},
|
|
122
|
+
{
|
|
123
|
+
name: "Integrity self-check",
|
|
124
|
+
key: "integritySelfCheck",
|
|
125
|
+
labels: ["Integrity self-check", "完整性自检"],
|
|
126
|
+
},
|
|
127
|
+
{
|
|
128
|
+
name: "Anomaly signals",
|
|
129
|
+
key: "anomalySignals",
|
|
130
|
+
labels: ["Anomaly signals", "异常信号"],
|
|
131
|
+
},
|
|
132
|
+
{
|
|
133
|
+
name: "Implementation reality checks",
|
|
134
|
+
key: "implementationRealityChecks",
|
|
135
|
+
labels: ["Implementation reality checks", "实现层现实检查"],
|
|
136
|
+
},
|
|
137
|
+
{
|
|
138
|
+
name: "Alternative explanations considered",
|
|
139
|
+
key: "alternativeExplanationsConsidered",
|
|
140
|
+
labels: ["Alternative explanations considered", "已考虑的替代解释"],
|
|
141
|
+
},
|
|
142
|
+
{
|
|
143
|
+
name: "Cross-check method",
|
|
144
|
+
key: "crossCheckMethod",
|
|
145
|
+
labels: ["Cross-check method", "交叉验证方法"],
|
|
146
|
+
},
|
|
147
|
+
{
|
|
148
|
+
name: "Best-supported interpretation",
|
|
149
|
+
key: "bestSupportedInterpretation",
|
|
150
|
+
labels: ["Best-supported interpretation", "当前最站得住的解释"],
|
|
151
|
+
},
|
|
152
|
+
{
|
|
153
|
+
name: "Escalation threshold",
|
|
154
|
+
key: "escalationThreshold",
|
|
155
|
+
labels: ["Escalation threshold", "升级阈值"],
|
|
156
|
+
},
|
|
82
157
|
{
|
|
83
158
|
name: "Benchmark ladder",
|
|
84
159
|
key: "benchmarkLadder",
|
package/lib/i18n.cjs
CHANGED
|
@@ -55,7 +55,7 @@ const ZH_CONTENT = {
|
|
|
55
55
|
[path.join(".codex", "prompts", "lab-report.md")]: codexPrompt(
|
|
56
56
|
"基于验证后的迭代工件生成最终报告",
|
|
57
57
|
"report context",
|
|
58
|
-
"使用已安装的 `lab` 技能:`.codex/skills/lab/SKILL.md`。\n\n立刻针对用户当前给出的参数执行 `/lab:report`,不要只推荐别的 `/lab` 阶段。只有在缺少阻塞性前提时,才明确指出缺什么,并且一次最多追问一个问题。\n\n本命令运行 `/lab:report`
|
|
58
|
+
"使用已安装的 `lab` 技能:`.codex/skills/lab/SKILL.md`。\n\n立刻针对用户当前给出的参数执行 `/lab:report`,不要只推荐别的 `/lab` 阶段。只有在缺少阻塞性前提时,才明确指出缺什么,并且一次最多追问一个问题。\n\n本命令运行 `/lab:report` 阶段。它必须生成给用户直接阅读的最终实验报告、受管的 `main-tables.md`,以及单独的内部 `artifact-status.md`。主报告要明确写出主指标、次级指标和必要终局证据,并用白话解释这些指标分别衡量什么、哪些只是健康度或支持性指标、以及每张主表到底证明了什么和没证明什么。"
|
|
59
59
|
),
|
|
60
60
|
[path.join(".codex", "prompts", "lab-write.md")]: codexPrompt(
|
|
61
61
|
"把验证过的研究工件转成论文 section,并按小步方式修订",
|
|
@@ -102,7 +102,7 @@ const ZH_CONTENT = {
|
|
|
102
102
|
"lab-report",
|
|
103
103
|
"基于验证后的迭代工件生成最终报告",
|
|
104
104
|
"report context",
|
|
105
|
-
"使用已安装的 `lab` 技能:`.claude/skills/lab/SKILL.md`。\n\n立刻针对用户当前给出的参数执行 `report` 阶段,不要只推荐别的 lab 阶段。只有在缺少阻塞性前提时,才明确指出缺什么,并且一次最多追问一个问题。\n\n本命令运行 lab workflow 的 `report`
|
|
105
|
+
"使用已安装的 `lab` 技能:`.claude/skills/lab/SKILL.md`。\n\n立刻针对用户当前给出的参数执行 `report` 阶段,不要只推荐别的 lab 阶段。只有在缺少阻塞性前提时,才明确指出缺什么,并且一次最多追问一个问题。\n\n本命令运行 lab workflow 的 `report` 阶段。它必须生成给用户直接阅读的最终实验报告、受管的 `main-tables.md`,以及单独的内部 `artifact-status.md`。主报告要明确写出主指标、次级指标和必要终局证据,并用白话解释这些指标分别衡量什么、哪些只是健康度或支持性指标、以及每张主表到底证明了什么和没证明什么。"
|
|
106
106
|
),
|
|
107
107
|
[path.join(".claude", "commands", "lab-write.md")]: claudeCommand(
|
|
108
108
|
"lab-write",
|
|
@@ -300,6 +300,7 @@ const ZH_SKILL_FILES = {
|
|
|
300
300
|
- 实验设置
|
|
301
301
|
- 已验证主结果
|
|
302
302
|
- 位于 \`<deliverables_root>/main-tables.md\` 的受管主表工件
|
|
303
|
+
- 位于 \`<deliverables_root>/artifact-status.md\` 的内部工件状态
|
|
303
304
|
- 怎么看主表的阅读指引
|
|
304
305
|
- 消融
|
|
305
306
|
- 失败尝试
|
|
@@ -311,6 +312,7 @@ const ZH_SKILL_FILES = {
|
|
|
311
312
|
|
|
312
313
|
- \`.lab/context/mission.md\`
|
|
313
314
|
- \`.lab/context/state.md\`
|
|
315
|
+
- \`.lab/context/workflow-state.md\`
|
|
314
316
|
- \`.lab/context/decisions.md\`
|
|
315
317
|
- \`.lab/context/evidence-index.md\`
|
|
316
318
|
- \`.lab/context/eval-protocol.md\`
|
|
@@ -320,6 +322,7 @@ const ZH_SKILL_FILES = {
|
|
|
320
322
|
- \`.lab/context/mission.md\`
|
|
321
323
|
- \`.lab/context/eval-protocol.md\`
|
|
322
324
|
- \`.lab/context/state.md\`
|
|
325
|
+
- \`.lab/context/workflow-state.md\`
|
|
323
326
|
- \`.lab/context/evidence-index.md\`
|
|
324
327
|
|
|
325
328
|
## 证据规则
|
|
@@ -332,16 +335,19 @@ const ZH_SKILL_FILES = {
|
|
|
332
335
|
- 必须用白话解释选定的主指标和次级指标:每个指标在衡量什么、越高还是越低更好、它是主结果指标还是健康度/支持性指标。
|
|
333
336
|
- 如果出现 coverage、completeness、confidence 或类似健康度指标,必须明确说明这类指标回答的是“实验是否跑稳、证据是否完整”,而不是主要科学效应本身。
|
|
334
337
|
- 要把最关键的背景来源、方法/基线来源和指标来源直接写进报告,不要把它们藏在 \`.lab/context/*\` 里。
|
|
338
|
+
- 把 \`report.md\` 当作给外部评审或合作者看的研究 memo;来源章节必须给出人类可读的 anchor references,不能拿本地路径或内部 provenance 充数。
|
|
335
339
|
- 如果 \`.lab/context/terminology-lock.md\` 里已经冻结了方法名和 contribution bullets,就必须把它们带进报告。
|
|
336
340
|
- 方法概述必须用协作者能读懂的话说明:我们的方法大致怎么做、相对 closest prior work 或 strongest baseline 改了什么、这些 prior 方法各自做了什么,以及它们为什么在当前 claim 下仍然不够。
|
|
337
341
|
- 只保留少量最关键的 prior work/baseline 锚点;每个锚点都要用一句话交代它做了什么和它的局限。
|
|
342
|
+
- 在“背景来源”“方法与基线来源”“指标来源”里,每个锚点都必须包含:引用、它做了什么或衡量什么、以及至少一个局限或 caveat。
|
|
343
|
+
- 内部 provenance 只能放到 \`<deliverables_root>/artifact-status.md\` 或 \`.lab/context/evidence-index.md\`,不能塞进来源章节。
|
|
338
344
|
- 在起草报告前,先检查 \`.lab/context/mission.md\` 和 \`.lab/context/eval-protocol.md\` 是否仍是模板空壳。
|
|
339
345
|
- 如果 canonical context 还是空壳,要先根据 frozen result artifacts、data-decisions、evidence-index 和已批准上下文回填“最小可信版本”,再写报告。
|
|
340
346
|
- 如果回填后仍缺少协作者可读所需的关键字段,就必须把输出降级成 \`artifact-anchored interim report\`,不能冒充最终协作者报告。
|
|
341
347
|
- 如果现有的 \`report.md\` 或 \`main-tables.md\` 缺少受管模板要求的协作者可读章节,也必须视为报告缺陷;rerun 需要补齐这些缺失块,不能直接宣称“正文无变化”或把这次 rerun 当成 no-op。
|
|
342
348
|
- 报告起草或 rerun 完成后,必须运行 \`.lab/.managed/scripts/validate_collaborator_report.py --report <deliverables_root>/report.md --main-tables <deliverables_root>/main-tables.md\`。如果校验失败,就继续补正文,不能停在“只新增审计痕迹”的状态。
|
|
343
349
|
- 如果报告依赖了对原始指标或原始实现的偏差,必须明确写出这个偏差。
|
|
344
|
-
- workflow 工件状态、rerun id 或 LaTeX
|
|
350
|
+
- workflow 工件状态、rerun id 或 LaTeX 骨架状态不能混进“已验证主结果”;这些内容必须单列到 \`<deliverables_root>/artifact-status.md\`。
|
|
345
351
|
- 如果 workflow language 是中文,\`report.md\` 和 \`<deliverables_root>/main-tables.md\` 也应使用中文,除非文件路径、代码标识符或字面指标名必须保持原样。
|
|
346
352
|
- 解释优先保守,不要写成营销文案。
|
|
347
353
|
- 要给 \`/lab:write\` 留下清晰 handoff,尤其是 section draft 可以直接引用的证据链接。
|
|
@@ -699,6 +705,10 @@ const ZH_SKILL_FILES = {
|
|
|
699
705
|
|
|
700
706
|
## Checklist
|
|
701
707
|
|
|
708
|
+
- 学术有效性检查是否已经填写,并且和实际实验设置保持一致?
|
|
709
|
+
- 完整性自检是否排除了不可见输入、不合理指标使用和把工作流状态当成科学证据的做法?
|
|
710
|
+
- 异常信号是否先被当成 diagnostic trigger,而不是被直接合理化成结果?
|
|
711
|
+
- 在升格当前解释前,是否已经记录更简单的替代解释和至少一种交叉验证?
|
|
702
712
|
- 是否把 claims 和 evidence 分开写清楚?
|
|
703
713
|
- baseline 是否公平且足够强?
|
|
704
714
|
- 数据集、切分和指标是否合理?
|
|
@@ -763,20 +773,36 @@ const ZH_SKILL_FILES = {
|
|
|
763
773
|
|
|
764
774
|
## 背景来源
|
|
765
775
|
|
|
766
|
-
-
|
|
767
|
-
-
|
|
776
|
+
- 参考 1:
|
|
777
|
+
- 引用:
|
|
778
|
+
- 做了什么:
|
|
779
|
+
- 为什么和当前问题相关:
|
|
780
|
+
- 对当前项目的局限:
|
|
768
781
|
|
|
769
782
|
## 方法与基线来源
|
|
770
783
|
|
|
771
|
-
-
|
|
772
|
-
-
|
|
773
|
-
-
|
|
784
|
+
- 参考 1:
|
|
785
|
+
- 引用:
|
|
786
|
+
- 做了什么:
|
|
787
|
+
- 为什么是这里的关键对照:
|
|
788
|
+
- 相对我们目标的局限:
|
|
774
789
|
|
|
775
790
|
## 指标来源
|
|
776
791
|
|
|
777
|
-
-
|
|
778
|
-
-
|
|
779
|
-
-
|
|
792
|
+
- 参考 1:
|
|
793
|
+
- 引用:
|
|
794
|
+
- 衡量什么:
|
|
795
|
+
- 为什么适合这里:
|
|
796
|
+
- 局限或注意事项:
|
|
797
|
+
|
|
798
|
+
## 异常与替代解释
|
|
799
|
+
|
|
800
|
+
- 观察到的异常信号:
|
|
801
|
+
- 做过的实现层检查:
|
|
802
|
+
- 已排除的更简单解释:
|
|
803
|
+
- 支撑当前解释的交叉验证:
|
|
804
|
+
- 当前最站得住的解释:
|
|
805
|
+
- 未来异常出现时的升级阈值:
|
|
780
806
|
|
|
781
807
|
## 怎么看主表
|
|
782
808
|
|
|
@@ -791,11 +817,6 @@ const ZH_SKILL_FILES = {
|
|
|
791
817
|
- 最终表现摘要:
|
|
792
818
|
- 主表覆盖情况:
|
|
793
819
|
|
|
794
|
-
## 工件状态
|
|
795
|
-
|
|
796
|
-
- 已就绪的交付物或工作流工件:
|
|
797
|
-
- 这些工件状态为什么不是科学结论:
|
|
798
|
-
|
|
799
820
|
## 主要结果
|
|
800
821
|
|
|
801
822
|
- 主要发现 1:
|
|
@@ -810,6 +831,36 @@ const ZH_SKILL_FILES = {
|
|
|
810
831
|
|
|
811
832
|
- Claim:
|
|
812
833
|
- 缺失支持:
|
|
834
|
+
`,
|
|
835
|
+
[path.join(".lab", ".managed", "templates", "artifact-status.md")]:
|
|
836
|
+
`# 工件状态
|
|
837
|
+
|
|
838
|
+
## 交付物状态
|
|
839
|
+
|
|
840
|
+
- 协作者报告路径:
|
|
841
|
+
- 受管主表路径:
|
|
842
|
+
- 当前报告模式:
|
|
843
|
+
- 为什么当前状态是合理的:
|
|
844
|
+
|
|
845
|
+
## 工作流审计
|
|
846
|
+
|
|
847
|
+
- 最近完成动作:
|
|
848
|
+
- 最新工件路径:
|
|
849
|
+
- 最新 run 或 report id:
|
|
850
|
+
- rerun 或校验备注:
|
|
851
|
+
|
|
852
|
+
## 内部溯源
|
|
853
|
+
|
|
854
|
+
- 使用的冻结结果工件:
|
|
855
|
+
- 已刷新 canonical context:
|
|
856
|
+
- Evidence index 锚点:
|
|
857
|
+
|
|
858
|
+
## 论文交接
|
|
859
|
+
|
|
860
|
+
- 已可进入 \`/lab:write\` 的 sections:
|
|
861
|
+
- 可引用的证据包:
|
|
862
|
+
- 仍需要更强支持的 claims:
|
|
863
|
+
- 仍未完成的 paper-finishing 项:
|
|
813
864
|
`,
|
|
814
865
|
[path.join(".lab", ".managed", "templates", "main-tables.md")]:
|
|
815
866
|
`# 主表工件
|
|
@@ -1033,6 +1084,33 @@ const ZH_SKILL_FILES = {
|
|
|
1033
1084
|
- 协作者可读状态:
|
|
1034
1085
|
`,
|
|
1035
1086
|
[path.join(".lab", "context", "state.md")]:
|
|
1087
|
+
`# 研究状态
|
|
1088
|
+
|
|
1089
|
+
## 已批准方向
|
|
1090
|
+
|
|
1091
|
+
- One-sentence problem:
|
|
1092
|
+
- Approved direction:
|
|
1093
|
+
- Strongest supported claim:
|
|
1094
|
+
|
|
1095
|
+
## 证据边界
|
|
1096
|
+
|
|
1097
|
+
- What the current evidence really supports:
|
|
1098
|
+
- What is still outside the boundary:
|
|
1099
|
+
- Biggest research risk:
|
|
1100
|
+
|
|
1101
|
+
## 当前研究主线
|
|
1102
|
+
|
|
1103
|
+
- Current research focus:
|
|
1104
|
+
- Primary metric:
|
|
1105
|
+
- Dataset or benchmark scope:
|
|
1106
|
+
|
|
1107
|
+
## 当前研究约束
|
|
1108
|
+
|
|
1109
|
+
- Hard constraints:
|
|
1110
|
+
- Claim boundary:
|
|
1111
|
+
- Conditions that require reopening the direction:
|
|
1112
|
+
`,
|
|
1113
|
+
[path.join(".lab", "context", "workflow-state.md")]:
|
|
1036
1114
|
`# 工作流状态
|
|
1037
1115
|
|
|
1038
1116
|
## 当前阶段
|
|
@@ -1121,7 +1199,7 @@ const ZH_SKILL_FILES = {
|
|
|
1121
1199
|
- Terminal goal target:
|
|
1122
1200
|
- Required terminal artifact:
|
|
1123
1201
|
- 如果 workflow language 是中文,摘要、清单条目、任务标签和进度更新都应使用中文。
|
|
1124
|
-
- 示例 Objective: 推进 paper layer 3
|
|
1202
|
+
- 示例 Objective: 推进 paper layer 3,完成一轮 bounded protocol、测试、最小实现和一轮小规模结果。
|
|
1125
1203
|
|
|
1126
1204
|
## 循环预算
|
|
1127
1205
|
|
|
@@ -1149,8 +1227,8 @@ const ZH_SKILL_FILES = {
|
|
|
1149
1227
|
|
|
1150
1228
|
- Run stage contract: write persistent outputs under \`results_root\`.
|
|
1151
1229
|
- Iterate stage contract: update persistent outputs under \`results_root\`.
|
|
1152
|
-
- Review stage contract: update canonical review context such as \`.lab/context/decisions.md\`、\`state.md\`、\`open-questions.md\` or \`evidence-index.md\`.
|
|
1153
|
-
- Report stage contract: write
|
|
1230
|
+
- Review stage contract: update canonical review context such as \`.lab/context/decisions.md\`、\`state.md\`、\`workflow-state.md\`、\`open-questions.md\` or \`evidence-index.md\`.
|
|
1231
|
+
- Report stage contract: write \`<deliverables_root>/report.md\`、\`<deliverables_root>/main-tables.md\` and \`<deliverables_root>/artifact-status.md\`.
|
|
1154
1232
|
- Write stage contract: write LaTeX output under \`<deliverables_root>/paper/\`.
|
|
1155
1233
|
|
|
1156
1234
|
## 升格策略
|
|
@@ -1184,6 +1262,21 @@ const ZH_SKILL_FILES = {
|
|
|
1184
1262
|
- 对比方法来源论文:
|
|
1185
1263
|
- 对比方法实现来源:
|
|
1186
1264
|
- 与原始实现的偏差:
|
|
1265
|
+
- 评测设定语义:
|
|
1266
|
+
- 可见性与泄漏风险:
|
|
1267
|
+
- 锚点与标签策略:
|
|
1268
|
+
- 尺度与可比性策略:
|
|
1269
|
+
- 指标有效性检查:
|
|
1270
|
+
- 对比有效性检查:
|
|
1271
|
+
- 统计有效性检查:
|
|
1272
|
+
- 结论边界:
|
|
1273
|
+
- 完整性自检:
|
|
1274
|
+
- 异常信号:
|
|
1275
|
+
- 实现层现实检查:
|
|
1276
|
+
- 已考虑的替代解释:
|
|
1277
|
+
- 交叉验证方法:
|
|
1278
|
+
- 当前最站得住的解释:
|
|
1279
|
+
- 升级阈值:
|
|
1187
1280
|
- 终止目标类型:
|
|
1188
1281
|
- 终止目标目标值:
|
|
1189
1282
|
- 必要终止工件:
|
|
@@ -1246,7 +1339,7 @@ ZH_CONTENT[path.join(".lab", "system", "core.md")] = `# Lab 系统核心
|
|
|
1246
1339
|
|
|
1247
1340
|
1. \`.lab/context/session-brief.md\`
|
|
1248
1341
|
2. \`.lab/context/mission.md\`
|
|
1249
|
-
3. \`.lab/context/state.md\`
|
|
1342
|
+
3. \`.lab/context/workflow-state.md\`
|
|
1250
1343
|
4. \`.lab/context/evidence-index.md\`
|
|
1251
1344
|
5. \`.lab/context/data-decisions.md\`(当问题涉及数据集、benchmark 或对比方法时)
|
|
1252
1345
|
6. \`.lab/context/auto-mode.md\` 和 \`.lab/context/auto-status.md\`(当任务涉及自动模式时)
|
|
@@ -1255,13 +1348,15 @@ ZH_CONTENT[path.join(".lab", "system", "core.md")] = `# Lab 系统核心
|
|
|
1255
1348
|
|
|
1256
1349
|
## 工作流边界
|
|
1257
1350
|
|
|
1258
|
-
- \`.lab/context/\`
|
|
1351
|
+
- \`.lab/context/\` 同时保存持久研究状态和轻量工作流状态。
|
|
1259
1352
|
- \`.lab/changes/\`、\`.lab/iterations/\`、\`.lab/writing/\` 保存工作流控制工件、轻量 manifest 和 change-local harness。
|
|
1260
1353
|
- \`.lab/.managed/\` 保存工具托管模板和脚本。
|
|
1261
1354
|
- 持久 run 输出应写到 \`results_root\`,不要写进 \`.lab/changes/\`。
|
|
1262
1355
|
- 图表和可视化应写到 \`figures_root\`,不要写进 \`.lab/changes/\`。
|
|
1263
1356
|
- 最终交付物应写到 \`deliverables_root\`,不要写进 \`.lab/context/\`。
|
|
1264
1357
|
- change-local 的 \`data/\` 目录只应用来放轻量 manifest 或 batch spec,不要当正式数据集入口。
|
|
1358
|
+
- \`.lab/context/state.md\` 保存持久研究状态;\`.lab/context/workflow-state.md\` 保存当前工作流状态。
|
|
1359
|
+
- \`.lab/context/summary.md\` 是长期项目摘要;\`.lab/context/session-brief.md\` 是下一次会话启动简报。
|
|
1265
1360
|
- \`.lab/context/auto-mode.md\` 定义自动模式边界,\`.lab/context/auto-status.md\` 记录自动运行状态,二者都属于项目状态。
|
|
1266
1361
|
- 如果用户提供了 LaTeX 模板目录,先校验并通过 \`paper_template_root\` 接入,再开始写作。
|
|
1267
1362
|
- 已接入的模板目录视为用户资产,默认不要改模板文件,除非用户明确要求。
|
|
@@ -1353,7 +1448,7 @@ ZH_CONTENT[path.join(".lab", "context", "session-brief.md")] = `# 会话简报
|
|
|
1353
1448
|
## 先读这些文件
|
|
1354
1449
|
|
|
1355
1450
|
1. \`.lab/context/mission.md\`
|
|
1356
|
-
2. \`.lab/context/state.md\`
|
|
1451
|
+
2. \`.lab/context/workflow-state.md\`
|
|
1357
1452
|
3. \`.lab/context/evidence-index.md\`
|
|
1358
1453
|
|
|
1359
1454
|
## 不要静默修改
|
|
@@ -1578,7 +1673,7 @@ ZH_CONTENT[path.join(".lab", ".managed", "templates", "framing.md")] = `# 论文
|
|
|
1578
1673
|
ZH_CONTENT[path.join(".codex", "prompts", "lab.md")] = codexPrompt(
|
|
1579
1674
|
"查看 /lab 研究工作流总览并选择合适阶段",
|
|
1580
1675
|
"workflow question 或 stage choice",
|
|
1581
|
-
"# `/lab` for Codex\n\n`/lab` 是严格的研究工作流命令族。每次都使用同一套仓库工件和阶段边界。\n\n## 子命令\n\n- `/lab:idea`\n 调研 idea,定义问题与 failure case,归类 contribution 与 breakthrough level,对比现有方法,收束三个一眼就有意义的点,并在实现前保留 approval gate。\n\n- `/lab:data`\n 把已批准的 idea 转成数据集与 benchmark 方案,记录数据集年份、使用过该数据集的论文、下载来源、许可或访问限制,以及 classic-public、recent-strong-public、claim-specific 三类 benchmark 的纳入理由,和 canonical baselines、strong historical baselines、recent strong public methods、closest prior work 四类对比方法的纳入理由。\n\n- `/lab:auto`\n 在不改变 mission、framing 和核心 claims 的前提下,读取 eval-protocol 与 auto-mode 契约并自动编排 `run`、`iterate`、`review`、`report`,必要时扩展数据集、benchmark 和 comparison methods,并在满足升格策略时自动升级 primary package。启动前必须选定 autonomy level、声明 terminal goal,并显式批准契约。\n\n- `/lab:framing`\n 通过审计当前领域与相邻领域的术语,锁定 paper-facing 的方法名、模块名、论文题目和 contribution bullets,并在 section 起草前保留 approval gate。\n\n- `/lab:spec`\n 把已批准的 idea 转成 `.lab/changes/<change-id>/` 下的一个 lab change 目录,并在其中写出 `proposal`、`design`、`spec`、`tasks`。\n\n- `/lab:run`\n 执行最小有意义验证运行,登记 run,并生成第一版标准化评估摘要。\n\n- `/lab:iterate`\n 在冻结 mission、阈值、verification commands 与 `completion_promise` 的前提下执行有边界的实验迭代。\n\n- `/lab:review`\n 以 reviewer mode 审查文档或结果,先给短摘要,再输出 findings、fatal flaws、fix priority 和 residual risks。\n\n- `/lab:report`\n 从 runs 和 iterations 工件生成最终研究报告。\n\n- `/lab:write`\n 使用已安装 `lab` skill 下 vendored 的 paper-writing references,把稳定 report 工件转成论文 section。\n\n## 调度规则\n\n- 始终使用 `skills/lab/SKILL.md` 作为工作流合同。\n- 用户显式调用 `/lab:<stage>` 时,要立刻执行该 stage,而不是只推荐别的 `/lab` stage。\n- 先给简洁摘要,再决定是否写工件,最后回报输出路径和下一步。\n- 如果歧义会影响结论,一次只问一个问题;如果有多条可行路径,先给 2-3 个方案再收敛。\n- `/lab:spec` 前应已有经批准的数据集与 benchmark 方案。\n- `/lab:run`、`/lab:iterate`、`/lab:auto`、`/lab:report` 都应遵循 `.lab/context/eval-protocol.md`。\n- `.lab/context/eval-protocol.md` 不只定义主指标和主表,也应定义指标释义、实验阶梯,以及指标和对比实现的来源。\n- `/lab:auto` 只编排已批准边界内的执行阶段,不替代手动的 idea/data/framing/spec 决策。\n- `/lab:write` 前必须已有经批准的 `/lab:framing` 工件。\n\n## 如何输入 `/lab:auto`\n\n## `/lab:auto` 层级指南\n\n- `L1`:适合安全验证、一轮 bounded 真实运行,或简单 report 刷新。\n- 
`L2`:默认推荐级别,适合冻结核心边界内的常规实验迭代。\n- `L3`:激进 campaign 级别,只在你明确想做更大范围探索和可选写作时使用。\n- 如果不确定,默认推荐 `L2`。\n- 如果用户输入没写级别,或者把级别和 `paper layer`、`phase`、`table` 混用了,就应先停下来,要求用户明确选 `L1/L2/L3`。\n\n- 把 `Autonomy level L1/L2/L3` 视为执行权限级别,不要和论文里的 layer、phase、table 编号混用。\n- 把 `paper layer`、`phase`、`table` 视为实验目标。例如 `paper layer 3` 或 `Phase 1
|
|
1676
|
+
"# `/lab` for Codex\n\n`/lab` 是严格的研究工作流命令族。每次都使用同一套仓库工件和阶段边界。\n\n## 子命令\n\n- `/lab:idea`\n 调研 idea,定义问题与 failure case,归类 contribution 与 breakthrough level,对比现有方法,收束三个一眼就有意义的点,并在实现前保留 approval gate。\n\n- `/lab:data`\n 把已批准的 idea 转成数据集与 benchmark 方案,记录数据集年份、使用过该数据集的论文、下载来源、许可或访问限制,以及 classic-public、recent-strong-public、claim-specific 三类 benchmark 的纳入理由,和 canonical baselines、strong historical baselines、recent strong public methods、closest prior work 四类对比方法的纳入理由。\n\n- `/lab:auto`\n 在不改变 mission、framing 和核心 claims 的前提下,读取 eval-protocol 与 auto-mode 契约并自动编排 `run`、`iterate`、`review`、`report`,必要时扩展数据集、benchmark 和 comparison methods,并在满足升格策略时自动升级 primary package。启动前必须选定 autonomy level、声明 terminal goal,并显式批准契约。\n\n- `/lab:framing`\n 通过审计当前领域与相邻领域的术语,锁定 paper-facing 的方法名、模块名、论文题目和 contribution bullets,并在 section 起草前保留 approval gate。\n\n- `/lab:spec`\n 把已批准的 idea 转成 `.lab/changes/<change-id>/` 下的一个 lab change 目录,并在其中写出 `proposal`、`design`、`spec`、`tasks`。\n\n- `/lab:run`\n 执行最小有意义验证运行,登记 run,并生成第一版标准化评估摘要。\n\n- `/lab:iterate`\n 在冻结 mission、阈值、verification commands 与 `completion_promise` 的前提下执行有边界的实验迭代。\n\n- `/lab:review`\n 以 reviewer mode 审查文档或结果,先给短摘要,再输出 findings、fatal flaws、fix priority 和 residual risks。\n\n- `/lab:report`\n 从 runs 和 iterations 工件生成最终研究报告。\n\n- `/lab:write`\n 使用已安装 `lab` skill 下 vendored 的 paper-writing references,把稳定 report 工件转成论文 section。\n\n## 调度规则\n\n- 始终使用 `skills/lab/SKILL.md` 作为工作流合同。\n- 用户显式调用 `/lab:<stage>` 时,要立刻执行该 stage,而不是只推荐别的 `/lab` stage。\n- 先给简洁摘要,再决定是否写工件,最后回报输出路径和下一步。\n- 如果歧义会影响结论,一次只问一个问题;如果有多条可行路径,先给 2-3 个方案再收敛。\n- `/lab:spec` 前应已有经批准的数据集与 benchmark 方案。\n- `/lab:run`、`/lab:iterate`、`/lab:auto`、`/lab:report` 都应遵循 `.lab/context/eval-protocol.md`。\n- `.lab/context/eval-protocol.md` 不只定义主指标和主表,也应定义指标释义、实验阶梯,以及指标和对比实现的来源。\n- `/lab:auto` 只编排已批准边界内的执行阶段,不替代手动的 idea/data/framing/spec 决策。\n- `/lab:write` 前必须已有经批准的 `/lab:framing` 工件。\n\n## 如何输入 `/lab:auto`\n\n## `/lab:auto` 层级指南\n\n- `L1`:适合安全验证、一轮 bounded 真实运行,或简单 report 刷新。\n- 
`L2`:默认推荐级别,适合冻结核心边界内的常规实验迭代。\n- `L3`:激进 campaign 级别,只在你明确想做更大范围探索和可选写作时使用。\n- 如果不确定,默认推荐 `L2`。\n- 如果用户输入没写级别,或者把级别和 `paper layer`、`phase`、`table` 混用了,就应先停下来,要求用户明确选 `L1/L2/L3`。\n\n- 把 `Autonomy level L1/L2/L3` 视为执行权限级别,不要和论文里的 layer、phase、table 编号混用。\n- 把 `paper layer`、`phase`、`table` 视为实验目标。例如 `paper layer 3` 或 `Phase 1` 不是 `Autonomy level L3`。\n- 一条好的 `/lab:auto` 输入应至少说清:objective、自治级别、terminal goal、scope、allowed modifications。\n- 如果 workflow language 是中文,摘要、清单条目、任务标签和进度更新都应使用中文,除非文件路径、代码标识符或字面指标名必须保持原样。\n- 示例:`/lab:auto 自治级别 L2。目标:推进 paper layer 3。终止条件:完成 bounded protocol、测试、最小实现和一轮小规模结果。允许修改:配置、数据接入、评估脚本。`\n"
|
|
1582
1677
|
);
|
|
1583
1678
|
|
|
1584
1679
|
ZH_CONTENT[path.join(".codex", "prompts", "lab-data.md")] = codexPrompt(
|
|
@@ -1597,7 +1692,7 @@ ZH_CONTENT[path.join(".claude", "commands", "lab.md")] = claudeCommand(
|
|
|
1597
1692
|
"lab",
|
|
1598
1693
|
"查看 /lab 研究工作流总览并选择合适阶段",
|
|
1599
1694
|
"[stage] [target]",
|
|
1600
|
-
"# `/lab` for Claude\n\n`/lab` 是 Claude Code 里的 lab 工作流分发入口。调用方式有两种:\n\n- `/lab <stage> ...`\n- `/lab-idea`、`/lab-data`、`/lab-auto`、`/lab-framing`、`/lab-spec`、`/lab-run`、`/lab-iterate`、`/lab-review`、`/lab-report`、`/lab-write`\n\n## 阶段别名\n\n- `/lab idea ...` 或 `/lab-idea`\n- `/lab data ...` 或 `/lab-data`\n- `/lab auto ...` 或 `/lab-auto`\n- `/lab framing ...` 或 `/lab-framing`\n- `/lab spec ...` 或 `/lab-spec`\n- `/lab run ...` 或 `/lab-run`\n- `/lab iterate ...` 或 `/lab-iterate`\n- `/lab review ...` 或 `/lab-review`\n- `/lab report ...` 或 `/lab-report`\n- `/lab write ...` 或 `/lab-write`\n\n## 调度规则\n\n- 始终使用 `skills/lab/SKILL.md` 作为工作流合同。\n- 用户显式调用 `/lab <stage> ...` 或 `/lab-<stage>` 时,要立刻执行该 stage,而不是只推荐别的阶段。\n- 先给简洁摘要,再决定是否写工件,最后回报输出路径和下一步。\n- 如果歧义会影响结论,一次只问一个问题;如果有多条可行路径,先给 2-3 个方案再收敛。\n- `spec` 前应已有经批准的数据集与 benchmark 方案。\n- `run`、`iterate`、`auto`、`report` 都应遵循 `.lab/context/eval-protocol.md`。\n- `auto` 只编排已批准边界内的执行阶段,不替代手动的 idea/data/framing/spec 决策。\n- `write` 前必须已有经批准的 `framing` 工件。\n\n## 如何输入 `/lab auto`\n\n## `/lab auto` 层级指南\n\n- `L1`:适合安全验证、一轮 bounded 真实运行,或简单 report 刷新。\n- `L2`:默认推荐级别,适合冻结核心边界内的常规实验迭代。\n- `L3`:激进 campaign 级别,只在你明确想做更大范围探索和可选写作时使用。\n- 如果不确定,默认推荐 `L2`。\n- 如果用户输入没写级别,或者把级别和 `paper layer`、`phase`、`table` 混用了,就应先停下来,要求用户明确选 `L1/L2/L3`。\n\n- 把 `Autonomy level L1/L2/L3` 视为执行权限级别,不要和论文里的 layer、phase、table 编号混用。\n- 把 `paper layer`、`phase`、`table` 视为实验目标。例如 `paper layer 3` 或 `Phase 1
|
|
1695
|
+
"# `/lab` for Claude\n\n`/lab` 是 Claude Code 里的 lab 工作流分发入口。调用方式有两种:\n\n- `/lab <stage> ...`\n- `/lab-idea`、`/lab-data`、`/lab-auto`、`/lab-framing`、`/lab-spec`、`/lab-run`、`/lab-iterate`、`/lab-review`、`/lab-report`、`/lab-write`\n\n## 阶段别名\n\n- `/lab idea ...` 或 `/lab-idea`\n- `/lab data ...` 或 `/lab-data`\n- `/lab auto ...` 或 `/lab-auto`\n- `/lab framing ...` 或 `/lab-framing`\n- `/lab spec ...` 或 `/lab-spec`\n- `/lab run ...` 或 `/lab-run`\n- `/lab iterate ...` 或 `/lab-iterate`\n- `/lab review ...` 或 `/lab-review`\n- `/lab report ...` 或 `/lab-report`\n- `/lab write ...` 或 `/lab-write`\n\n## 调度规则\n\n- 始终使用 `skills/lab/SKILL.md` 作为工作流合同。\n- 用户显式调用 `/lab <stage> ...` 或 `/lab-<stage>` 时,要立刻执行该 stage,而不是只推荐别的阶段。\n- 先给简洁摘要,再决定是否写工件,最后回报输出路径和下一步。\n- 如果歧义会影响结论,一次只问一个问题;如果有多条可行路径,先给 2-3 个方案再收敛。\n- `spec` 前应已有经批准的数据集与 benchmark 方案。\n- `run`、`iterate`、`auto`、`report` 都应遵循 `.lab/context/eval-protocol.md`。\n- `auto` 只编排已批准边界内的执行阶段,不替代手动的 idea/data/framing/spec 决策。\n- `write` 前必须已有经批准的 `framing` 工件。\n\n## 如何输入 `/lab auto`\n\n## `/lab auto` 层级指南\n\n- `L1`:适合安全验证、一轮 bounded 真实运行,或简单 report 刷新。\n- `L2`:默认推荐级别,适合冻结核心边界内的常规实验迭代。\n- `L3`:激进 campaign 级别,只在你明确想做更大范围探索和可选写作时使用。\n- 如果不确定,默认推荐 `L2`。\n- 如果用户输入没写级别,或者把级别和 `paper layer`、`phase`、`table` 混用了,就应先停下来,要求用户明确选 `L1/L2/L3`。\n\n- 把 `Autonomy level L1/L2/L3` 视为执行权限级别,不要和论文里的 layer、phase、table 编号混用。\n- 把 `paper layer`、`phase`、`table` 视为实验目标。例如 `paper layer 3` 或 `Phase 1` 不是 `Autonomy level L3`。\n- 一条好的 `/lab auto` 输入应至少说清:objective、自治级别、terminal goal、scope、allowed modifications。\n- 如果 workflow language 是中文,摘要、清单条目、任务标签和进度更新都应使用中文,除非文件路径、代码标识符或字面指标名必须保持原样。\n- 示例:`/lab auto 自治级别 L2。目标:推进 paper layer 3。终止条件:完成 bounded protocol、测试、最小实现和一轮小规模结果。允许修改:配置、数据接入、评估脚本。`\n"
|
|
1601
1696
|
);
|
|
1602
1697
|
|
|
1603
1698
|
ZH_CONTENT[path.join(".claude", "commands", "lab-data.md")] = claudeCommand(
|
|
@@ -2040,6 +2135,27 @@ ZH_CONTENT[path.join(".lab", "context", "eval-protocol.md")] = `# 评估协议
|
|
|
2040
2135
|
- 对比方法实现来源:
|
|
2041
2136
|
- 与原始实现的偏差:
|
|
2042
2137
|
|
|
2138
|
+
## 学术有效性检查
|
|
2139
|
+
|
|
2140
|
+
- 评测设定语义:
|
|
2141
|
+
- 可见性与泄漏风险:
|
|
2142
|
+
- 锚点与标签策略:
|
|
2143
|
+
- 尺度与可比性策略:
|
|
2144
|
+
- 指标有效性检查:
|
|
2145
|
+
- 对比有效性检查:
|
|
2146
|
+
- 统计有效性检查:
|
|
2147
|
+
- 结论边界:
|
|
2148
|
+
- 完整性自检:
|
|
2149
|
+
|
|
2150
|
+
## 异常与替代解释检查
|
|
2151
|
+
|
|
2152
|
+
- 异常信号:
|
|
2153
|
+
- 实现层现实检查:
|
|
2154
|
+
- 已考虑的替代解释:
|
|
2155
|
+
- 交叉验证方法:
|
|
2156
|
+
- 当前最站得住的解释:
|
|
2157
|
+
- 升级阈值:
|
|
2158
|
+
|
|
2043
2159
|
## Gate Ladder
|
|
2044
2160
|
|
|
2045
2161
|
- 实验阶梯:
|
package/lib/install.cjs
CHANGED
|
@@ -36,6 +36,7 @@ const PROJECT_OWNED_LOCALIZED_PATHS = [
|
|
|
36
36
|
path.join(".lab", "config", "workflow.json"),
|
|
37
37
|
path.join(".lab", "context", "mission.md"),
|
|
38
38
|
path.join(".lab", "context", "state.md"),
|
|
39
|
+
path.join(".lab", "context", "workflow-state.md"),
|
|
39
40
|
path.join(".lab", "context", "decisions.md"),
|
|
40
41
|
path.join(".lab", "context", "evidence-index.md"),
|
|
41
42
|
path.join(".lab", "context", "open-questions.md"),
|
|
@@ -542,6 +543,7 @@ function localizeInstalledAssets(targetDir, lang, { newlyCreatedProjectOwnedPath
|
|
|
542
543
|
path.join(".lab", ".managed", "templates", "review-checklist.md"),
|
|
543
544
|
path.join(".lab", ".managed", "templates", "final-report.md"),
|
|
544
545
|
path.join(".lab", ".managed", "templates", "main-tables.md"),
|
|
546
|
+
path.join(".lab", ".managed", "templates", "artifact-status.md"),
|
|
545
547
|
path.join(".lab", ".managed", "templates", "paper-plan.md"),
|
|
546
548
|
path.join(".lab", ".managed", "templates", "paper-section.md"),
|
|
547
549
|
path.join(".lab", ".managed", "templates", "write-iteration.md"),
|
|
@@ -71,7 +71,7 @@ Use the same repository artifacts and stage boundaries every time.
|
|
|
71
71
|
- If the request omits the level or mixes it with a paper layer, phase, or table target, `/lab auto` should stop and ask for an explicit autonomy level before arming the loop.
|
|
72
72
|
|
|
73
73
|
- Treat `Autonomy level L1/L2/L3` as the execution privilege level, not as a paper layer, phase, or table number.
|
|
74
|
-
- Treat `paper layer`, `phase`, and `table` as experiment targets. For example, `paper layer 3` or `Phase 1
|
|
74
|
+
- Treat `paper layer`, `phase`, and `table` as experiment targets. For example, `paper layer 3` or `Phase 1` should not be interpreted as `Autonomy level L3`.
|
|
75
75
|
- A good `/lab auto` request should name:
|
|
76
76
|
- the objective
|
|
77
77
|
- the autonomy level
|
|
@@ -80,4 +80,4 @@ Use the same repository artifacts and stage boundaries every time.
|
|
|
80
80
|
- the allowed modifications
|
|
81
81
|
- If the repository workflow language is Chinese, summaries, checklist items, task labels, and progress updates should be written in Chinese unless a code identifier or file path must stay literal.
|
|
82
82
|
- Good example:
|
|
83
|
-
- `/lab auto Autonomy level L2. Objective: advance paper layer 3
|
|
83
|
+
- `/lab auto Autonomy level L2. Objective: advance paper layer 3 through one bounded protocol improvement. Terminal goal: task-completion. Scope: bounded protocol, tests, one minimal implementation, and one small run. Allowed modifications: configuration, evaluation script, and data-loading logic only.`
|
|
@@ -65,7 +65,7 @@ argument-hint: workflow question or stage choice
|
|
|
65
65
|
- If the request omits the level or mixes it with a paper layer, phase, or table target, `/lab:auto` should stop and ask for an explicit autonomy level before arming the loop.
|
|
66
66
|
|
|
67
67
|
- Treat `Autonomy level L1/L2/L3` as the execution privilege level, not as a paper layer, phase, or table number.
|
|
68
|
-
- Treat `paper layer`, `phase`, and `table` as experiment targets. For example, `paper layer 3` or `Phase 1
|
|
68
|
+
- Treat `paper layer`, `phase`, and `table` as experiment targets. For example, `paper layer 3` or `Phase 1` should not be interpreted as `Autonomy level L3`.
|
|
69
69
|
- A good `/lab:auto` request should name:
|
|
70
70
|
- the objective
|
|
71
71
|
- the autonomy level
|
|
@@ -74,4 +74,4 @@ argument-hint: workflow question or stage choice
|
|
|
74
74
|
- the allowed modifications
|
|
75
75
|
- If the repository workflow language is Chinese, summaries, checklist items, task labels, and progress updates should be written in Chinese unless a code identifier or file path must stay literal.
|
|
76
76
|
- Good example:
|
|
77
|
-
- `/lab:auto Autonomy level L2. Objective: advance paper layer 3
|
|
77
|
+
- `/lab:auto Autonomy level L2. Objective: advance paper layer 3 through one bounded protocol improvement. Terminal goal: task-completion. Scope: bounded protocol, tests, one minimal implementation, and one small run. Allowed modifications: configuration, evaluation script, and data-loading logic only.`
|
|
@@ -20,6 +20,10 @@ REPORT_REQUIRED_SECTIONS = {
|
|
|
20
20
|
r"^##\s+方法与基线来源\s*$",
|
|
21
21
|
],
|
|
22
22
|
"Metric Sources": [r"^##\s+Metric Sources\s*$", r"^##\s+指标来源\s*$"],
|
|
23
|
+
"Sanity and Alternative Explanations": [
|
|
24
|
+
r"^##\s+Sanity and Alternative Explanations\s*$",
|
|
25
|
+
r"^##\s+异常与替代解释\s*$",
|
|
26
|
+
],
|
|
23
27
|
}
|
|
24
28
|
|
|
25
29
|
MAIN_TABLES_REQUIRED_SECTIONS = {
|
|
@@ -30,6 +34,24 @@ MAIN_TABLES_REQUIRED_SECTIONS = {
|
|
|
30
34
|
"How to Read These Tables": [r"^##\s+How to Read These Tables\s*$", r"^##\s+怎么读这些表\s*$"],
|
|
31
35
|
}
|
|
32
36
|
|
|
37
|
+
# Keys of REPORT_REQUIRED_SECTIONS whose section bodies are checked by
# validate_source_sections().
SOURCE_SECTION_NAMES = (
    "Background Sources",
    "Method and Baseline Sources",
    "Metric Sources",
)
# Substrings searched for inside a source-section body; any hit is reported as
# the section relying on local file paths or internal provenance.
SOURCE_SECTION_PATH_MARKERS = (
    "/Users/",
    "/home/",
    "/tmp/",
    "/private/tmp/",
    ".lab/",
    "outputs/",
    "docs/research/",
)
# Field labels that count as a citation anchor. The Chinese label is listed
# with both an ASCII colon and a full-width colon (U+FF1A): the localized
# report template writes its field labels with the full-width form, so a
# report filled in from that template must match too.
SOURCE_SECTION_CITATION_MARKERS = ("Citation:", "引用:", "引用:")
# Field labels that count as "what the anchor does / measures". Chinese labels
# again appear with both colon forms for the same reason as above.
SOURCE_SECTION_ROLE_MARKERS = (
    "What it established:",
    "What it does:",
    "What it measures:",
    "做了什么:",
    "衡量什么:",
    "做了什么:",
    "衡量什么:",
)
# Matched as plain substrings (no colon), so any label containing the word
# satisfies the limitation check.
SOURCE_SECTION_LIMITATION_MARKERS = ("Limitation", "局限")
|
|
54
|
+
|
|
33
55
|
|
|
34
56
|
def parse_args():
|
|
35
57
|
parser = argparse.ArgumentParser(
|
|
@@ -48,6 +70,35 @@ def missing_sections(text: str, required_sections: dict[str, list[str]]) -> list
|
|
|
48
70
|
return missing
|
|
49
71
|
|
|
50
72
|
|
|
73
|
+
def extract_section_body(text: str, patterns: list[str]) -> str:
|
|
74
|
+
for pattern in patterns:
|
|
75
|
+
match = re.search(pattern, text, flags=re.MULTILINE)
|
|
76
|
+
if not match:
|
|
77
|
+
continue
|
|
78
|
+
start = match.end()
|
|
79
|
+
next_heading = re.search(r"^##\s+", text[start:], flags=re.MULTILINE)
|
|
80
|
+
end = start + next_heading.start() if next_heading else len(text)
|
|
81
|
+
return text[start:end].strip()
|
|
82
|
+
return ""
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def validate_source_sections(text: str, label: str) -> list[str]:
|
|
86
|
+
issues = []
|
|
87
|
+
for section_name in SOURCE_SECTION_NAMES:
|
|
88
|
+
body = extract_section_body(text, REPORT_REQUIRED_SECTIONS[section_name])
|
|
89
|
+
if not body:
|
|
90
|
+
continue
|
|
91
|
+
if any(marker in body for marker in SOURCE_SECTION_PATH_MARKERS):
|
|
92
|
+
issues.append(f"{label} section '{section_name}' must not rely on local file paths or internal provenance")
|
|
93
|
+
if not any(marker in body for marker in SOURCE_SECTION_CITATION_MARKERS):
|
|
94
|
+
issues.append(f"{label} section '{section_name}' must include at least one citation anchor")
|
|
95
|
+
has_role = any(marker in body for marker in SOURCE_SECTION_ROLE_MARKERS)
|
|
96
|
+
has_limitation = any(marker in body for marker in SOURCE_SECTION_LIMITATION_MARKERS)
|
|
97
|
+
if not has_role or not has_limitation:
|
|
98
|
+
issues.append(f"{label} section '{section_name}' must explain what the anchor does and one limitation")
|
|
99
|
+
return issues
|
|
100
|
+
|
|
101
|
+
|
|
51
102
|
def validate(path_str: str, required_sections: dict[str, list[str]], label: str) -> list[str]:
|
|
52
103
|
path = Path(path_str)
|
|
53
104
|
if not path.exists():
|
|
@@ -56,6 +107,8 @@ def validate(path_str: str, required_sections: dict[str, list[str]], label: str)
|
|
|
56
107
|
missing = missing_sections(text, required_sections)
|
|
57
108
|
if missing:
|
|
58
109
|
return [f"{label} is missing required sections: {', '.join(missing)}"]
|
|
110
|
+
if label == "report.md":
|
|
111
|
+
return validate_source_sections(text, label)
|
|
59
112
|
return []
|
|
60
113
|
|
|
61
114
|
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# Artifact Status
|
|
2
|
+
|
|
3
|
+
## Deliverable Status
|
|
4
|
+
|
|
5
|
+
- Collaborator-facing report path:
|
|
6
|
+
- Managed main tables path:
|
|
7
|
+
- Current report mode:
|
|
8
|
+
- Why this status is appropriate:
|
|
9
|
+
|
|
10
|
+
## Workflow Audit
|
|
11
|
+
|
|
12
|
+
- Latest completed action:
|
|
13
|
+
- Latest artifact path:
|
|
14
|
+
- Latest run or report id:
|
|
15
|
+
- Rerun or validation notes:
|
|
16
|
+
|
|
17
|
+
## Internal Provenance
|
|
18
|
+
|
|
19
|
+
- Frozen result artifacts used:
|
|
20
|
+
- Canonical context files refreshed:
|
|
21
|
+
- Evidence index anchors:
|
|
22
|
+
|
|
23
|
+
## Paper Handoff
|
|
24
|
+
|
|
25
|
+
- Sections ready for `/lab:write`:
|
|
26
|
+
- Evidence bundles to cite:
|
|
27
|
+
- Claims that still need stronger support:
|
|
28
|
+
- Paper-finishing items still open:
|
|
@@ -54,20 +54,36 @@
|
|
|
54
54
|
|
|
55
55
|
## Background Sources
|
|
56
56
|
|
|
57
|
-
-
|
|
58
|
-
-
|
|
57
|
+
- Anchor reference 1:
|
|
58
|
+
- Citation:
|
|
59
|
+
- What it established:
|
|
60
|
+
- Why it matters here:
|
|
61
|
+
- Limitation for the current project:
|
|
59
62
|
|
|
60
63
|
## Method and Baseline Sources
|
|
61
64
|
|
|
62
|
-
-
|
|
63
|
-
-
|
|
64
|
-
-
|
|
65
|
+
- Anchor reference 1:
|
|
66
|
+
- Citation:
|
|
67
|
+
- What it does:
|
|
68
|
+
- Why it is the right anchor here:
|
|
69
|
+
- Limitation relative to our goal:
|
|
65
70
|
|
|
66
71
|
## Metric Sources
|
|
67
72
|
|
|
68
|
-
-
|
|
69
|
-
-
|
|
70
|
-
-
|
|
73
|
+
- Anchor reference 1:
|
|
74
|
+
- Citation:
|
|
75
|
+
- What it measures:
|
|
76
|
+
- Why it is appropriate here:
|
|
77
|
+
- Limitation or caveat:
|
|
78
|
+
|
|
79
|
+
## Sanity and Alternative Explanations
|
|
80
|
+
|
|
81
|
+
- Anomaly signals observed:
|
|
82
|
+
- Implementation checks performed:
|
|
83
|
+
- Alternative explanations ruled out:
|
|
84
|
+
- Cross-checks that strengthen the current interpretation:
|
|
85
|
+
- Best-supported interpretation:
|
|
86
|
+
- Escalation threshold if future anomalies appear:
|
|
71
87
|
|
|
72
88
|
## Experiment Setup
|
|
73
89
|
|
|
@@ -89,11 +105,6 @@
|
|
|
89
105
|
- Final performance summary:
|
|
90
106
|
- Table coverage:
|
|
91
107
|
|
|
92
|
-
## Artifact Status
|
|
93
|
-
|
|
94
|
-
- Deliverables or workflow artifacts that are ready:
|
|
95
|
-
- Artifact status notes that are not scientific findings:
|
|
96
|
-
|
|
97
108
|
## Main Results
|
|
98
109
|
|
|
99
110
|
Summarize validated iteration outcomes.
|
|
@@ -113,9 +124,3 @@ Describe unresolved risks and external validity limits.
|
|
|
113
124
|
## Next Steps
|
|
114
125
|
|
|
115
126
|
List concrete follow-up actions.
|
|
116
|
-
|
|
117
|
-
## Paper Handoff
|
|
118
|
-
|
|
119
|
-
- Sections ready for `/lab:write`:
|
|
120
|
-
- Evidence bundles to cite:
|
|
121
|
-
- Claims that still need stronger support:
|