superlab 0.1.11 → 0.1.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +19 -2
- package/README.zh-CN.md +19 -2
- package/bin/superlab.cjs +43 -1
- package/lib/auto.cjs +14 -771
- package/lib/auto_common.cjs +129 -0
- package/lib/auto_contracts.cjs +387 -0
- package/lib/auto_runner.cjs +830 -0
- package/lib/auto_state.cjs +227 -0
- package/lib/context.cjs +94 -0
- package/lib/eval_protocol.cjs +236 -0
- package/lib/i18n.cjs +140 -11
- package/lib/install.cjs +26 -6
- package/package-assets/claude/commands/lab/auto.md +1 -1
- package/package-assets/claude/commands/lab.md +2 -1
- package/package-assets/codex/prompts/lab-auto.md +1 -1
- package/package-assets/codex/prompts/lab.md +2 -1
- package/package-assets/shared/lab/context/auto-mode.md +16 -0
- package/package-assets/shared/lab/context/auto-outcome.md +28 -0
- package/package-assets/shared/lab/context/auto-status.md +3 -0
- package/package-assets/shared/lab/context/eval-protocol.md +46 -0
- package/package-assets/shared/skills/lab/SKILL.md +12 -1
- package/package-assets/shared/skills/lab/stages/auto.md +37 -7
- package/package-assets/shared/skills/lab/stages/iterate.md +4 -0
- package/package-assets/shared/skills/lab/stages/report.md +4 -0
- package/package-assets/shared/skills/lab/stages/run.md +4 -1
- package/package.json +1 -1
package/lib/i18n.cjs
CHANGED
|
@@ -129,6 +129,7 @@ const ZH_SKILL_FILES = {
 - \`.lab/context/mission.md\`
 - \`.lab/context/state.md\`
 - \`.lab/context/data-decisions.md\`
+- \`.lab/context/eval-protocol.md\`
 - \`.lab/config/workflow.json\`
 
 ## 上下文写回
@@ -140,6 +141,7 @@ const ZH_SKILL_FILES = {
 
 - 优先选择能打通全链路的最小实验。
 - 数据、环境或 metric 接线有问题时要尽快失败。
+- run 目标必须对齐已批准的评估协议,而不是只跟随聊天里的临时目标。
 - 记录精确启动命令和输出位置。
 - 持久 run 输出、日志和 checkpoint 写到 \`results_root\`。
 - 图表和可视化写到 \`figures_root\`。
@@ -151,7 +153,11 @@ const ZH_SKILL_FILES = {
 2. 登记 run
 3. 执行最小有意义实验
 4. 标准化原始指标
-5.
+5. 按当前评估协议校验标准化摘要
+
+## 约束
+
+- 不要凭记忆现想指标定义、baseline 行为或对比方法实现;它们必须锚定到已批准评估协议里记录的来源。
 
 ## 交互约束
 
@@ -171,6 +177,7 @@ const ZH_SKILL_FILES = {
 - baseline
 - 主指标
 - 成功阈值
+- evaluation ladder 与 benchmark 扩量 gate
 - verification commands
 - completion_promise
 - 最大迭代轮次
@@ -182,6 +189,7 @@ const ZH_SKILL_FILES = {
 - \`.lab/context/decisions.md\`
 - \`.lab/context/evidence-index.md\`
 - \`.lab/context/data-decisions.md\`
+- \`.lab/context/eval-protocol.md\`
 - \`.lab/config/workflow.json\`
 
 ## 上下文写回
@@ -221,6 +229,8 @@ const ZH_SKILL_FILES = {
 - 持久 run 输出、日志和 checkpoint 放在 \`results_root\`。
 - 图表和可视化放在 \`figures_root\`。
 - 不要把长期结果堆在 \`.lab/changes/<change-id>/runs\` 里。
+- 不要修改指标定义、baseline 语义或对比方法实现,除非评估协议已经记录了来源和与原始实现的偏差。
+- 如果要调整 ladder、样本量或升格 gate,必须继续锚定到带来源的评估协议,而不是靠聊天临时判断。
 
 ## 交互约束
 
@@ -294,6 +304,7 @@ const ZH_SKILL_FILES = {
 - \`.lab/context/state.md\`
 - \`.lab/context/decisions.md\`
 - \`.lab/context/evidence-index.md\`
+- \`.lab/context/eval-protocol.md\`
 
 ## 上下文写回
 
@@ -304,6 +315,9 @@ const ZH_SKILL_FILES = {
 
 - 不能隐藏失败迭代。
 - 每个主要 claim 都要指向已记录的 summary 或 iteration artifact。
+- 主表结构、gate 和最终结果 framing 必须对齐已批准的评估协议。
+- 不要凭记忆重述指标定义、baseline 行为或对比方法实现;直接引用评估协议里记录的来源。
+- 如果报告依赖了对原始指标或原始实现的偏差,必须明确写出这个偏差。
 - 解释优先保守,不要写成营销文案。
 - 要给 \`/lab:write\` 留下清晰 handoff,尤其是 section draft 可以直接引用的证据链接。
 
@@ -919,8 +933,13 @@ const ZH_SKILL_FILES = {
 ## 目标
 
 - Objective:
+- Autonomy level: L2
+- Approval status: draft
 - Allowed stages: run, iterate, review, report
 - Success criteria:
+- Terminal goal type:
+- Terminal goal target:
+- Required terminal artifact:
 
 ## 循环预算
 
@@ -941,6 +960,14 @@ const ZH_SKILL_FILES = {
 - Promotion check command:
 - Promotion command:
 
+## 阶段产物约束
+
+- Run stage contract: write persistent outputs under \`results_root\`.
+- Iterate stage contract: update persistent outputs under \`results_root\`.
+- Review stage contract: update canonical review context such as \`.lab/context/decisions.md\`、\`state.md\`、\`open-questions.md\` or \`evidence-index.md\`.
+- Report stage contract: write the final report to \`<deliverables_root>/report.md\`.
+- Write stage contract: write LaTeX output under \`<deliverables_root>/paper/\`.
+
 ## 升格策略
 
 - Promotion policy:
@@ -954,6 +981,37 @@ const ZH_SKILL_FILES = {
 
 - Stop conditions:
 - Escalation conditions:
+- Canonical promotion writeback: update \`.lab/context/data-decisions.md\`、\`.lab/context/decisions.md\`、\`.lab/context/state.md\` and \`.lab/context/session-brief.md\`.
+`,
+[path.join(".lab", "context", "auto-outcome.md")]:
+`# 自动结果
+
+## 目标
+
+- Objective:
+- Experiment ladder:
+- Metric glossary:
+- Metric source papers:
+- Metric implementation source:
+- Comparison source papers:
+- Comparison implementation source:
+- Deviation from original implementation:
+- Terminal goal type:
+- Terminal goal target:
+- Required terminal artifact:
+
+## 结果
+
+- Status: idle
+- Goal reached: no
+- Stop reason:
+- Promotion applied: no
+- Final artifact:
+- Final rung:
+- Executed stages:
+- Iterations completed: 0
+- Started at:
+- Finished at:
 `,
 [path.join(".lab", "context", "auto-status.md")]:
 `# 自动模式状态
@@ -965,6 +1023,9 @@ const ZH_SKILL_FILES = {
 - Current command:
 - Active run id:
 - Iteration count: 0
+- Current rung:
+- Watch target:
+- Next rung:
 
 ## 时间
 
@@ -1322,7 +1383,7 @@ ZH_CONTENT[path.join(".lab", ".managed", "templates", "framing.md")] = `# 论文
 ZH_CONTENT[path.join(".codex", "prompts", "lab.md")] = codexPrompt(
 "查看 /lab 研究工作流总览并选择合适阶段",
 "workflow question 或 stage choice",
-"# `/lab` for Codex\n\n`/lab` 是严格的研究工作流命令族。每次都使用同一套仓库工件和阶段边界。\n\n## 子命令\n\n- `/lab:idea`\n 调研 idea,定义问题与 failure case,归类 contribution 与 breakthrough level,对比现有方法,收束三个一眼就有意义的点,并在实现前保留 approval gate。\n\n- `/lab:data`\n 把已批准的 idea 转成数据集与 benchmark 方案,记录数据集年份、使用过该数据集的论文、下载来源、许可或访问限制,以及 classic-public、recent-strong-public、claim-specific 三类 benchmark 的纳入理由,和 canonical baselines、strong historical baselines、recent strong public methods、closest prior work 四类对比方法的纳入理由。\n\n- `/lab:auto`\n 在不改变 mission、framing 和核心 claims 的前提下,读取 auto-mode 契约并自动编排 `run`、`iterate`、`review`、`report`,必要时扩展数据集、benchmark 和 comparison methods,并在满足升格策略时自动升级 primary package
+"# `/lab` for Codex\n\n`/lab` 是严格的研究工作流命令族。每次都使用同一套仓库工件和阶段边界。\n\n## 子命令\n\n- `/lab:idea`\n 调研 idea,定义问题与 failure case,归类 contribution 与 breakthrough level,对比现有方法,收束三个一眼就有意义的点,并在实现前保留 approval gate。\n\n- `/lab:data`\n 把已批准的 idea 转成数据集与 benchmark 方案,记录数据集年份、使用过该数据集的论文、下载来源、许可或访问限制,以及 classic-public、recent-strong-public、claim-specific 三类 benchmark 的纳入理由,和 canonical baselines、strong historical baselines、recent strong public methods、closest prior work 四类对比方法的纳入理由。\n\n- `/lab:auto`\n 在不改变 mission、framing 和核心 claims 的前提下,读取 eval-protocol 与 auto-mode 契约并自动编排 `run`、`iterate`、`review`、`report`,必要时扩展数据集、benchmark 和 comparison methods,并在满足升格策略时自动升级 primary package。启动前必须选定 autonomy level、声明 terminal goal,并显式批准契约。\n\n- `/lab:framing`\n 通过审计当前领域与相邻领域的术语,锁定 paper-facing 的方法名、模块名、论文题目和 contribution bullets,并在 section 起草前保留 approval gate。\n\n- `/lab:spec`\n 把已批准的 idea 转成 `.lab/changes/<change-id>/` 下的一个 lab change 目录,并在其中写出 `proposal`、`design`、`spec`、`tasks`。\n\n- `/lab:run`\n 执行最小有意义验证运行,登记 run,并生成第一版标准化评估摘要。\n\n- `/lab:iterate`\n 在冻结 mission、阈值、verification commands 与 `completion_promise` 的前提下执行有边界的实验迭代。\n\n- `/lab:review`\n 以 reviewer mode 审查文档或结果,先给短摘要,再输出 findings、fatal flaws、fix priority 和 residual risks。\n\n- `/lab:report`\n 从 runs 和 iterations 工件生成最终研究报告。\n\n- `/lab:write`\n 使用已安装 `lab` skill 下 vendored 的 paper-writing references,把稳定 report 工件转成论文 section。\n\n## 调度规则\n\n- 始终使用 `skills/lab/SKILL.md` 作为工作流合同。\n- 用户显式调用 `/lab:<stage>` 时,要立刻执行该 stage,而不是只推荐别的 `/lab` stage。\n- 先给简洁摘要,再决定是否写工件,最后回报输出路径和下一步。\n- 如果歧义会影响结论,一次只问一个问题;如果有多条可行路径,先给 2-3 个方案再收敛。\n- `/lab:spec` 前应已有经批准的数据集与 benchmark 方案。\n- `/lab:run`、`/lab:iterate`、`/lab:auto`、`/lab:report` 都应遵循 `.lab/context/eval-protocol.md`。\n- `.lab/context/eval-protocol.md` 不只定义主指标和主表,也应定义指标释义、实验阶梯,以及指标和对比实现的来源。\n- `/lab:auto` 只编排已批准边界内的执行阶段,不替代手动的 idea/data/framing/spec 决策。\n- `/lab:write` 前必须已有经批准的 `/lab:framing` 工件。\n"
 );
 
 ZH_CONTENT[path.join(".codex", "prompts", "lab-data.md")] = codexPrompt(
@@ -1334,14 +1395,14 @@ ZH_CONTENT[path.join(".codex", "prompts", "lab-data.md")] = codexPrompt(
 ZH_CONTENT[path.join(".codex", "prompts", "lab-auto.md")] = codexPrompt(
 "在已批准边界内编排自动实验循环",
 "auto mode objective",
-"使用已安装的 `lab` 技能:`.codex/skills/lab/SKILL.md`。\n\n立刻针对用户当前给出的参数执行 `/lab:auto`,不要只推荐别的 `/lab` 阶段。只有在缺少阻塞性前提时,才明确指出缺什么,并且一次最多追问一个问题。\n\n本命令运行 `/lab:auto` 阶段。它必须读取 `.lab/context/auto-mode.md` 与 `.lab/context/auto-
+"使用已安装的 `lab` 技能:`.codex/skills/lab/SKILL.md`。\n\n立刻针对用户当前给出的参数执行 `/lab:auto`,不要只推荐别的 `/lab` 阶段。只有在缺少阻塞性前提时,才明确指出缺什么,并且一次最多追问一个问题。\n\n本命令运行 `/lab:auto` 阶段。它必须读取 `.lab/context/eval-protocol.md`、`.lab/context/auto-mode.md`、`.lab/context/auto-status.md` 与 `.lab/context/auto-outcome.md`,先确认 autonomy level、approval status 与 terminal goal schema,再把 eval-protocol 里的指标释义、主表计划、来源约束与结构化实验阶梯当作执行依据,在不修改 mission、framing 和核心 claims 的前提下编排已批准的 `run`、`iterate`、`review`、`report`,轮询长任务完成情况;如果声明了 rung,就保持会话活着并按 rung 转移继续推进。"
 );
 
 ZH_CONTENT[path.join(".claude", "commands", "lab.md")] = claudeCommand(
 "LAB",
 "查看 /lab 研究工作流总览并选择合适阶段",
 "workflow, research, overview",
-"# `/lab` for Claude\n\n`/lab` 是严格的研究工作流命令族。每次都使用同一套仓库工件和阶段边界。\n\n## 子命令\n\n- `/lab:idea`\n 调研 idea,定义问题与 failure case,归类 contribution 与 breakthrough level,对比现有方法,收束三个一眼就有意义的点,并在实现前保留 approval gate。\n\n- `/lab:data`\n 把已批准的 idea 转成数据集与 benchmark 方案,记录数据集年份、使用过该数据集的论文、下载来源、许可或访问限制,以及 classic-public、recent-strong-public、claim-specific 三类 benchmark 的纳入理由,和 canonical baselines、strong historical baselines、recent strong public methods、closest prior work 四类对比方法的纳入理由。\n\n- `/lab:auto`\n 在不改变 mission、framing 和核心 claims 的前提下,读取 auto-mode 契约并自动编排 `run`、`iterate`、`review`、`report`,必要时扩展数据集、benchmark 和 comparison methods,并在满足升格策略时自动升级 primary package
+"# `/lab` for Claude\n\n`/lab` 是严格的研究工作流命令族。每次都使用同一套仓库工件和阶段边界。\n\n## 子命令\n\n- `/lab:idea`\n 调研 idea,定义问题与 failure case,归类 contribution 与 breakthrough level,对比现有方法,收束三个一眼就有意义的点,并在实现前保留 approval gate。\n\n- `/lab:data`\n 把已批准的 idea 转成数据集与 benchmark 方案,记录数据集年份、使用过该数据集的论文、下载来源、许可或访问限制,以及 classic-public、recent-strong-public、claim-specific 三类 benchmark 的纳入理由,和 canonical baselines、strong historical baselines、recent strong public methods、closest prior work 四类对比方法的纳入理由。\n\n- `/lab:auto`\n 在不改变 mission、framing 和核心 claims 的前提下,读取 eval-protocol 与 auto-mode 契约并自动编排 `run`、`iterate`、`review`、`report`,必要时扩展数据集、benchmark 和 comparison methods,并在满足升格策略时自动升级 primary package。启动前必须选定 autonomy level、声明 terminal goal,并显式批准契约。\n\n- `/lab:framing`\n 通过审计当前领域与相邻领域的术语,锁定 paper-facing 的方法名、模块名、论文题目和 contribution bullets,并在 section 起草前保留 approval gate。\n\n- `/lab:spec`\n 把已批准的 idea 转成 `.lab/changes/<change-id>/` 下的一个 lab change 目录,并在其中写出 `proposal`、`design`、`spec`、`tasks`。\n\n- `/lab:run`\n 执行最小有意义验证运行,登记 run,并生成第一版标准化评估摘要。\n\n- `/lab:iterate`\n 在冻结 mission、阈值、verification commands 与 `completion_promise` 的前提下执行有边界的实验迭代。\n\n- `/lab:review`\n 以 reviewer mode 审查文档或结果,先给短摘要,再输出 findings、fatal flaws、fix priority 和 residual risks。\n\n- `/lab:report`\n 从 runs 和 iterations 工件生成最终研究报告。\n\n- `/lab:write`\n 使用已安装 `lab` skill 下 vendored 的 paper-writing references,把稳定 report 工件转成论文 section。\n\n## 调度规则\n\n- 始终使用 `skills/lab/SKILL.md` 作为工作流合同。\n- 用户显式调用 `/lab:<stage>` 时,要立刻执行该 stage,而不是只推荐别的 `/lab` stage。\n- 先给简洁摘要,再决定是否写工件,最后回报输出路径和下一步。\n- 如果歧义会影响结论,一次只问一个问题;如果有多条可行路径,先给 2-3 个方案再收敛。\n- `/lab:spec` 前应已有经批准的数据集与 benchmark 方案。\n- `/lab:run`、`/lab:iterate`、`/lab:auto`、`/lab:report` 都应遵循 `.lab/context/eval-protocol.md`。\n- `.lab/context/eval-protocol.md` 不只定义主指标和主表,也应定义指标释义、实验阶梯,以及指标和对比实现的来源。\n- `/lab:auto` 只编排已批准边界内的执行阶段,不替代手动的 idea/data/framing/spec 决策。\n- `/lab:write` 前必须已有经批准的 `/lab:framing` 工件。\n"
 );
 
 ZH_CONTENT[path.join(".claude", "commands", "lab", "data.md")] = claudeCommand(
@@ -1355,7 +1416,7 @@ ZH_CONTENT[path.join(".claude", "commands", "lab", "auto.md")] = claudeCommand(
 "LAB: Auto",
 "在已批准边界内编排自动实验循环",
 "workflow, research, auto",
-"使用已安装的 `lab` 技能:`.claude/skills/lab/SKILL.md`。\n\n立刻针对用户当前给出的参数执行 `/lab:auto`,不要只推荐别的 `/lab` 阶段。只有在缺少阻塞性前提时,才明确指出缺什么,并且一次最多追问一个问题。\n\n本命令运行 `/lab:auto` 阶段。它必须读取 `.lab/context/auto-mode.md` 与 `.lab/context/auto-
+"使用已安装的 `lab` 技能:`.claude/skills/lab/SKILL.md`。\n\n立刻针对用户当前给出的参数执行 `/lab:auto`,不要只推荐别的 `/lab` 阶段。只有在缺少阻塞性前提时,才明确指出缺什么,并且一次最多追问一个问题。\n\n本命令运行 `/lab:auto` 阶段。它必须读取 `.lab/context/eval-protocol.md`、`.lab/context/auto-mode.md`、`.lab/context/auto-status.md` 与 `.lab/context/auto-outcome.md`,先确认 autonomy level、approval status 与 terminal goal schema,再把 eval-protocol 里的指标释义、主表计划、来源约束与结构化实验阶梯当作执行依据,在不修改 mission、framing 和核心 claims 的前提下编排已批准的 `run`、`iterate`、`review`、`report`,轮询长任务完成情况;如果声明了 rung,就保持会话活着并按 rung 转移继续推进。"
 );
 
 ZH_CONTENT[path.join(".codex", "skills", "lab", "SKILL.md")] = `---
@@ -1755,6 +1816,52 @@ ZH_CONTENT[path.join(".lab", "context", "data-decisions.md")] = `# 已批准数
 - 剩余预处理或 leakage 风险:
 `;
 
+ZH_CONTENT[path.join(".lab", "context", "eval-protocol.md")] = `# 评估协议
+
+用这份文件定义 \`/lab:run\`、\`/lab:iterate\`、\`/lab:auto\` 和 \`/lab:report\` 共用的论文导向评估目标、主表计划、gate 与 benchmark ladder。
+
+## 主评估目标
+
+- 主评估目标:
+- 主指标:
+- 次级指标:
+- 必要终局证据:
+
+## 主表计划
+
+- 主表计划:
+- 每张表必须支撑的 claims:
+
+## 指标释义
+
+- 指标释义:
+- 指标来源论文:
+- 指标实现来源:
+- 对比方法来源论文:
+- 对比方法实现来源:
+- 与原始实现的偏差:
+
+## Gate Ladder
+
+- 实验阶梯:
+- benchmark 阶梯:
+- 对比方法 gate:
+- 升格 gate:
+- 最小样本量:
+- 必要输出工件:
+
+### Rung: <rung-id>
+
+- 阶段:
+- 目标:
+- 命令:
+- 监视目标:
+- gate 命令:
+- 通过后:
+- 失败后:
+- 停止后:
+`;
+
 ZH_CONTENT[path.join(".codex", "skills", "lab", "stages", "auto.md")] = `# \`/lab:auto\` 阶段指南
 
 ## 必要输出
@@ -1772,9 +1879,11 @@ ZH_CONTENT[path.join(".codex", "skills", "lab", "stages", "auto.md")] = `# \`/la
 - \`.lab/context/decisions.md\`
 - \`.lab/context/data-decisions.md\`
 - \`.lab/context/evidence-index.md\`
+- \`.lab/context/eval-protocol.md\`
 - \`.lab/context/terminology-lock.md\`
 - \`.lab/context/auto-mode.md\`
 - \`.lab/context/auto-status.md\`
+- \`.lab/context/auto-outcome.md\`
 
 ## 上下文写回
 
@@ -1785,31 +1894,51 @@ ZH_CONTENT[path.join(".codex", "skills", "lab", "stages", "auto.md")] = `# \`/la
 - \`.lab/context/summary.md\`
 - \`.lab/context/session-brief.md\`
 - \`.lab/context/auto-status.md\`
+- \`.lab/context/auto-outcome.md\`
 
 ## 边界规则
 
 - 把 \`/lab:auto\` 当作编排层,不要再发明第二套 workflow。
+- 把 \`.lab/context/eval-protocol.md\` 当作论文导向指标、指标释义、主表、gate 与结构化实验阶梯的唯一来源。
+- 把评估协议当作“带来源的协议”,不是“临场想出来的说明”:指标定义、baseline 行为、对比实现和偏差都必须先写明来源,再用于 gate 或 promotion。
+- 契约里必须声明 \`Autonomy level\` 和 \`Approval status\`,只有显式写成 \`approved\` 才能启动。
+- 契约里还必须声明具体的 terminal goal:\`rounds\`、\`metric-threshold\` 或 \`task-completion\`,并补齐 \`Terminal goal target\` 与 \`Required terminal artifact\`。
+- 级别含义固定为:
+  - \`L1\`:safe run,只允许 \`run\`、\`review\`、\`report\`
+  - \`L2\`:bounded iteration,允许 \`run\`、\`iterate\`、\`review\`、\`report\`
+  - \`L3\`:aggressive campaign,才允许额外编排 \`write\`
 - 默认只编排 \`run\`、\`iterate\`、\`review\`、\`report\`;只有 framing 已批准时才可选 \`write\`。
 - 不要自动修改 mission、paper-facing framing 或核心 claims。
 - 可以在 exploration envelope 内增加数据集、benchmark 和 comparison methods。
 - 只有在 auto-mode 契约中的升格策略满足时,才允许把 exploratory addition 自动升格为 primary package。
 - 长任务必须通过轮询推进,直到完成、超时或命中停止条件。
+- 每次结束都必须写出规范的 \`.lab/context/auto-outcome.md\`。
+- 如果评估协议声明了结构化 rung,就按前台 rung 状态机执行:每个 rung 都要声明阶段、目标、命令、监视目标、gate、通过后/失败后/停止后的转移,并把当前 rung、监视目标和下一 rung 写进 \`.lab/context/auto-status.md\`。
+- 不要只看命令退出码;必须检查阶段产物约束:
+  - \`run\` 和 \`iterate\` 更新 \`results_root\`
+  - \`review\` 更新规范审查上下文
+  - \`report\` 写出 \`<deliverables_root>/report.md\`
+  - \`write\` 写出 \`<deliverables_root>/paper/\` 下的 LaTeX 产物
+- promotion 成功后,必须写回 \`data-decisions.md\`、\`decisions.md\`、\`state.md\` 和 \`session-brief.md\`。
+- 如果某个指标或对比 claim 在评估协议里没有带来源的定义,就不能拿它做 stop 或 promotion 判断。
 
 ## 最小流程
 
 1. 校验自动模式契约
-2.
-3.
-4.
-5.
-6.
+2. 确认已批准的 autonomy level 与允许阶段一致
+3. 设置或刷新自动模式状态
+4. 选择下一个允许的 \`/lab\` 子阶段
+5. 发起有边界动作
+6. 轮询进程、checkpoint 或 summary 的变化
+7. 评估声明过的 terminal goal 是否已经达成
+8. 记录结果并决定 continue、promote、stop 或 escalate
 
 ## 交互约束
 
 - 开始前先简洁说明:objective、frozen core 和下一自动阶段。
 - 如果契约本身不完整,一次只追问一个问题。
 - 如果存在多个可信的下一动作,先给 2-3 个 bounded 方案和推荐项,再启动长任务。
--
+- 只有当下一步会离开已批准的 exploration envelope、超出选定 autonomy level,或实质改变 frozen core 时,才保留人工 approval gate。
 `;
 
 ZH_CONTENT[path.join(".claude", "skills", "lab", "stages", "auto.md")] =
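The rung state machine that the new auto-stage guide describes (each rung declares a gate plus on-pass/on-fail/on-stop transitions) can be sketched roughly as follows. This is an illustrative reading of the contract, not superlab's actual implementation; `runLadder`, the rung object shape, and the `"stop"` sentinel are all invented for the example.

```javascript
// Hypothetical sketch of the rung ladder semantics described above: each rung
// names its transition targets, and the loop follows the gate verdict until a
// terminal "stop" marker is reached. Names here are assumptions, not
// superlab's real API.
function runLadder(rungs, startId, runGate) {
  const executed = [];
  let current = startId;
  while (current && current !== "stop") {
    const rung = rungs[current];
    executed.push(current);
    const verdict = runGate(rung); // expected: "pass" | "fail" | "stop"
    if (verdict === "pass") {
      current = rung.onPass;
    } else if (verdict === "fail") {
      current = rung.onFail;
    } else {
      current = rung.onStop;
    }
  }
  return executed;
}
```

A real orchestrator would also persist the current rung, watch target, and next rung into `.lab/context/auto-status.md` on every transition, as the boundary rules require.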
package/lib/install.cjs
CHANGED
|
@@ -40,8 +40,10 @@ const PROJECT_OWNED_LOCALIZED_PATHS = [
   path.join(".lab", "context", "evidence-index.md"),
   path.join(".lab", "context", "open-questions.md"),
   path.join(".lab", "context", "data-decisions.md"),
+  path.join(".lab", "context", "eval-protocol.md"),
   path.join(".lab", "context", "auto-mode.md"),
   path.join(".lab", "context", "auto-status.md"),
+  path.join(".lab", "context", "auto-outcome.md"),
   path.join(".lab", "context", "terminology-lock.md"),
   path.join(".lab", "context", "summary.md"),
   path.join(".lab", "context", "next-action.md"),
@@ -431,13 +433,31 @@ function registerProjectInstall(targetDir, metadata, { env = process.env } = {})
 }
 
 function isTemporaryTestPath(targetDir) {
-  const
-
-
-
-
+  const normalizedCandidates = new Set([path.resolve(targetDir)]);
+  try {
+    normalizedCandidates.add(fs.realpathSync(targetDir));
+  } catch {}
+
+  const tempRoots = new Set([path.resolve(os.tmpdir()), path.resolve("/tmp"), path.resolve("/private/tmp")]);
+  for (const root of Array.from(tempRoots)) {
+    try {
+      tempRoots.add(fs.realpathSync(root));
+    } catch {}
+  }
+
+  for (const candidate of normalizedCandidates) {
+    if (!path.basename(candidate).startsWith("superlab-")) {
+      continue;
+    }
+    for (const tempRoot of tempRoots) {
+      const relativeToTmp = path.relative(tempRoot, candidate);
+      if (!relativeToTmp.startsWith("..") && !path.isAbsolute(relativeToTmp)) {
+        return true;
+      }
+    }
+  }
   }
-
+
+  return false;
 }
 
 function detectLanguage({ explicitLang, env = process.env } = {}) {
package/package-assets/claude/commands/lab/auto.md
CHANGED
@@ -8,4 +8,4 @@ tags: [workflow, research, auto]
 Use the installed `lab` skill at `.claude/skills/lab/SKILL.md`.
 
 Execute the requested `/lab:auto` stage against the user's argument now. Do not only recommend another lab stage. If a blocking prerequisite is missing, say exactly what is missing and ask at most one clarifying question.
-This command runs the `/lab:auto` stage. It must read `.lab/context/auto-mode.md
+This command runs the `/lab:auto` stage. It must read `.lab/context/eval-protocol.md`, `.lab/context/auto-mode.md`, `.lab/context/auto-status.md`, and `.lab/context/auto-outcome.md`, enforce the declared terminal goal schema, orchestrate approved run, iterate, review, and report stages inside that contract, poll long-running work until completion or stop conditions, and write progress plus the final outcome back into `.lab/context/auto-status.md` and `.lab/context/auto-outcome.md`.
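The "poll long-running work until completion or stop conditions" behavior amounts to re-checking a completion predicate on an interval with a deadline. A hedged sketch under that reading; `pollUntil` and its defaults are assumptions for illustration, not superlab's code:

```javascript
// Hypothetical polling helper: re-evaluate `check` (e.g. "has the run's
// summary file appeared?") until it returns true or the timeout elapses.
// On timeout the caller would record a stop reason in auto-outcome.
async function pollUntil(check, { intervalMs = 5000, timeoutMs = 3600000 } = {}) {
  const deadline = Date.now() + timeoutMs;
  for (;;) {
    if (await check()) return true;
    if (Date.now() >= deadline) return false; // timed out; record a stop reason
    await new Promise((resolve) => setTimeout(resolve, intervalMs));
  }
}
```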
package/package-assets/claude/commands/lab.md
CHANGED
@@ -18,7 +18,7 @@ tags: [workflow, research, overview]
   Turn the approved idea into an approved dataset and benchmark package with dataset years, papers that used each dataset, source audit, download plan, classic-public versus recent-strong-public versus claim-specific benchmark roles, and explicit rationale for canonical baselines, strong historical baselines, recent strong public methods, and closest prior work.
 
 - `/lab:auto`
-  Run a bounded orchestration loop over approved execution stages. Use an auto-mode contract plus live auto-status to drive `run`, `iterate`, `review`, `report`, and optionally `write` without changing the frozen mission or framing.
+  Run a bounded orchestration loop over approved execution stages. Use an auto-mode contract plus live auto-status to drive `run`, `iterate`, `review`, `report`, and optionally `write` without changing the frozen mission or framing. Choose an autonomy level, declare a concrete terminal goal, explicitly approve the contract before starting, and treat `.lab/context/eval-protocol.md` as the source of truth for metrics, metric glossary, source-backed comparison semantics, tables, and structured experiment-ladder rungs.
 
 - `/lab:framing`
   Lock paper-facing method name, module names, paper title, and contribution bullets by auditing current-field and adjacent-field terminology, then keep an approval gate before any section drafting.
@@ -51,5 +51,6 @@ tags: [workflow, research, overview]
 - `/lab:spec` should inherit the approved dataset package from `.lab/context/data-decisions.md`.
 - Never skip directly from `/lab:idea` to code.
 - `/lab:iterate` requires a normalized summary from `scripts/eval_report.py`.
+- `/lab:run`, `/lab:iterate`, `/lab:auto`, and `/lab:report` should all follow `.lab/context/eval-protocol.md`, including its recorded sources for metrics and comparison implementations.
 - `/lab:write` requires an approved framing artifact from `/lab:framing`.
 - `/lab:write` requires stable report artifacts, a mini-outline, the active section guide, `paper-review.md`, and `does-my-writing-flow-source.md`, and should only change one section per round.
package/package-assets/codex/prompts/lab-auto.md
CHANGED
@@ -6,4 +6,4 @@ argument-hint: autonomous campaign target
 Use the installed `lab` skill at `.codex/skills/lab/SKILL.md`.
 
 Execute the requested `/lab:auto` stage against the user's argument now. Do not only recommend another lab stage. If a blocking prerequisite is missing, say exactly what is missing and ask at most one clarifying question.
-This command runs the `/lab:auto` stage. It must read `.lab/context/auto-mode.md
+This command runs the `/lab:auto` stage. It must read `.lab/context/eval-protocol.md`, `.lab/context/auto-mode.md`, `.lab/context/auto-status.md`, and `.lab/context/auto-outcome.md`, enforce the declared terminal goal schema, orchestrate approved run, iterate, review, and report stages inside that contract, poll long-running work until completion or stop conditions, and write progress plus the final outcome back into `.lab/context/auto-status.md` and `.lab/context/auto-outcome.md`.
package/package-assets/codex/prompts/lab.md
CHANGED
@@ -16,7 +16,7 @@ argument-hint: workflow question or stage choice
   Turn the approved idea into an approved dataset and benchmark package with dataset years, papers that used each dataset, source audit, download plan, classic-public versus recent-strong-public versus claim-specific benchmark roles, and explicit rationale for canonical baselines, strong historical baselines, recent strong public methods, and closest prior work.
 
 - `/lab:auto`
-  Run a bounded orchestration loop over approved execution stages. Use an auto-mode contract plus live auto-status to drive `run`, `iterate`, `review`, `report`, and optionally `write` without changing the frozen mission or framing.
+  Run a bounded orchestration loop over approved execution stages. Use an auto-mode contract plus live auto-status to drive `run`, `iterate`, `review`, `report`, and optionally `write` without changing the frozen mission or framing. Choose an autonomy level, declare a concrete terminal goal, explicitly approve the contract before starting, and treat `.lab/context/eval-protocol.md` as the source of truth for metrics, metric glossary, source-backed comparison semantics, tables, and structured experiment-ladder rungs.
 
 - `/lab:framing`
   Lock paper-facing method name, module names, paper title, and contribution bullets by auditing current-field and adjacent-field terminology, then keep an approval gate before any section drafting.
@@ -49,5 +49,6 @@ argument-hint: workflow question or stage choice
 - `/lab:spec` should inherit the approved dataset package from `.lab/context/data-decisions.md`.
 - Never skip directly from `/lab:idea` to code.
 - `/lab:iterate` requires a normalized summary from `scripts/eval_report.py`.
+- `/lab:run`, `/lab:iterate`, `/lab:auto`, and `/lab:report` should all follow `.lab/context/eval-protocol.md`, including its recorded sources for metrics and comparison implementations.
 - `/lab:write` requires an approved framing artifact from `/lab:framing`.
 - `/lab:write` requires stable report artifacts, a mini-outline, the active section guide, `paper-review.md`, and `does-my-writing-flow-source.md`, and should only change one section per round.
package/package-assets/shared/lab/context/auto-mode.md
CHANGED
@@ -1,12 +1,19 @@
 # Auto Mode Contract
 
 Use this file to define the bounded autonomous execution envelope for `/lab:auto`.
+Pair it with `.lab/context/eval-protocol.md`, which defines the paper-facing metrics, tables, gates, and benchmark ladder that auto mode should optimize against.
+If `eval-protocol.md` declares structured rung entries, auto mode follows those rung transitions first and uses the stage commands here as per-stage fallbacks.
 
 ## Objective
 
 - Objective:
+- Autonomy level: L2
+- Approval status: draft
 - Allowed stages: run, iterate, review, report
 - Success criteria:
+- Terminal goal type:
+- Terminal goal target:
+- Required terminal artifact:
 
 ## Loop Budget
 
@@ -27,6 +34,14 @@ Use this file to define the bounded autonomous execution envelope for `/lab:auto
 - Promotion check command:
 - Promotion command:
 
+## Stage Output Contracts
+
+- Run stage contract: write persistent outputs under `results_root`.
+- Iterate stage contract: update persistent outputs under `results_root`.
+- Review stage contract: update canonical review context such as `.lab/context/decisions.md`, `state.md`, `open-questions.md`, or `evidence-index.md`.
+- Report stage contract: write the final report to `<deliverables_root>/report.md`.
+- Write stage contract: write LaTeX output under `<deliverables_root>/paper/`.
+
 ## Promotion Policy
 
 - Promotion policy:
@@ -40,3 +55,4 @@ Use this file to define the bounded autonomous execution envelope for `/lab:auto
 
 - Stop conditions:
 - Escalation conditions:
+- Canonical promotion writeback: update `.lab/context/data-decisions.md`, `.lab/context/decisions.md`, `.lab/context/state.md`, and `.lab/context/session-brief.md`.
package/package-assets/shared/lab/context/auto-outcome.md
ADDED
@@ -0,0 +1,28 @@
+# Auto Outcome
+
+## Goal
+
+- Objective:
+- Experiment ladder:
+- Metric glossary:
+- Metric source papers:
+- Metric implementation source:
+- Comparison source papers:
+- Comparison implementation source:
+- Deviation from original implementation:
+- Terminal goal type:
+- Terminal goal target:
+- Required terminal artifact:
+
+## Outcome
+
+- Status: idle
+- Goal reached: no
+- Stop reason:
+- Promotion applied: no
+- Final artifact:
+- Final rung:
+- Executed stages:
+- Iterations completed: 0
+- Started at:
+- Finished at:
package/package-assets/shared/lab/context/eval-protocol.md
ADDED
@@ -0,0 +1,46 @@
+# Evaluation Protocol
+
+Use this file to define the paper-facing evaluation objective, table plan, gates, and benchmark ladder for `/lab:run`, `/lab:iterate`, `/lab:auto`, and `/lab:report`.
+
+## Primary Evaluation Objective
+
+- Primary evaluation objective:
+- Primary metrics:
+- Secondary metrics:
+- Required terminal evidence:
+
+## Table Plan
+
+- Table plan:
+- Required claims per table:
+
+## Metric Glossary
+
+- Metric glossary:
+- Metric source papers:
+- Metric implementation source:
+- Comparison source papers:
+- Comparison implementation source:
+- Deviation from original implementation:
+
+Record enough source detail here that later `run`, `iterate`, `auto`, and `report` stages do not have to guess what a metric means, which baseline implementation is canonical, or where a comparison method came from.
+
+## Gate Ladder
+
+- Experiment ladder:
+- Benchmark ladder:
+- Comparison gate:
+- Promotion gate:
+- Minimum sample sizes:
+- Required output artifacts:
+
+### Rung: <rung-id>
+
+- Stage:
+- Goal:
+- Command:
+- Watch:
+- Gate:
+- On pass:
+- On fail:
+- On stop:
package/package-assets/shared/skills/lab/SKILL.md CHANGED
@@ -22,6 +22,8 @@ Use this skill when the user invokes `/lab:*` or asks for the structured researc
 - Write durable artifacts to disk instead of leaving key decisions only in chat.
 - Use `.lab/config/workflow.json` as the global contract for workflow language, paper language, and paper format.
 - Use `.lab/context/` as the shared project state for both Codex and Claude entrypoints.
+- Use `.lab/context/eval-protocol.md` as the shared evaluation contract for run, iterate, auto, and report stages, including metric glossary and experiment ladder semantics.
+- Treat evaluation semantics as source-backed once evaluation planning starts: metrics, benchmark gates, baseline behavior, comparison implementations, and deviations should come from recorded sources, not memory.
 - Workflow artifacts should follow the installed workflow language.
 - Final paper output should default to LaTeX, and its manuscript language should be decided separately from the workflow language.
 - Separate sourced facts from model-generated hypotheses.
@@ -82,6 +84,8 @@ Use this skill when the user invokes `/lab:*` or asks for the structured researc
 - Use this stage to orchestrate approved execution stages with bounded autonomy.
 - Read `.lab/config/workflow.json`, `.lab/context/mission.md`, `.lab/context/state.md`, `.lab/context/decisions.md`, `.lab/context/data-decisions.md`, `.lab/context/evidence-index.md`, `.lab/context/terminology-lock.md`, `.lab/context/auto-mode.md`, and `.lab/context/auto-status.md` before acting.
 - Treat `.lab/context/auto-mode.md` as the control contract and `.lab/context/auto-status.md` as the live state file.
+- Require `Autonomy level` and `Approval status` in `.lab/context/auto-mode.md` before execution.
+- Treat `L1` as safe-run validation, `L2` as bounded iteration, and `L3` as aggressive campaign mode.
 - Reuse `/lab:run`, `/lab:iterate`, `/lab:review`, `/lab:report`, and optional `/lab:write` instead of inventing a second workflow.
 - Do not automatically change the research mission, paper-facing framing, or core claims.
 - You may add exploratory datasets, benchmarks, and comparison methods inside the approved exploration envelope.
@@ -106,6 +110,7 @@ Use this skill when the user invokes `/lab:*` or asks for the structured researc
 - Register the run with `.lab/.managed/scripts/register_run.py`.
 - Normalize the result with `.lab/.managed/scripts/eval_report.py`.
 - Validate normalized output with `.lab/.managed/scripts/validate_results.py`.
+- Read `.lab/context/eval-protocol.md` before choosing the smallest run so the first experiment already targets the approved tables, metrics, and gates.
 - Update `.lab/context/state.md` and `.lab/context/evidence-index.md` after the run.
 
 ### `/lab:iterate`
@@ -122,6 +127,8 @@ Use this skill when the user invokes `/lab:*` or asks for the structured researc
 - Require a normalized evaluation report each round.
 - Read `.lab/context/mission.md`, `.lab/context/state.md`, `.lab/context/decisions.md`, and `.lab/context/evidence-index.md` at the start of each round.
 - Read `.lab/context/data-decisions.md` before changing benchmark-facing experiments.
+- Read `.lab/context/eval-protocol.md` before changing evaluation ladders, sample sizes, or promotion gates.
+- Keep metric definitions, baseline behavior, and comparison implementations anchored to the source-backed evaluation protocol before changing thresholds, gates, or ladder transitions.
 - Switch to diagnostic mode if risk increases for two consecutive rounds.
 - Write round reports with `.lab/.managed/templates/iteration-report.md`.
 - Update `.lab/context/state.md`, `.lab/context/decisions.md`, `.lab/context/evidence-index.md`, and `.lab/context/open-questions.md` each round as needed.
@@ -141,6 +148,8 @@ Use this skill when the user invokes `/lab:*` or asks for the structured researc
 
 - Summarize all validated iteration summaries.
 - Read `.lab/context/mission.md`, `.lab/context/state.md`, `.lab/context/decisions.md`, `.lab/context/evidence-index.md`, and `.lab/context/data-decisions.md` before drafting.
+- Read `.lab/context/eval-protocol.md` before choosing tables, thresholds, or final result framing.
+- Keep metric definitions, comparison semantics, and implementation references anchored to the approved evaluation protocol instead of re-deriving them during reporting.
 - Aggregate them with `.lab/.managed/scripts/summarize_iterations.py`.
 - Write the final document with `.lab/.managed/templates/final-report.md`.
 - Keep failed attempts and limitations visible.
@@ -172,7 +181,9 @@ Use this skill when the user invokes `/lab:*` or asks for the structured researc
 - No implementation before `/lab:spec`.
 - No frozen spec without an approved dataset package or an explicit defer reason recorded in `.lab/context/data-decisions.md`.
 - No unconstrained iteration. Every `/lab:iterate` campaign must declare done criteria and `max_iterations`.
+- No execution or reporting campaign without an evaluation protocol or an explicit defer reason recorded in `.lab/context/eval-protocol.md`.
 - No unconstrained auto mode. Every `/lab:auto` campaign must declare allowed stages, stop conditions, and a promotion policy in `.lab/context/auto-mode.md`.
+- No auto start without an explicit autonomy level and `Approval status: approved`.
 - No final report without validated normalized results.
 - No paper-writing round without stable report artifacts, an approved framing artifact, evidence links, and LaTeX manuscript output.
 
@@ -194,6 +205,6 @@ Use this skill when the user invokes `/lab:*` or asks for the structured researc
 - Vendored paper-writing references: `.codex/skills/lab/references/paper-writing/{abstract,introduction,related-work,method,experiments,conclusion,paper-review,does-my-writing-flow-source}.md` or `.claude/skills/lab/references/paper-writing/{abstract,introduction,related-work,method,experiments,conclusion,paper-review,does-my-writing-flow-source}.md`
 - Command adapters: the installed `/lab:*` command assets
 - Shared workflow config: `.lab/config/workflow.json`
-- Shared project context: `.lab/context/{mission,state,decisions,evidence-index,open-questions,data-decisions,auto-mode,auto-status}.md`
+- Shared project context: `.lab/context/{mission,state,decisions,evidence-index,open-questions,data-decisions,eval-protocol,auto-mode,auto-status}.md`
 - Templates: `.lab/.managed/templates/`
 - Scripts: `.lab/.managed/scripts/`