superlab 0.1.49 → 0.1.51
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/auto_contracts.cjs +24 -0
- package/lib/auto_runner.cjs +51 -4
- package/lib/auto_state.cjs +1 -0
- package/lib/i18n.cjs +26 -21
- package/package-assets/claude/commands/lab.md +2 -0
- package/package-assets/codex/prompts/lab.md +2 -0
- package/package-assets/shared/lab/context/auto-mode.md +3 -0
- package/package-assets/shared/skills/lab/SKILL.md +2 -0
- package/package-assets/shared/skills/lab/stages/auto.md +6 -0
- package/package-assets/shared/skills/lab/stages/iterate.md +2 -1
- package/package.json +1 -1
package/lib/auto_contracts.cjs
CHANGED
|
@@ -7,6 +7,7 @@ const {
|
|
|
7
7
|
isMeaningful,
|
|
8
8
|
normalizeList,
|
|
9
9
|
normalizeScalar,
|
|
10
|
+
parseDurationMs,
|
|
10
11
|
parseInteger,
|
|
11
12
|
readFileIfExists,
|
|
12
13
|
readWorkflowConfig,
|
|
@@ -29,6 +30,11 @@ const CAMPAIGN_KIND_DEFAULT_STAGES = {
|
|
|
29
30
|
"experiment-loop": ["run", "iterate", "review", "report"],
|
|
30
31
|
"report-polish": ["review", "report", "write"],
|
|
31
32
|
};
|
|
33
|
+
// Stages that receive the "long" poll and progress-report defaults below;
// every other stage name falls back to the "short" defaults.
const LONG_POLL_AUTO_STAGES = new Set(["run", "iterate"]);

// Internal polling cadence defaults, in milliseconds.
const DEFAULT_SHORT_POLL_INTERVAL_MS = 15000; // 15s
const DEFAULT_LONG_POLL_INTERVAL_MS = 120000; // 2m

// User-facing keepalive (progress report) cadence defaults, in milliseconds.
const DEFAULT_SHORT_PROGRESS_REPORT_INTERVAL_MS = 120000; // 2m
const DEFAULT_LONG_PROGRESS_REPORT_INTERVAL_MS = 600000; // 10m
|
|
32
38
|
const FROZEN_CORE_ALIASES = {
|
|
33
39
|
mission: [path.join(".lab", "context", "mission.md")],
|
|
34
40
|
framing: [
|
|
@@ -539,6 +545,22 @@ function resolveStageCommand(mode, stage, commandOverride = "") {
|
|
|
539
545
|
return isMeaningful(commandOverride) ? commandOverride : mode.stageCommands[stage];
|
|
540
546
|
}
|
|
541
547
|
|
|
548
|
+
/**
 * Resolve the internal poll interval (in milliseconds) for an auto stage.
 *
 * The stage name is normalized and lower-cased, then used to pick the
 * fallback: the long default when the stage is in LONG_POLL_AUTO_STAGES,
 * the short default otherwise. The configured value and that fallback are
 * handed to parseDurationMs.
 *
 * @param {object} [options] - May be omitted entirely; all fields default.
 * @param {string} [options.configuredPollInterval=""] - Raw poll interval
 *   value from the contract.
 * @param {string} [options.stage=""] - Stage name.
 * @returns {*} Whatever parseDurationMs returns — presumably a millisecond
 *   count, with the fallback used when the configured value is blank or
 *   unparsable (TODO confirm against parseDurationMs).
 */
function resolveAutoPollIntervalMs({ configuredPollInterval = "", stage = "" } = {}) {
  const stageKey = normalizeScalar(stage).toLowerCase();
  const fallbackMs = LONG_POLL_AUTO_STAGES.has(stageKey)
    ? DEFAULT_LONG_POLL_INTERVAL_MS
    : DEFAULT_SHORT_POLL_INTERVAL_MS;
  return parseDurationMs(configuredPollInterval, fallbackMs);
}
|
|
555
|
+
|
|
556
|
+
/**
 * Resolve the user-facing progress-report (keepalive) interval, in
 * milliseconds, for an auto stage.
 *
 * The stage name is normalized and lower-cased, then used to pick the
 * fallback: the long default when the stage is in LONG_POLL_AUTO_STAGES,
 * the short default otherwise. The configured value and that fallback are
 * handed to parseDurationMs.
 *
 * @param {object} [options] - May be omitted entirely; all fields default.
 * @param {string} [options.configuredProgressReportInterval=""] - Raw
 *   progress-report interval value from the contract.
 * @param {string} [options.stage=""] - Stage name.
 * @returns {*} Whatever parseDurationMs returns — presumably a millisecond
 *   count, with the fallback used when the configured value is blank or
 *   unparsable (TODO confirm against parseDurationMs).
 */
function resolveAutoProgressReportIntervalMs({ configuredProgressReportInterval = "", stage = "" } = {}) {
  const stageKey = normalizeScalar(stage).toLowerCase();
  const fallbackMs = LONG_POLL_AUTO_STAGES.has(stageKey)
    ? DEFAULT_LONG_PROGRESS_REPORT_INTERVAL_MS
    : DEFAULT_SHORT_PROGRESS_REPORT_INTERVAL_MS;
  return parseDurationMs(configuredProgressReportInterval, fallbackMs);
}
|
|
563
|
+
|
|
542
564
|
module.exports = {
|
|
543
565
|
ALLOWED_AUTO_STAGES,
|
|
544
566
|
AUTO_LEVEL_STAGE_ENVELOPES,
|
|
@@ -558,6 +580,8 @@ module.exports = {
|
|
|
558
580
|
inferCampaignKind,
|
|
559
581
|
isLocalProcessAlive,
|
|
560
582
|
normalizeRequestedAutoContract,
|
|
583
|
+
resolveAutoPollIntervalMs,
|
|
584
|
+
resolveAutoProgressReportIntervalMs,
|
|
561
585
|
resolveRequestedAutonomyLevel,
|
|
562
586
|
resolveFrozenCoreEntries,
|
|
563
587
|
resolveStageCommand,
|
package/lib/auto_runner.cjs
CHANGED
|
@@ -15,6 +15,8 @@ const {
|
|
|
15
15
|
inferCampaignKind,
|
|
16
16
|
isLocalProcessAlive,
|
|
17
17
|
normalizeRequestedAutoContract,
|
|
18
|
+
resolveAutoPollIntervalMs,
|
|
19
|
+
resolveAutoProgressReportIntervalMs,
|
|
18
20
|
resolveRequestedAutonomyLevel,
|
|
19
21
|
resolveStageCommand,
|
|
20
22
|
snapshotFrozenCore,
|
|
@@ -141,6 +143,20 @@ function makeCampaignId({ requested, now }) {
|
|
|
141
143
|
return raw.toLowerCase().replace(/[^a-z0-9]+/g, "-").replace(/^-+|-+$/g, "");
|
|
142
144
|
}
|
|
143
145
|
|
|
146
|
+
/**
 * Render a millisecond duration as a compact human-readable label.
 *
 * Whole minutes render as `<n>m`; otherwise whole seconds render as `<n>s`;
 * anything else renders as `<n>ms`.
 *
 * @param {number} ms - Duration in milliseconds. Numeric strings are coerced.
 * @returns {string} For example `"2m"`, `"15s"`, `"1500ms"`, or `"unknown"`
 *   when `ms` cannot be read as a finite number.
 */
function formatDurationLabel(ms) {
  const value = Number(ms);
  if (!Number.isFinite(value)) {
    // Keep labels such as "NaNms" or "undefinedms" out of user-facing text.
    return "unknown";
  }

  // Largest unit first, so exact minutes win over exact seconds.
  const units = [
    ["m", 60 * 1000],
    ["s", 1000],
  ];
  for (const [suffix, size] of units) {
    if (value % size === 0) {
      return `${value / size}${suffix}`;
    }
  }
  return `${value}ms`;
}
|
|
155
|
+
|
|
156
|
+
/**
 * Build the "continue boundary" sentence block describing when to keep
 * polling and when to emit user-facing progress updates.
 *
 * @param {object} options
 * @param {string} options.stage - Stage name; a falsy value is shown as "auto".
 * @param {number} options.pollIntervalMs - Internal poll cadence in ms.
 * @param {number} options.progressReportIntervalMs - Keepalive cadence in ms.
 * @returns {string} Three sentences joined by single spaces.
 */
function buildContinueBoundary({ stage, pollIntervalMs, progressReportIntervalMs }) {
  const keepaliveLabel = formatDurationLabel(progressReportIntervalMs);
  const stageLabel = stage || "auto";
  const pollLabel = formatDurationLabel(pollIntervalMs);

  // NOTE(review): the call sites visible here pass intervals already resolved
  // from the contract's configured values, so the trailing "unless the
  // contract overrides it" clause may be redundant — confirm before rewording.
  const sentences = [
    "Keep polling while the active owner is alive.",
    `Emit a user-facing progress update only on a meaningful change (stage or rung transition, new checkpoint or artifact, owner exit, stop or escalation match, or anomaly) or once every ${keepaliveLabel} as a keepalive; do not ask the user to trigger the next poll.`,
    `Internal poll cadence for ${stageLabel} is ${pollLabel} unless the contract overrides it.`,
  ];
  return sentences.join(" ");
}
|
|
159
|
+
|
|
144
160
|
function archiveAutoArtifact(targetDir, relativePath, campaignId, now) {
|
|
145
161
|
const absolutePath = path.join(targetDir, relativePath);
|
|
146
162
|
if (!fs.existsSync(absolutePath)) {
|
|
@@ -184,6 +200,7 @@ function buildRolledOverAutoMode(mode, requested, now) {
|
|
|
184
200
|
maxWallClockTime: mode.maxWallClockTime,
|
|
185
201
|
maxFailures: mode.maxFailures,
|
|
186
202
|
pollInterval: mode.pollInterval,
|
|
203
|
+
progressReportInterval: mode.progressReportInterval,
|
|
187
204
|
stageCommands: {
|
|
188
205
|
run: "",
|
|
189
206
|
iterate: "",
|
|
@@ -554,7 +571,6 @@ async function startAutoMode({ targetDir, now = new Date(), requestedContract =
|
|
|
554
571
|
writeAutoStatus(targetDir, status, { lang });
|
|
555
572
|
|
|
556
573
|
const startedAt = status.startedAt;
|
|
557
|
-
const pollIntervalMs = parseDurationMs(mode.pollInterval, 1000);
|
|
558
574
|
const maxWallClockMs = parseDurationMs(mode.maxWallClockTime, 60 * 60 * 1000);
|
|
559
575
|
const deadlineMs = Date.now() + maxWallClockMs;
|
|
560
576
|
const maxFailures = parseInteger(mode.maxFailures, 0);
|
|
@@ -572,6 +588,14 @@ async function startAutoMode({ targetDir, now = new Date(), requestedContract =
|
|
|
572
588
|
let stopReason = "";
|
|
573
589
|
let finalRung = "";
|
|
574
590
|
const campaignId = resumePlan?.campaignId || `auto-${startedAt.replace(/[:.]/g, "-")}`;
|
|
591
|
+
const initialPollIntervalMs = resolveAutoPollIntervalMs({
|
|
592
|
+
configuredPollInterval: mode.pollInterval,
|
|
593
|
+
stage: status.currentStage,
|
|
594
|
+
});
|
|
595
|
+
const initialProgressReportIntervalMs = resolveAutoProgressReportIntervalMs({
|
|
596
|
+
configuredProgressReportInterval: mode.progressReportInterval,
|
|
597
|
+
stage: status.currentStage,
|
|
598
|
+
});
|
|
575
599
|
let currentLedger = {
|
|
576
600
|
campaignId,
|
|
577
601
|
objective: mode.objective,
|
|
@@ -587,7 +611,11 @@ async function startAutoMode({ targetDir, now = new Date(), requestedContract =
|
|
|
587
611
|
lastCheckpoint: resumePlan?.lastCheckpoint || "",
|
|
588
612
|
checkpointSummary: resumePlan?.reason || "auto loop armed and waiting for the first owned command",
|
|
589
613
|
nextTransition: resumePlan?.rungId || "",
|
|
590
|
-
continueBoundary:
|
|
614
|
+
continueBoundary: buildContinueBoundary({
|
|
615
|
+
stage: status.currentStage,
|
|
616
|
+
pollIntervalMs: initialPollIntervalMs,
|
|
617
|
+
progressReportIntervalMs: initialProgressReportIntervalMs,
|
|
618
|
+
}),
|
|
591
619
|
stopBoundary: mode.stopConditions,
|
|
592
620
|
escalationBoundary: mode.escalationConditions,
|
|
593
621
|
requiredReadSet: ".lab/context/eval-protocol.md, .lab/context/auto-mode.md, .lab/context/auto-status.md, .lab/context/auto-ledger.md, .lab/context/auto-outcome.md",
|
|
@@ -707,6 +735,14 @@ async function startAutoMode({ targetDir, now = new Date(), requestedContract =
|
|
|
707
735
|
while (!stageCompleted) {
|
|
708
736
|
try {
|
|
709
737
|
const contract = stageContractSnapshot(targetDir, stage);
|
|
738
|
+
const pollIntervalMs = resolveAutoPollIntervalMs({
|
|
739
|
+
configuredPollInterval: mode.pollInterval,
|
|
740
|
+
stage,
|
|
741
|
+
});
|
|
742
|
+
const progressReportIntervalMs = resolveAutoProgressReportIntervalMs({
|
|
743
|
+
configuredProgressReportInterval: mode.progressReportInterval,
|
|
744
|
+
stage,
|
|
745
|
+
});
|
|
710
746
|
await runCommandWithPolling({
|
|
711
747
|
targetDir,
|
|
712
748
|
stage,
|
|
@@ -720,7 +756,15 @@ async function startAutoMode({ targetDir, now = new Date(), requestedContract =
|
|
|
720
756
|
watchTarget,
|
|
721
757
|
nextRung,
|
|
722
758
|
ownerInfo: { ownerType: "local-process" },
|
|
723
|
-
updateLedger:
|
|
759
|
+
updateLedger: (overrides = {}) =>
|
|
760
|
+
writeLedger({
|
|
761
|
+
continueBoundary: buildContinueBoundary({
|
|
762
|
+
stage,
|
|
763
|
+
pollIntervalMs,
|
|
764
|
+
progressReportIntervalMs,
|
|
765
|
+
}),
|
|
766
|
+
...overrides,
|
|
767
|
+
}),
|
|
724
768
|
});
|
|
725
769
|
verifyStageContract({ stage, snapshot: contract.snapshot });
|
|
726
770
|
executedStages.push(stage);
|
|
@@ -816,7 +860,10 @@ async function startAutoMode({ targetDir, now = new Date(), requestedContract =
|
|
|
816
860
|
targetDir,
|
|
817
861
|
stage: "promotion",
|
|
818
862
|
command: mode.promotionCommand,
|
|
819
|
-
pollIntervalMs
|
|
863
|
+
pollIntervalMs: resolveAutoPollIntervalMs({
|
|
864
|
+
configuredPollInterval: mode.pollInterval,
|
|
865
|
+
stage: "promotion",
|
|
866
|
+
}),
|
|
820
867
|
deadlineMs,
|
|
821
868
|
startedAt,
|
|
822
869
|
status: currentStatus,
|
package/lib/auto_state.cjs
CHANGED
|
@@ -36,6 +36,7 @@ function parseAutoMode(targetDir) {
|
|
|
36
36
|
maxWallClockTime: extractValue(text, ["Max wall-clock time", "最大运行时长"]),
|
|
37
37
|
maxFailures: extractValue(text, ["Max failures", "最大失败次数"]),
|
|
38
38
|
pollInterval: extractValue(text, ["Poll interval", "轮询间隔"]),
|
|
39
|
+
progressReportInterval: extractValue(text, ["Progress report interval", "进度汇报间隔"]),
|
|
39
40
|
stageCommands: {
|
|
40
41
|
run: extractValue(text, ["Run command", "运行命令"]),
|
|
41
42
|
iterate: extractValue(text, ["Iterate command", "迭代命令"]),
|
package/lib/i18n.cjs
CHANGED
|
@@ -207,7 +207,7 @@ const ZH_SKILL_FILES = {
|
|
|
207
207
|
- 本轮假设
|
|
208
208
|
- 具体代码或配置改动
|
|
209
209
|
- 标准化评估摘要
|
|
210
|
-
-
|
|
210
|
+
- 写入 \`.lab/iterations/\` 的书面 iteration report
|
|
211
211
|
- 写入 \`results_root\` 的持久 run 输出
|
|
212
212
|
- continue 或 stop 决策
|
|
213
213
|
- 风险连续两轮升高时触发 diagnostic mode
|
|
@@ -235,6 +235,7 @@ const ZH_SKILL_FILES = {
|
|
|
235
235
|
- 不要修改指标定义、baseline 语义或对比方法实现,除非评估协议已经记录了来源和与原始实现的偏差。
|
|
236
236
|
- 如果要调整 ladder、样本量或升格 gate,必须继续锚定到带来源的评估协议,而不是靠聊天临时判断。
|
|
237
237
|
- durable findings 和证据边界变化应写回 canonical context,再刷新派生的 \`state.md\` 快照;当前执行进度只写 \`workflow-state.md\`。
|
|
238
|
+
- \`.lab/iterations/*.md\` 属于对外可读的 workflow artifact。如果 workflow language 是中文,iteration report、本轮摘要、阻塞点和 continue-or-stop 决策都应使用中文,除非文件路径、代码标识符或字面指标名必须保持原样。
|
|
238
239
|
|
|
239
240
|
## 交互约束
|
|
240
241
|
|
|
@@ -887,13 +888,13 @@ const ZH_SKILL_FILES = {
|
|
|
887
888
|
- [ ] 产出 final report。
|
|
888
889
|
`,
|
|
889
890
|
[path.join(".lab", ".managed", "templates", "iteration-report.md")]:
|
|
890
|
-
`#
|
|
891
|
+
`# 迭代报告
|
|
891
892
|
|
|
892
893
|
## 轮次
|
|
893
894
|
|
|
894
|
-
-
|
|
895
|
-
-
|
|
896
|
-
-
|
|
895
|
+
- 迭代编号:
|
|
896
|
+
- 运行 id:
|
|
897
|
+
- 完成承诺:
|
|
897
898
|
|
|
898
899
|
## 假设
|
|
899
900
|
|
|
@@ -905,27 +906,27 @@ const ZH_SKILL_FILES = {
|
|
|
905
906
|
|
|
906
907
|
## 评估摘要
|
|
907
908
|
|
|
908
|
-
-
|
|
909
|
-
-
|
|
910
|
-
-
|
|
911
|
-
-
|
|
912
|
-
-
|
|
913
|
-
-
|
|
914
|
-
-
|
|
909
|
+
- 数据集:
|
|
910
|
+
- 划分:
|
|
911
|
+
- 基线:
|
|
912
|
+
- 变体:
|
|
913
|
+
- 主指标:
|
|
914
|
+
- 次级指标:
|
|
915
|
+
- 失败次数:
|
|
915
916
|
|
|
916
917
|
## 专家批评
|
|
917
918
|
|
|
918
|
-
-
|
|
919
|
-
-
|
|
920
|
-
-
|
|
919
|
+
- 主要关注点:
|
|
920
|
+
- 方法论关注点:
|
|
921
|
+
- 解释风险:
|
|
921
922
|
- 是否进入 diagnostic mode:
|
|
922
923
|
|
|
923
924
|
## 决策
|
|
924
925
|
|
|
925
|
-
-
|
|
926
|
-
-
|
|
927
|
-
-
|
|
928
|
-
-
|
|
926
|
+
- 继续还是停止:
|
|
927
|
+
- 下一步动作:
|
|
928
|
+
- 主要阻塞点:
|
|
929
|
+
- 次优后续动作:
|
|
929
930
|
`,
|
|
930
931
|
[path.join(".lab", ".managed", "templates", "review-checklist.md")]:
|
|
931
932
|
`# Review Checklist
|
|
@@ -1597,6 +1598,9 @@ const ZH_SKILL_FILES = {
|
|
|
1597
1598
|
- Max wall-clock time:
|
|
1598
1599
|
- Max failures:
|
|
1599
1600
|
- Poll interval:
|
|
1601
|
+
- 如果 \`Poll interval\` 留空,使用保守默认值:\`run\`/\`iterate\` 每 \`2m\` 轮询一次,其他 auto 轮询默认 \`15s\`。
|
|
1602
|
+
- Progress report interval:
|
|
1603
|
+
- 如果 \`Progress report interval\` 留空,只在出现有意义变化时对用户汇报;如果一直没有新变化,则 \`run\`/\`iterate\` 最多每 \`10m\` 发一次保活更新,其他 auto 轮询最多每 \`2m\` 发一次保活更新。
|
|
1600
1604
|
|
|
1601
1605
|
## 阶段命令
|
|
1602
1606
|
|
|
@@ -2180,7 +2184,7 @@ ZH_CONTENT[path.join(".codex", "prompts", "lab-data.md")] = codexPrompt(
|
|
|
2180
2184
|
ZH_CONTENT[path.join(".codex", "prompts", "lab-auto.md")] = codexPrompt(
|
|
2181
2185
|
"在已批准边界内编排自动实验循环",
|
|
2182
2186
|
"auto mode objective",
|
|
2183
|
-
"使用已安装的 `lab` 技能:`.codex/skills/lab/SKILL.md`。\n\n继续当前活动:`/lab-auto: 继续`。\n以最高执行级别继续当前活动:`/lab-auto: L3,继续`。\n\n立刻针对用户当前给出的参数执行 `/lab:auto`,不要只推荐别的 `/lab` 阶段。只有在缺少阻塞性前提时,才明确指出缺什么,并且一次最多追问一个问题。\n\n先把用户请求规范化成可交给 CLI 的 auto contract 字段:objective、autonomy level、campaign kind、allowed stages,以及任何不改变范围就能明确的 terminal-goal 提示。\n如果用户没写级别,默认按 `L2` 处理;接受 `L1/L2/L3`、`l1/l2/l3` 这类短写,`最高级别`、`最高自治` 也按 `L3` 处理。\n如果用户只写了 `继续`,且当前已有 active 或可恢复 campaign,就直接继承当前 campaign 的级别,而不是要求用户重复写。\n如果你想沿用 runtime 里已存的 campaign 级别继续,就直接写 `/lab-auto: 继续`。\n只有当级别本身真的有歧义时,才停下来追问,例如 `第三层`、`phase 3`、`table 3`。\n已批准的 `L2` 和 `L3` 执行 campaign 默认进入执行模式。\n在执行模式里,不要进入 brainstorming,不要进入 spec review,也不要为了常规实现路径选择、helper script、路径修正、数据集适配、同 family 候选切换或普通自检而生成 reviewer、explorer 或其他子智能体循环。\n只有当用户明确要求设计或审阅帮助、contract fit 需要新 campaign,或 contract 的 escalation condition 明确要求独立复核时,才从执行模式切到设计或 reviewer 模式。\n你的第一步执行动作必须是对当前项目运行 `superlab auto start`,而不是自己直接改写 `.lab/context/auto-mode.md`、`.lab/context/auto-status.md`、`.lab/context/auto-ledger.md` 或 `.lab/context/auto-outcome.md`。\n把规范化后的字段通过 CLI 参数传下去,包括 `--objective`、`--campaign-kind`、`--allowed-stages`,以及在用户已明确或已隐含时传 `--autonomy-level`。\nCLI 返回后的 runtime 结果才是事实来源。如果 CLI 报 rollover、conflict、缺字段,或已经成功启动 campaign,就如实回报,不要绕过 CLI 自己做 prompt 侧写回。\n\n本命令运行 `/lab:auto` 阶段。它必须读取 `.lab/context/eval-protocol.md`、`.lab/context/auto-mode.md`、`.lab/context/auto-status.md`、`.lab/context/auto-ledger.md` 与 `.lab/context/auto-outcome.md`,先确认 autonomy level、approval status、terminal goal schema,以及 primary gate、secondary guard、promotion condition、stop reason、escalation reason,再把 eval-protocol 里的指标释义、主表计划、来源约束与结构化实验阶梯当作执行依据,在不修改 mission、framing 和核心 claims 的前提下编排已批准的 `run`、`iterate`、`review`、`report`,轮询长任务完成情况;如果声明了 rung,就保持会话活着并按 rung 转移继续推进。\n首个可见输出块必须是 `Auto preflight`。这个块必须列出已读取文件,并回显 `Autonomy level`、`Approval status`、`Allowed stages`、`Terminal goal`、`Primary gate` 和 `Secondary guard`,然后才能进入执行摘要或动作计划。\n如果 preflight 所需字段缺失、过期或彼此冲突,就必须在执行前停下,并明确指出到底是哪一个字段阻止了 loop 启动。\n当 loop 活着时,必须把当前 
owner、观察状态、checkpoint 摘要、继续边界、停止边界和恢复读取集合写进 `.lab/context/auto-ledger.md`。\n如果仓库的 workflow language 是中文,摘要、清单条目、任务标签和进度更新都必须使用中文,除非某个文件路径、代码标识符或字面指标名必须保持原样。\n把 `Layer 3`、`Phase 1`、`Table 2` 这类表达视为论文范围目标;只有显式写成 `Autonomy level L3` 或 `自治级别 L3` 时,才把它当成执行权限级别。\n不要用 `sleep 30`、单次 `pgrep` 或一次性的 `metrics.json`
|
|
2187
|
+
"使用已安装的 `lab` 技能:`.codex/skills/lab/SKILL.md`。\n\n继续当前活动:`/lab-auto: 继续`。\n以最高执行级别继续当前活动:`/lab-auto: L3,继续`。\n\n立刻针对用户当前给出的参数执行 `/lab:auto`,不要只推荐别的 `/lab` 阶段。只有在缺少阻塞性前提时,才明确指出缺什么,并且一次最多追问一个问题。\n\n先把用户请求规范化成可交给 CLI 的 auto contract 字段:objective、autonomy level、campaign kind、allowed stages,以及任何不改变范围就能明确的 terminal-goal 提示。\n如果用户没写级别,默认按 `L2` 处理;接受 `L1/L2/L3`、`l1/l2/l3` 这类短写,`最高级别`、`最高自治` 也按 `L3` 处理。\n如果用户只写了 `继续`,且当前已有 active 或可恢复 campaign,就直接继承当前 campaign 的级别,而不是要求用户重复写。\n如果你想沿用 runtime 里已存的 campaign 级别继续,就直接写 `/lab-auto: 继续`。\n只有当级别本身真的有歧义时,才停下来追问,例如 `第三层`、`phase 3`、`table 3`。\n已批准的 `L2` 和 `L3` 执行 campaign 默认进入执行模式。\n在执行模式里,不要进入 brainstorming,不要进入 spec review,也不要为了常规实现路径选择、helper script、路径修正、数据集适配、同 family 候选切换或普通自检而生成 reviewer、explorer 或其他子智能体循环。\n只有当用户明确要求设计或审阅帮助、contract fit 需要新 campaign,或 contract 的 escalation condition 明确要求独立复核时,才从执行模式切到设计或 reviewer 模式。\n你的第一步执行动作必须是对当前项目运行 `superlab auto start`,而不是自己直接改写 `.lab/context/auto-mode.md`、`.lab/context/auto-status.md`、`.lab/context/auto-ledger.md` 或 `.lab/context/auto-outcome.md`。\n把规范化后的字段通过 CLI 参数传下去,包括 `--objective`、`--campaign-kind`、`--allowed-stages`,以及在用户已明确或已隐含时传 `--autonomy-level`。\nCLI 返回后的 runtime 结果才是事实来源。如果 CLI 报 rollover、conflict、缺字段,或已经成功启动 campaign,就如实回报,不要绕过 CLI 自己做 prompt 侧写回。\n\n本命令运行 `/lab:auto` 阶段。它必须读取 `.lab/context/eval-protocol.md`、`.lab/context/auto-mode.md`、`.lab/context/auto-status.md`、`.lab/context/auto-ledger.md` 与 `.lab/context/auto-outcome.md`,先确认 autonomy level、approval status、terminal goal schema,以及 primary gate、secondary guard、promotion condition、stop reason、escalation reason,再把 eval-protocol 里的指标释义、主表计划、来源约束与结构化实验阶梯当作执行依据,在不修改 mission、framing 和核心 claims 的前提下编排已批准的 `run`、`iterate`、`review`、`report`,轮询长任务完成情况;如果声明了 rung,就保持会话活着并按 rung 转移继续推进。\n首个可见输出块必须是 `Auto preflight`。这个块必须列出已读取文件,并回显 `Autonomy level`、`Approval status`、`Allowed stages`、`Terminal goal`、`Primary gate` 和 `Secondary guard`,然后才能进入执行摘要或动作计划。\n如果 preflight 所需字段缺失、过期或彼此冲突,就必须在执行前停下,并明确指出到底是哪一个字段阻止了 loop 启动。\n当 loop 活着时,必须把当前 
owner、观察状态、checkpoint 摘要、继续边界、停止边界和恢复读取集合写进 `.lab/context/auto-ledger.md`。\n如果仓库的 workflow language 是中文,摘要、清单条目、任务标签和进度更新都必须使用中文,除非某个文件路径、代码标识符或字面指标名必须保持原样。\n把 `Layer 3`、`Phase 1`、`Table 2` 这类表达视为论文范围目标;只有显式写成 `Autonomy level L3` 或 `自治级别 L3` 时,才把它当成执行权限级别。\n不要用 `sleep 30`、单次 `pgrep` 或一次性的 `metrics.json` 探针来代替真实长任务命令;当真实实验进程还活着时,只允许在出现有意义变化时发进度更新,并继续等待。没有新变化时,也只按保活节奏汇报,不要让用户触发下一次轮询。"
|
|
2184
2188
|
);
|
|
2185
2189
|
|
|
2186
2190
|
ZH_CONTENT[path.join(".claude", "commands", "lab.md")] = claudeCommand(
|
|
@@ -2201,7 +2205,7 @@ ZH_CONTENT[path.join(".claude", "commands", "lab-auto.md")] = claudeCommand(
|
|
|
2201
2205
|
"lab-auto",
|
|
2202
2206
|
"在已批准边界内编排自动实验循环",
|
|
2203
2207
|
"auto mode objective",
|
|
2204
|
-
"使用已安装的 `lab` 技能:`.claude/skills/lab/SKILL.md`。\n\n继续当前活动:`/lab-auto: 继续`。\n以最高执行级别继续当前活动:`/lab-auto: L3,继续`。\n\n立刻针对用户当前给出的参数执行 `auto` 阶段,不要只推荐别的 lab 阶段。只有在缺少阻塞性前提时,才明确指出缺什么,并且一次最多追问一个问题。\n\n先把用户请求规范化成可交给 CLI 的 auto contract 字段:objective、autonomy level、campaign kind、allowed stages,以及任何不改变范围就能明确的 terminal-goal 提示。\n如果用户没写级别,默认按 `L2` 处理;接受 `L1/L2/L3`、`l1/l2/l3` 这类短写,`最高级别`、`最高自治` 也按 `L3` 处理。\n如果用户只写了 `继续`,且当前已有 active 或可恢复 campaign,就直接继承当前 campaign 的级别,而不是要求用户重复写。\n如果你想沿用 runtime 里已存的 campaign 级别继续,就直接写 `/lab-auto: 继续`。\n只有当级别本身真的有歧义时,才停下来追问,例如 `第三层`、`phase 3`、`table 3`。\n已批准的 `L2` 和 `L3` 执行 campaign 默认进入执行模式。\n在执行模式里,不要进入 brainstorming,不要进入 spec review,也不要为了常规实现路径选择、helper script、路径修正、数据集适配、同 family 候选切换或普通自检而生成 reviewer、explorer 或其他子智能体循环。\n只有当用户明确要求设计或审阅帮助、contract fit 需要新 campaign,或 contract 的 escalation condition 明确要求独立复核时,才从执行模式切到设计或 reviewer 模式。\n你的第一步执行动作必须是对当前项目运行 `superlab auto start`,而不是自己直接改写 `.lab/context/auto-mode.md`、`.lab/context/auto-status.md`、`.lab/context/auto-ledger.md` 或 `.lab/context/auto-outcome.md`。\n把规范化后的字段通过 CLI 参数传下去,包括 `--objective`、`--campaign-kind`、`--allowed-stages`,以及在用户已明确或已隐含时传 `--autonomy-level`。\nCLI 返回后的 runtime 结果才是事实来源。如果 CLI 报 rollover、conflict、缺字段,或已经成功启动 campaign,就如实回报,不要绕过 CLI 自己做 prompt 侧写回。\n\n本命令运行 lab workflow 的 `auto` 阶段。它必须读取 `.lab/context/eval-protocol.md`、`.lab/context/auto-mode.md`、`.lab/context/auto-status.md`、`.lab/context/auto-ledger.md` 与 `.lab/context/auto-outcome.md`,先确认 autonomy level、approval status、terminal goal schema,以及 primary gate、secondary guard、promotion condition、stop reason、escalation reason,再把 eval-protocol 里的指标释义、主表计划、来源约束与结构化实验阶梯当作执行依据,在不修改 mission、framing 和核心 claims 的前提下编排已批准的 `run`、`iterate`、`review`、`report`,轮询长任务完成情况;如果声明了 rung,就保持会话活着并按 rung 转移继续推进。\n首个可见输出块必须是 `Auto preflight`。这个块必须列出已读取文件,并回显 `Autonomy level`、`Approval status`、`Allowed stages`、`Terminal goal`、`Primary gate` 和 `Secondary guard`,然后才能进入执行摘要或动作计划。\n如果 preflight 所需字段缺失、过期或彼此冲突,就必须在执行前停下,并明确指出到底是哪一个字段阻止了 loop 启动。\n当 loop 
活着时,必须把当前 owner、观察状态、checkpoint 摘要、继续边界、停止边界和恢复读取集合写进 `.lab/context/auto-ledger.md`。\n如果仓库的 workflow language 是中文,摘要、清单条目、任务标签和进度更新都必须使用中文,除非某个文件路径、代码标识符或字面指标名必须保持原样。\n把 `Layer 3`、`Phase 1`、`Table 2` 这类表达视为论文范围目标;只有显式写成 `Autonomy level L3` 或 `自治级别 L3` 时,才把它当成执行权限级别。\n不要用 `sleep 30`、单次 `pgrep` 或一次性的 `metrics.json`
|
|
2208
|
+
"使用已安装的 `lab` 技能:`.claude/skills/lab/SKILL.md`。\n\n继续当前活动:`/lab-auto: 继续`。\n以最高执行级别继续当前活动:`/lab-auto: L3,继续`。\n\n立刻针对用户当前给出的参数执行 `auto` 阶段,不要只推荐别的 lab 阶段。只有在缺少阻塞性前提时,才明确指出缺什么,并且一次最多追问一个问题。\n\n先把用户请求规范化成可交给 CLI 的 auto contract 字段:objective、autonomy level、campaign kind、allowed stages,以及任何不改变范围就能明确的 terminal-goal 提示。\n如果用户没写级别,默认按 `L2` 处理;接受 `L1/L2/L3`、`l1/l2/l3` 这类短写,`最高级别`、`最高自治` 也按 `L3` 处理。\n如果用户只写了 `继续`,且当前已有 active 或可恢复 campaign,就直接继承当前 campaign 的级别,而不是要求用户重复写。\n如果你想沿用 runtime 里已存的 campaign 级别继续,就直接写 `/lab-auto: 继续`。\n只有当级别本身真的有歧义时,才停下来追问,例如 `第三层`、`phase 3`、`table 3`。\n已批准的 `L2` 和 `L3` 执行 campaign 默认进入执行模式。\n在执行模式里,不要进入 brainstorming,不要进入 spec review,也不要为了常规实现路径选择、helper script、路径修正、数据集适配、同 family 候选切换或普通自检而生成 reviewer、explorer 或其他子智能体循环。\n只有当用户明确要求设计或审阅帮助、contract fit 需要新 campaign,或 contract 的 escalation condition 明确要求独立复核时,才从执行模式切到设计或 reviewer 模式。\n你的第一步执行动作必须是对当前项目运行 `superlab auto start`,而不是自己直接改写 `.lab/context/auto-mode.md`、`.lab/context/auto-status.md`、`.lab/context/auto-ledger.md` 或 `.lab/context/auto-outcome.md`。\n把规范化后的字段通过 CLI 参数传下去,包括 `--objective`、`--campaign-kind`、`--allowed-stages`,以及在用户已明确或已隐含时传 `--autonomy-level`。\nCLI 返回后的 runtime 结果才是事实来源。如果 CLI 报 rollover、conflict、缺字段,或已经成功启动 campaign,就如实回报,不要绕过 CLI 自己做 prompt 侧写回。\n\n本命令运行 lab workflow 的 `auto` 阶段。它必须读取 `.lab/context/eval-protocol.md`、`.lab/context/auto-mode.md`、`.lab/context/auto-status.md`、`.lab/context/auto-ledger.md` 与 `.lab/context/auto-outcome.md`,先确认 autonomy level、approval status、terminal goal schema,以及 primary gate、secondary guard、promotion condition、stop reason、escalation reason,再把 eval-protocol 里的指标释义、主表计划、来源约束与结构化实验阶梯当作执行依据,在不修改 mission、framing 和核心 claims 的前提下编排已批准的 `run`、`iterate`、`review`、`report`,轮询长任务完成情况;如果声明了 rung,就保持会话活着并按 rung 转移继续推进。\n首个可见输出块必须是 `Auto preflight`。这个块必须列出已读取文件,并回显 `Autonomy level`、`Approval status`、`Allowed stages`、`Terminal goal`、`Primary gate` 和 `Secondary guard`,然后才能进入执行摘要或动作计划。\n如果 preflight 所需字段缺失、过期或彼此冲突,就必须在执行前停下,并明确指出到底是哪一个字段阻止了 loop 启动。\n当 loop 
活着时,必须把当前 owner、观察状态、checkpoint 摘要、继续边界、停止边界和恢复读取集合写进 `.lab/context/auto-ledger.md`。\n如果仓库的 workflow language 是中文,摘要、清单条目、任务标签和进度更新都必须使用中文,除非某个文件路径、代码标识符或字面指标名必须保持原样。\n把 `Layer 3`、`Phase 1`、`Table 2` 这类表达视为论文范围目标;只有显式写成 `Autonomy level L3` 或 `自治级别 L3` 时,才把它当成执行权限级别。\n不要用 `sleep 30`、单次 `pgrep` 或一次性的 `metrics.json` 探针来代替真实长任务命令;当真实实验进程还活着时,只允许在出现有意义变化时发进度更新,并继续等待。没有新变化时,也只按保活节奏汇报,不要让用户触发下一次轮询。"
|
|
2205
2209
|
);
|
|
2206
2210
|
|
|
2207
2211
|
const zhAutoPriorityCodexLine =
|
|
@@ -2854,6 +2858,7 @@ ZH_CONTENT[path.join(".codex", "skills", "lab", "stages", "auto.md")] = `# \`/la
|
|
|
2854
2858
|
- 如果用户显式调用 \`/lab:auto\` 或 \`/lab-auto\`,就保持在 auto 执行路径里;只要请求仍在已批准 execution envelope 内,即使目标听起来像 feature selection、baseline selection、离散化或 candidate sweep,也不要重新路由到 brainstorming 或 spec review。
|
|
2855
2859
|
- 如果用户同时提了论文层、实验 phase 和自治级别,先用一句话重述:objective、自治级别、terminal goal、scope、allowed modifications。
|
|
2856
2860
|
- 如果 workflow language 是中文,摘要、清单条目、任务标签和进度更新都应使用中文,除非文件路径、代码标识符或字面指标名必须保持原样。
|
|
2861
|
+
- 对 auto 写出的 \`.lab/iterations/*.md\` 也应用同样的 workflow-language 规则,包括本轮摘要、阻塞点、diagnostic notes 和 continue-or-stop 决策。
|
|
2857
2862
|
- 当循环进入 \`report\` 时,要主动给出用户可读的白话总结,解释主指标、次级指标和主表作用;不要等用户额外发一句“解释这些指标”。
|
|
2858
2863
|
- 当循环即将进入 \`write\`,且 \`paper_template_root\` 为空时:
|
|
2859
2864
|
- 如果 \`paper_template_decision\` 是 \`unconfirmed\`,必须先追问一次:继续使用默认 scaffold,还是先接入模板目录
|
|
@@ -98,6 +98,8 @@ Treat all of these as equivalent stage requests:
|
|
|
98
98
|
- That first visible output must show files read plus `Autonomy level`, `Allowed stages`, `Terminal goal`, `Primary gate`, and `Secondary guard`.
|
|
99
99
|
- If the preflight block cannot be completed from `.lab/context/eval-protocol.md`, `.lab/context/auto-mode.md`, `.lab/context/auto-status.md`, `.lab/context/auto-ledger.md`, and `.lab/context/auto-outcome.md`, `/lab auto` should stop instead of acting like the loop is armed.
|
|
100
100
|
- While the loop is alive, `/lab auto` should keep `.lab/context/auto-ledger.md` updated with the active owner, observed state, and resume boundary.
|
|
101
|
+
- Separate internal polling from user-facing progress reports.
|
|
102
|
+
- While the loop is healthy, `/lab auto` should report to the user only on a meaningful change or at the keepalive cadence recorded in the current contract or runtime state, and it should not ask the user to trigger the next poll.
|
|
101
103
|
|
|
102
104
|
- Treat `Autonomy level L1/L2/L3` as the execution privilege level, not as a paper layer, phase, or table number.
|
|
103
105
|
- Treat `paper layer`, `phase`, and `table` as experiment targets. For example, `paper layer 3` or `Phase 1` should not be interpreted as `Autonomy level L3`.
|
|
@@ -92,6 +92,8 @@ Treat all of these as equivalent stage requests:
|
|
|
92
92
|
- That first visible output must show files read plus `Autonomy level`, `Allowed stages`, `Terminal goal`, `Primary gate`, and `Secondary guard`.
|
|
93
93
|
- If the preflight block cannot be completed from `.lab/context/eval-protocol.md`, `.lab/context/auto-mode.md`, `.lab/context/auto-status.md`, `.lab/context/auto-ledger.md`, and `.lab/context/auto-outcome.md`, `/lab:auto` should stop instead of acting like the loop is armed.
|
|
94
94
|
- While the loop is alive, `/lab:auto` should keep `.lab/context/auto-ledger.md` updated with the active owner, observed state, and resume boundary.
|
|
95
|
+
- Separate internal polling from user-facing progress reports.
|
|
96
|
+
- While the loop is healthy, `/lab:auto` should report to the user only on a meaningful change or at the keepalive cadence recorded in the current contract or runtime state, and it should not ask the user to trigger the next poll.
|
|
95
97
|
|
|
96
98
|
- Treat `Autonomy level L1/L2/L3` as the execution privilege level, not as a paper layer, phase, or table number.
|
|
97
99
|
- Treat `paper layer`, `phase`, and `table` as experiment targets. For example, `paper layer 3` or `Phase 1` should not be interpreted as `Autonomy level L3`.
|
|
@@ -40,6 +40,9 @@ Use `.lab/context/auto-ledger.md` as the live runtime ledger for ownership, chec
|
|
|
40
40
|
- Max wall-clock time:
|
|
41
41
|
- Max failures:
|
|
42
42
|
- Poll interval:
|
|
43
|
+
- Leave `Poll interval` blank to use conservative defaults: `run`/`iterate` poll every `2m`; other auto polling defaults to `15s`.
|
|
44
|
+
- Progress report interval:
|
|
45
|
+
- Leave `Progress report interval` blank to report to the user only on a meaningful change, plus a keepalive at most every `10m` for `run`/`iterate` and every `2m` for other auto polling.
|
|
43
46
|
|
|
44
47
|
## Stage Commands
|
|
45
48
|
|
|
@@ -28,6 +28,7 @@ Use this skill when the user invokes `/lab:*` or asks for the structured researc
|
|
|
28
28
|
- Use `.lab/context/eval-protocol.md` as the shared evaluation contract for run, iterate, auto, and report stages, including metric glossary and experiment ladder semantics.
|
|
29
29
|
- Treat evaluation semantics as source-backed once evaluation planning starts: metrics, benchmark gates, baseline behavior, comparison implementations, and deviations should come from recorded sources, not memory.
|
|
30
30
|
- Workflow artifacts should follow the installed workflow language.
|
|
31
|
+
- Iteration artifacts under `.lab/iterations/` are workflow artifacts and should follow the installed workflow language.
|
|
31
32
|
- Final paper output should default to LaTeX, and its manuscript language should be decided separately from the workflow language.
|
|
32
33
|
- Separate sourced facts from model-generated hypotheses.
|
|
33
34
|
- Preserve failed runs, failed ideas, and limitations.
|
|
@@ -174,6 +175,7 @@ Use this skill when the user invokes `/lab:*` or asks for the structured researc
|
|
|
174
175
|
- Keep metric definitions, baseline behavior, and comparison implementations anchored to the source-backed evaluation protocol before changing thresholds, gates, or ladder transitions.
|
|
175
176
|
- Switch to diagnostic mode if risk increases for two consecutive rounds.
|
|
176
177
|
- Write round reports with `.lab/.managed/templates/iteration-report.md`.
|
|
178
|
+
- Write round reports under `.lab/iterations/` with `.lab/.managed/templates/iteration-report.md`, and keep those iteration artifacts in the workflow language.
|
|
177
179
|
- Update `.lab/context/workflow-state.md`, `.lab/context/decisions.md`, `.lab/context/evidence-index.md`, `.lab/context/open-questions.md`, and `.lab/context/eval-protocol.md` each round as needed, then refresh derived views.
|
|
178
180
|
- Keep `.lab/context/eval-protocol.md` synchronized with accepted ladder changes, benchmark scope, and source-backed implementation deviations.
|
|
179
181
|
- Stop at threshold success or iteration cap, and record blockers plus next-best actions when the campaign ends without success.
|
|
@@ -77,6 +77,10 @@
|
|
|
77
77
|
- You may promote exploratory additions to the primary package only when the contract's promotion policy is satisfied and the promotion is written back into `data-decisions.md`, `decisions.md`, and `workflow-state.md`, then refresh derived views.
|
|
78
78
|
- Poll long-running commands until they finish, hit a timeout, or hit a stop condition.
|
|
79
79
|
- Keep a poll-based waiting loop instead of sleeping blindly.
|
|
80
|
+
- When `Poll interval` is blank, use conservative defaults: `run` and `iterate` poll every `2m`; other auto polling defaults to `15s`.
|
|
81
|
+
- Separate internal polling from user-facing progress reports.
|
|
82
|
+
- When `Progress report interval` is blank, report to the user only on a meaningful change, plus a keepalive at most every `10m` for `run`/`iterate` and every `2m` for other auto polling.
|
|
83
|
+
- A meaningful change means a stage or rung transition, a new checkpoint or artifact, an owner exit, a stop or escalation match, or an anomaly signal.
|
|
80
84
|
- Do not treat a short watcher such as `sleep 30`, a one-shot `pgrep`, or a single `metrics.json` probe as the rung command when the real experiment is still running.
|
|
81
85
|
- Bind each rung to the real long-running command or process that owns the experiment result.
|
|
82
86
|
- Record the active owner as one of:
|
|
@@ -166,6 +170,7 @@
|
|
|
166
170
|
- allowed modifications
|
|
167
171
|
- Then ask at most one clarifying question if a blocking field is still missing.
|
|
168
172
|
- If `.lab/config/workflow.json` sets the workflow language to Chinese, write summaries, options, checklist items, task labels, and progress updates in Chinese unless a file path, code identifier, or literal metric name must remain unchanged.
|
|
173
|
+
- Apply the same workflow-language rule to auto-written iteration artifacts under `.lab/iterations/`, including round summaries, blocker lists, diagnostic notes, and continue-or-stop decisions.
|
|
169
174
|
- When the loop reaches `report`, apply the same workflow-language rule to `report.md` and the managed `main-tables.md` artifact.
|
|
170
175
|
- When the loop reaches `report`, proactively deliver a user-facing plain-language summary of the selected metrics, what they mean, what the tables prove, and what remains unproven. Do not wait for a separate user request asking for interpretation.
|
|
171
176
|
- When the loop is about to enter `write` and `paper_template_root` is empty:
|
|
@@ -180,3 +185,4 @@
|
|
|
180
185
|
- If the user chooses to keep the draft language, persist `paper_language_finalization_decision: keep-workflow-language`
|
|
181
186
|
- If the user chooses to convert, persist `paper_language_finalization_decision: convert-to-paper-language`
|
|
182
187
|
- While the real experiment process is still alive, emit only a progress update and keep waiting. Do not present a terminal summary for that rung until the process exits or the rung hits an explicit stop boundary.
|
|
188
|
+
- While the loop is healthy, do not ask the user to trigger the next poll. Keep polling until a meaningful change, keepalive boundary, stop boundary, escalation boundary, or terminal boundary is reached.
|
|
@@ -37,7 +37,7 @@ Declare and keep fixed:
|
|
|
37
37
|
- round hypothesis
|
|
38
38
|
- concrete code or config changes
|
|
39
39
|
- normalized evaluation summary
|
|
40
|
-
- written iteration report
|
|
40
|
+
- written iteration report under `.lab/iterations/`
|
|
41
41
|
- durable run outputs under `results_root`
|
|
42
42
|
- continue or stop decision
|
|
43
43
|
- diagnostic mode trigger when risk increases for two consecutive rounds
|
|
@@ -70,6 +70,7 @@ If the loop stops without success, record:
|
|
|
70
70
|
- Re-run the `Sanity and Alternative-Explanation Checks` whenever a round produces anomaly signals, suspiciously unchanged results, impl/result mismatches, or other outcomes that could still have simpler explanations.
|
|
71
71
|
- If a round reveals leakage risk, invalid scale comparisons, unsupported metric semantics, or overstated claim boundaries, treat that as a fatal methodological flaw instead of a normal failed iteration.
|
|
72
72
|
- If anomaly signals remain unresolved after implementation reality checks and at least one cross-check, switch to diagnostic mode instead of continuing as if the interpretation were settled.
|
|
73
|
+
- Keep `.lab/iterations/*.md` as reader-facing workflow artifacts. If `.lab/config/workflow.json` sets the workflow language to Chinese, write the iteration report, round summary, blocker list, and continue-or-stop decision in Chinese unless a file path, code identifier, or literal metric name must remain unchanged.
|
|
73
74
|
|
|
74
75
|
## Interaction Contract
|
|
75
76
|
|