superlab 0.1.48 → 0.1.50
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/superlab.cjs +18 -5
- package/lib/auto_contracts.cjs +19 -8
- package/lib/i18n.cjs +21 -19
- package/package-assets/shared/skills/lab/SKILL.md +2 -0
- package/package-assets/shared/skills/lab/stages/auto.md +1 -0
- package/package-assets/shared/skills/lab/stages/iterate.md +2 -1
- package/package.json +1 -1
package/bin/superlab.cjs
CHANGED
|
@@ -39,7 +39,8 @@ Usage:
|
|
|
39
39
|
superlab init [--target <dir>] [--platform codex|claude|both|all] [--lang en|zh] [--force]
|
|
40
40
|
superlab install [--target <dir>] [--platform codex|claude|both|all] [--lang en|zh] [--force]
|
|
41
41
|
superlab paper attach-template --path <dir> [--target <dir>]
|
|
42
|
-
superlab auto start [--target <dir>] [--
|
|
42
|
+
superlab auto start [--target <dir>] [--campaign-kind <kind>] [--allowed-stages <csv>] [--autonomy-level <L1|L2|L3>|--l1|--l2|--l3] [--objective <text>|<objective>]
|
|
43
|
+
superlab auto experiment [--target <dir>] [--l1|--l2|--l3] <objective>
|
|
43
44
|
superlab auto status [--target <dir>]
|
|
44
45
|
superlab auto stop [--target <dir>]
|
|
45
46
|
superlab update [--target <dir>]
|
|
@@ -219,7 +220,7 @@ function parsePaperArgs(argv) {
|
|
|
219
220
|
|
|
220
221
|
function parseAutoArgs(argv) {
|
|
221
222
|
const [action, ...rest] = argv;
|
|
222
|
-
if (!["start", "status", "stop"].includes(action || "")) {
|
|
223
|
+
if (!["start", "status", "stop", "experiment"].includes(action || "")) {
|
|
223
224
|
throw new Error(`Unknown auto action: ${action || "(missing)"}`);
|
|
224
225
|
}
|
|
225
226
|
const options = {
|
|
@@ -230,13 +231,14 @@ function parseAutoArgs(argv) {
|
|
|
230
231
|
requestedAllowedStages: "",
|
|
231
232
|
requestedAutonomyLevel: "",
|
|
232
233
|
};
|
|
234
|
+
const positionalObjectiveParts = [];
|
|
233
235
|
|
|
234
236
|
for (let index = 0; index < rest.length; index += 1) {
|
|
235
237
|
const value = rest[index];
|
|
236
238
|
if (value === "--target") {
|
|
237
239
|
options.targetDir = path.resolve(rest[index + 1]);
|
|
238
240
|
index += 1;
|
|
239
|
-
} else if (action === "start" && value === "--objective") {
|
|
241
|
+
} else if ((action === "start" || action === "experiment") && value === "--objective") {
|
|
240
242
|
options.requestedObjective = rest[index + 1] || "";
|
|
241
243
|
index += 1;
|
|
242
244
|
} else if (action === "start" && value === "--campaign-kind") {
|
|
@@ -245,14 +247,25 @@ function parseAutoArgs(argv) {
|
|
|
245
247
|
} else if (action === "start" && value === "--allowed-stages") {
|
|
246
248
|
options.requestedAllowedStages = rest[index + 1] || "";
|
|
247
249
|
index += 1;
|
|
248
|
-
} else if (action === "start" && value === "--autonomy-level") {
|
|
250
|
+
} else if ((action === "start" || action === "experiment") && value === "--autonomy-level") {
|
|
249
251
|
options.requestedAutonomyLevel = (rest[index + 1] || "").trim();
|
|
250
252
|
index += 1;
|
|
253
|
+
} else if ((action === "start" || action === "experiment") && ["--l1", "--l2", "--l3"].includes(value)) {
|
|
254
|
+
options.requestedAutonomyLevel = value.slice(2).toUpperCase();
|
|
255
|
+
} else if (action === "start" || action === "experiment") {
|
|
256
|
+
positionalObjectiveParts.push(value);
|
|
251
257
|
} else {
|
|
252
258
|
throw new Error(`Unknown option: ${value}`);
|
|
253
259
|
}
|
|
254
260
|
}
|
|
255
261
|
|
|
262
|
+
if (!options.requestedObjective && positionalObjectiveParts.length > 0) {
|
|
263
|
+
options.requestedObjective = positionalObjectiveParts.join(" ").trim();
|
|
264
|
+
}
|
|
265
|
+
if (action === "experiment" && !options.requestedCampaignKind) {
|
|
266
|
+
options.requestedCampaignKind = "experiment-loop";
|
|
267
|
+
}
|
|
268
|
+
|
|
256
269
|
return options;
|
|
257
270
|
}
|
|
258
271
|
|
|
@@ -990,7 +1003,7 @@ async function main() {
|
|
|
990
1003
|
printAutoStatus(options);
|
|
991
1004
|
return;
|
|
992
1005
|
}
|
|
993
|
-
if (options.action === "start") {
|
|
1006
|
+
if (options.action === "start" || options.action === "experiment") {
|
|
994
1007
|
const result = await startAutoMode({
|
|
995
1008
|
targetDir: options.targetDir,
|
|
996
1009
|
requestedContract: {
|
package/lib/auto_contracts.cjs
CHANGED
|
@@ -23,6 +23,12 @@ const AUTO_LEVEL_STAGE_ENVELOPES = {
|
|
|
23
23
|
};
|
|
24
24
|
const VALID_APPROVAL_STATUSES = new Set(["draft", "approved"]);
|
|
25
25
|
const VALID_TERMINAL_GOAL_TYPES = new Set(["rounds", "metric-threshold", "task-completion"]);
|
|
26
|
+
const CAMPAIGN_KIND_DEFAULT_STAGES = {
|
|
27
|
+
planning: ["idea", "data", "framing", "spec", "review", "report"],
|
|
28
|
+
spec: ["idea", "data", "framing", "spec", "review", "report"],
|
|
29
|
+
"experiment-loop": ["run", "iterate", "review", "report"],
|
|
30
|
+
"report-polish": ["review", "report", "write"],
|
|
31
|
+
};
|
|
26
32
|
const FROZEN_CORE_ALIASES = {
|
|
27
33
|
mission: [path.join(".lab", "context", "mission.md")],
|
|
28
34
|
framing: [
|
|
@@ -82,19 +88,23 @@ function inferCampaignKind({ campaignKind = "", allowedStages = [] }) {
|
|
|
82
88
|
}
|
|
83
89
|
|
|
84
90
|
function normalizeRequestedAutoContract(requested = {}) {
|
|
85
|
-
const allowedStages = Array.isArray(requested.allowedStages)
|
|
91
|
+
const rawAllowedStages = Array.isArray(requested.allowedStages)
|
|
86
92
|
? requested.allowedStages.map((stage) => String(stage).trim().toLowerCase()).filter(Boolean)
|
|
87
93
|
: normalizeList(requested.allowedStages || "").map((stage) => stage.toLowerCase());
|
|
94
|
+
const normalizedCampaignKind =
|
|
95
|
+
isMeaningful(requested.campaignKind || "") || rawAllowedStages.length > 0
|
|
96
|
+
? inferCampaignKind({
|
|
97
|
+
campaignKind: requested.campaignKind || "",
|
|
98
|
+
allowedStages: rawAllowedStages,
|
|
99
|
+
})
|
|
100
|
+
: "";
|
|
101
|
+
const allowedStages = rawAllowedStages.length > 0
|
|
102
|
+
? rawAllowedStages
|
|
103
|
+
: [...(CAMPAIGN_KIND_DEFAULT_STAGES[normalizedCampaignKind] || [])];
|
|
88
104
|
return {
|
|
89
105
|
objective: (requested.objective || "").trim(),
|
|
90
106
|
autonomyLevel: normalizeScalar(requested.autonomyLevel || ""),
|
|
91
|
-
campaignKind:
|
|
92
|
-
isMeaningful(requested.campaignKind || "") || allowedStages.length > 0
|
|
93
|
-
? inferCampaignKind({
|
|
94
|
-
campaignKind: requested.campaignKind || "",
|
|
95
|
-
allowedStages,
|
|
96
|
-
})
|
|
97
|
-
: "",
|
|
107
|
+
campaignKind: normalizedCampaignKind,
|
|
98
108
|
allowedStages,
|
|
99
109
|
};
|
|
100
110
|
}
|
|
@@ -539,6 +549,7 @@ module.exports = {
|
|
|
539
549
|
REVIEW_CONTEXT_FILES,
|
|
540
550
|
VALID_APPROVAL_STATUSES,
|
|
541
551
|
VALID_TERMINAL_GOAL_TYPES,
|
|
552
|
+
CAMPAIGN_KIND_DEFAULT_STAGES,
|
|
542
553
|
classifyAutoContractFit,
|
|
543
554
|
changedSnapshotPaths,
|
|
544
555
|
detectFrozenCoreChanges,
|
package/lib/i18n.cjs
CHANGED
|
@@ -207,7 +207,7 @@ const ZH_SKILL_FILES = {
|
|
|
207
207
|
- 本轮假设
|
|
208
208
|
- 具体代码或配置改动
|
|
209
209
|
- 标准化评估摘要
|
|
210
|
-
-
|
|
210
|
+
- 写入 \`.lab/iterations/\` 的书面 iteration report
|
|
211
211
|
- 写入 \`results_root\` 的持久 run 输出
|
|
212
212
|
- continue 或 stop 决策
|
|
213
213
|
- 风险连续两轮升高时触发 diagnostic mode
|
|
@@ -235,6 +235,7 @@ const ZH_SKILL_FILES = {
|
|
|
235
235
|
- 不要修改指标定义、baseline 语义或对比方法实现,除非评估协议已经记录了来源和与原始实现的偏差。
|
|
236
236
|
- 如果要调整 ladder、样本量或升格 gate,必须继续锚定到带来源的评估协议,而不是靠聊天临时判断。
|
|
237
237
|
- durable findings 和证据边界变化应写回 canonical context,再刷新派生的 \`state.md\` 快照;当前执行进度只写 \`workflow-state.md\`。
|
|
238
|
+
- \`.lab/iterations/*.md\` 属于对外可读的 workflow artifact。如果 workflow language 是中文,iteration report、本轮摘要、阻塞点和 continue-or-stop 决策都应使用中文,除非文件路径、代码标识符或字面指标名必须保持原样。
|
|
238
239
|
|
|
239
240
|
## 交互约束
|
|
240
241
|
|
|
@@ -887,13 +888,13 @@ const ZH_SKILL_FILES = {
|
|
|
887
888
|
- [ ] 产出 final report。
|
|
888
889
|
`,
|
|
889
890
|
[path.join(".lab", ".managed", "templates", "iteration-report.md")]:
|
|
890
|
-
`#
|
|
891
|
+
`# 迭代报告
|
|
891
892
|
|
|
892
893
|
## 轮次
|
|
893
894
|
|
|
894
|
-
-
|
|
895
|
-
-
|
|
896
|
-
-
|
|
895
|
+
- 迭代编号:
|
|
896
|
+
- 运行 id:
|
|
897
|
+
- 完成承诺:
|
|
897
898
|
|
|
898
899
|
## 假设
|
|
899
900
|
|
|
@@ -905,27 +906,27 @@ const ZH_SKILL_FILES = {
|
|
|
905
906
|
|
|
906
907
|
## 评估摘要
|
|
907
908
|
|
|
908
|
-
-
|
|
909
|
-
-
|
|
910
|
-
-
|
|
911
|
-
-
|
|
912
|
-
-
|
|
913
|
-
-
|
|
914
|
-
-
|
|
909
|
+
- 数据集:
|
|
910
|
+
- 划分:
|
|
911
|
+
- 基线:
|
|
912
|
+
- 变体:
|
|
913
|
+
- 主指标:
|
|
914
|
+
- 次级指标:
|
|
915
|
+
- 失败次数:
|
|
915
916
|
|
|
916
917
|
## 专家批评
|
|
917
918
|
|
|
918
|
-
-
|
|
919
|
-
-
|
|
920
|
-
-
|
|
919
|
+
- 主要关注点:
|
|
920
|
+
- 方法论关注点:
|
|
921
|
+
- 解释风险:
|
|
921
922
|
- 是否进入 diagnostic mode:
|
|
922
923
|
|
|
923
924
|
## 决策
|
|
924
925
|
|
|
925
|
-
-
|
|
926
|
-
-
|
|
927
|
-
-
|
|
928
|
-
-
|
|
926
|
+
- 继续还是停止:
|
|
927
|
+
- 下一步动作:
|
|
928
|
+
- 主要阻塞点:
|
|
929
|
+
- 次优后续动作:
|
|
929
930
|
`,
|
|
930
931
|
[path.join(".lab", ".managed", "templates", "review-checklist.md")]:
|
|
931
932
|
`# Review Checklist
|
|
@@ -2854,6 +2855,7 @@ ZH_CONTENT[path.join(".codex", "skills", "lab", "stages", "auto.md")] = `# \`/la
|
|
|
2854
2855
|
- 如果用户显式调用 \`/lab:auto\` 或 \`/lab-auto\`,就保持在 auto 执行路径里;只要请求仍在已批准 execution envelope 内,即使目标听起来像 feature selection、baseline selection、离散化或 candidate sweep,也不要重新路由到 brainstorming 或 spec review。
|
|
2855
2856
|
- 如果用户同时提了论文层、实验 phase 和自治级别,先用一句话重述:objective、自治级别、terminal goal、scope、allowed modifications。
|
|
2856
2857
|
- 如果 workflow language 是中文,摘要、清单条目、任务标签和进度更新都应使用中文,除非文件路径、代码标识符或字面指标名必须保持原样。
|
|
2858
|
+
- 对 auto 写出的 \`.lab/iterations/*.md\` 也应用同样的 workflow-language 规则,包括本轮摘要、阻塞点、diagnostic notes 和 continue-or-stop 决策。
|
|
2857
2859
|
- 当循环进入 \`report\` 时,要主动给出用户可读的白话总结,解释主指标、次级指标和主表作用;不要等用户额外发一句“解释这些指标”。
|
|
2858
2860
|
- 当循环即将进入 \`write\`,且 \`paper_template_root\` 为空时:
|
|
2859
2861
|
- 如果 \`paper_template_decision\` 是 \`unconfirmed\`,必须先追问一次:继续使用默认 scaffold,还是先接入模板目录
|
|
@@ -28,6 +28,7 @@ Use this skill when the user invokes `/lab:*` or asks for the structured researc
|
|
|
28
28
|
- Use `.lab/context/eval-protocol.md` as the shared evaluation contract for run, iterate, auto, and report stages, including metric glossary and experiment ladder semantics.
|
|
29
29
|
- Treat evaluation semantics as source-backed once evaluation planning starts: metrics, benchmark gates, baseline behavior, comparison implementations, and deviations should come from recorded sources, not memory.
|
|
30
30
|
- Workflow artifacts should follow the installed workflow language.
|
|
31
|
+
- Iteration artifacts under `.lab/iterations/` are workflow artifacts and should follow the installed workflow language.
|
|
31
32
|
- Final paper output should default to LaTeX, and its manuscript language should be decided separately from the workflow language.
|
|
32
33
|
- Separate sourced facts from model-generated hypotheses.
|
|
33
34
|
- Preserve failed runs, failed ideas, and limitations.
|
|
@@ -174,6 +175,7 @@ Use this skill when the user invokes `/lab:*` or asks for the structured researc
|
|
|
174
175
|
- Keep metric definitions, baseline behavior, and comparison implementations anchored to the source-backed evaluation protocol before changing thresholds, gates, or ladder transitions.
|
|
175
176
|
- Switch to diagnostic mode if risk increases for two consecutive rounds.
|
|
176
177
|
- Write round reports with `.lab/.managed/templates/iteration-report.md`.
|
|
178
|
+
- Write round reports under `.lab/iterations/` with `.lab/.managed/templates/iteration-report.md`, and keep those iteration artifacts in the workflow language.
|
|
177
179
|
- Update `.lab/context/workflow-state.md`, `.lab/context/decisions.md`, `.lab/context/evidence-index.md`, `.lab/context/open-questions.md`, and `.lab/context/eval-protocol.md` each round as needed, then refresh derived views.
|
|
178
180
|
- Keep `.lab/context/eval-protocol.md` synchronized with accepted ladder changes, benchmark scope, and source-backed implementation deviations.
|
|
179
181
|
- Stop at threshold success or iteration cap, and record blockers plus next-best actions when the campaign ends without success.
|
|
@@ -166,6 +166,7 @@
|
|
|
166
166
|
- allowed modifications
|
|
167
167
|
- Then ask at most one clarifying question if a blocking field is still missing.
|
|
168
168
|
- If `.lab/config/workflow.json` sets the workflow language to Chinese, write summaries, options, checklist items, task labels, and progress updates in Chinese unless a file path, code identifier, or literal metric name must remain unchanged.
|
|
169
|
+
- Apply the same workflow-language rule to auto-written iteration artifacts under `.lab/iterations/`, including round summaries, blocker lists, diagnostic notes, and continue-or-stop decisions.
|
|
169
170
|
- When the loop reaches `report`, apply the same workflow-language rule to `report.md` and the managed `main-tables.md` artifact.
|
|
170
171
|
- When the loop reaches `report`, proactively deliver a user-facing plain-language summary of the selected metrics, what they mean, what the tables prove, and what remains unproven. Do not wait for a separate user request asking for interpretation.
|
|
171
172
|
- When the loop is about to enter `write` and `paper_template_root` is empty:
|
|
@@ -37,7 +37,7 @@ Declare and keep fixed:
|
|
|
37
37
|
- round hypothesis
|
|
38
38
|
- concrete code or config changes
|
|
39
39
|
- normalized evaluation summary
|
|
40
|
-
- written iteration report
|
|
40
|
+
- written iteration report under `.lab/iterations/`
|
|
41
41
|
- durable run outputs under `results_root`
|
|
42
42
|
- continue or stop decision
|
|
43
43
|
- diagnostic mode trigger when risk increases for two consecutive rounds
|
|
@@ -70,6 +70,7 @@ If the loop stops without success, record:
|
|
|
70
70
|
- Re-run the `Sanity and Alternative-Explanation Checks` whenever a round produces anomaly signals, suspiciously unchanged results, impl/result mismatches, or other outcomes that could still have simpler explanations.
|
|
71
71
|
- If a round reveals leakage risk, invalid scale comparisons, unsupported metric semantics, or overstated claim boundaries, treat that as a fatal methodological flaw instead of a normal failed iteration.
|
|
72
72
|
- If anomaly signals remain unresolved after implementation reality checks and at least one cross-check, switch to diagnostic mode instead of continuing as if the interpretation were settled.
|
|
73
|
+
- Keep `.lab/iterations/*.md` as reader-facing workflow artifacts. If `.lab/config/workflow.json` sets the workflow language to Chinese, write the iteration report, round summary, blocker list, and continue-or-stop decision in Chinese unless a file path, code identifier, or literal metric name must remain unchanged.
|
|
73
74
|
|
|
74
75
|
## Interaction Contract
|
|
75
76
|
|