superlab 0.1.12 → 0.1.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +11 -2
- package/README.zh-CN.md +11 -2
- package/bin/superlab.cjs +43 -1
- package/lib/auto.cjs +14 -972
- package/lib/auto_common.cjs +129 -0
- package/lib/auto_contracts.cjs +387 -0
- package/lib/auto_runner.cjs +830 -0
- package/lib/auto_state.cjs +227 -0
- package/lib/context.cjs +94 -0
- package/lib/eval_protocol.cjs +236 -0
- package/lib/i18n.cjs +125 -11
- package/lib/install.cjs +26 -6
- package/package-assets/claude/commands/lab/auto.md +1 -1
- package/package-assets/claude/commands/lab.md +2 -1
- package/package-assets/codex/prompts/lab-auto.md +1 -1
- package/package-assets/codex/prompts/lab.md +2 -1
- package/package-assets/shared/lab/context/auto-mode.md +7 -0
- package/package-assets/shared/lab/context/auto-outcome.md +28 -0
- package/package-assets/shared/lab/context/auto-status.md +3 -0
- package/package-assets/shared/lab/context/eval-protocol.md +46 -0
- package/package-assets/shared/skills/lab/SKILL.md +12 -1
- package/package-assets/shared/skills/lab/stages/auto.md +31 -7
- package/package-assets/shared/skills/lab/stages/iterate.md +4 -0
- package/package-assets/shared/skills/lab/stages/report.md +4 -0
- package/package-assets/shared/skills/lab/stages/run.md +4 -1
- package/package.json +1 -1
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
const fs = require("node:fs");
|
|
2
|
+
const path = require("node:path");
|
|
3
|
+
const {
|
|
4
|
+
contextFile,
|
|
5
|
+
extractValue,
|
|
6
|
+
isMeaningful,
|
|
7
|
+
normalizeAllowedStages,
|
|
8
|
+
normalizeScalar,
|
|
9
|
+
readFileIfExists,
|
|
10
|
+
readWorkflowLanguage,
|
|
11
|
+
} = require("./auto_common.cjs");
|
|
12
|
+
|
|
13
|
+
/**
 * Parse `.lab/context/auto-mode.md` under `targetDir` into a plain object.
 *
 * Every setting is looked up by its English label first and its Chinese label
 * second via `extractValue`. `autonomyLevel`, `approvalStatus` and
 * `terminalGoalType` are additionally passed through `normalizeScalar`, and
 * `allowedStages` through `normalizeAllowedStages` (both come from
 * `auto_common.cjs`; their exact normalization is not visible from here).
 *
 * @param {string} targetDir - Project root containing the `.lab` directory.
 * @returns {object} The file path, its raw text and the extracted settings.
 */
function parseAutoMode(targetDir) {
  const modePath = contextFile(targetDir, "auto-mode.md");
  const text = readFileIfExists(modePath);

  // Small wrappers so each setting below reads as "key: labels".
  const read = (...labels) => extractValue(text, labels);
  const readScalar = (...labels) => normalizeScalar(read(...labels));

  const allowedStages = normalizeAllowedStages(read("Allowed stages", "允许阶段"));

  const stageCommands = {
    run: read("Run command", "运行命令"),
    iterate: read("Iterate command", "迭代命令"),
    review: read("Review command", "审查命令"),
    report: read("Report command", "报告命令"),
    write: read("Write command", "写作命令"),
  };

  return {
    path: modePath,
    text,
    objective: read("Objective", "目标"),
    autonomyLevel: readScalar("Autonomy level", "自治级别"),
    approvalStatus: readScalar("Approval status", "批准状态"),
    allowedStages,
    successCriteria: read("Success criteria", "成功标准"),
    terminalGoalType: readScalar("Terminal goal type", "终止目标类型"),
    terminalGoalTarget: read("Terminal goal target", "终止目标目标值"),
    requiredTerminalArtifact: read("Required terminal artifact", "终止目标工件"),
    maxIterations: read("Max iterations", "最大迭代轮次"),
    maxWallClockTime: read("Max wall-clock time", "最大运行时长"),
    maxFailures: read("Max failures", "最大失败次数"),
    pollInterval: read("Poll interval", "轮询间隔"),
    stageCommands,
    successCheckCommand: read("Success check command", "成功检查命令"),
    stopCheckCommand: read("Stop check command", "停止检查命令"),
    promotionCheckCommand: read("Promotion check command", "升格检查命令"),
    promotionCommand: read("Promotion command", "升格命令"),
    promotionPolicy: read("Promotion policy", "升格策略"),
    frozenCore: read("Frozen core", "冻结核心"),
    explorationEnvelope: read("Exploration envelope", "探索边界"),
    stopConditions: read("Stop conditions", "停止条件"),
    escalationConditions: read("Escalation conditions", "升级条件"),
  };
}
|
|
49
|
+
|
|
50
|
+
/**
 * Parse `.lab/context/auto-status.md` under `targetDir` into a plain object.
 *
 * Each property is extracted with `extractValue`, trying the English label
 * first and the Chinese label second.
 *
 * @param {string} targetDir - Project root containing the `.lab` directory.
 * @returns {object} The file path, its raw text and the extracted status fields.
 */
function parseAutoStatus(targetDir) {
  const statusPath = contextFile(targetDir, "auto-status.md");
  const text = readFileIfExists(statusPath);

  // [result key, accepted labels] — order defines the key order of the result.
  const labelTable = [
    ["status", ["Status", "状态"]],
    ["currentStage", ["Current stage", "当前阶段"]],
    ["currentCommand", ["Current command", "当前命令"]],
    ["activeRunId", ["Active run id", "当前活跃 run id"]],
    ["iterationCount", ["Iteration count", "迭代计数"]],
    ["currentRung", ["Current rung", "当前 rung"]],
    ["watchTarget", ["Watch target", "监视目标"]],
    ["nextRung", ["Next rung", "下一 rung"]],
    ["startedAt", ["Started at", "开始时间"]],
    ["lastHeartbeat", ["Last heartbeat", "最近心跳"]],
    ["lastCheckpoint", ["Last checkpoint", "最近 checkpoint"]],
    ["lastSummary", ["Last summary", "最近 summary"]],
    ["decision", ["Current decision", "当前决策"]],
  ];

  const parsed = { path: statusPath, text };
  for (const [key, labels] of labelTable) {
    parsed[key] = extractValue(text, labels);
  }
  return parsed;
}
|
|
70
|
+
|
|
71
|
+
/**
 * Render the auto-mode status document as markdown.
 *
 * Falsy field values render as an empty string, except `status` (falls back
 * to "idle") and `iterationCount` (falls back to "0"). The result always ends
 * with a single trailing newline.
 *
 * @param {object} status - Status fields (status, currentStage, currentCommand,
 *   activeRunId, iterationCount, currentRung, watchTarget, nextRung, startedAt,
 *   lastHeartbeat, lastCheckpoint, lastSummary, decision).
 * @param {{lang?: string}} [options] - `lang: "zh"` selects Chinese headings
 *   and labels; any other value renders English.
 * @returns {string} The markdown document.
 */
function renderAutoStatus(status, { lang = "en" } = {}) {
  const useChinese = lang === "zh";
  const t = (english, chinese) => (useChinese ? chinese : english);
  const bullet = (label, value, fallback = "") => `- ${label}: ${value || fallback}`;

  const lines = [
    t("# Auto Mode Status", "# 自动模式状态"),
    "",
    t("## Runtime State", "## 运行状态"),
    "",
    bullet(t("Status", "状态"), status.status, "idle"),
    bullet(t("Current stage", "当前阶段"), status.currentStage),
    bullet(t("Current command", "当前命令"), status.currentCommand),
    bullet(t("Active run id", "当前活跃 run id"), status.activeRunId),
    bullet(t("Iteration count", "迭代计数"), status.iterationCount, "0"),
    bullet(t("Current rung", "当前 rung"), status.currentRung),
    bullet(t("Watch target", "监视目标"), status.watchTarget),
    bullet(t("Next rung", "下一 rung"), status.nextRung),
    "",
    t("## Timing", "## 时间"),
    "",
    bullet(t("Started at", "开始时间"), status.startedAt),
    bullet(t("Last heartbeat", "最近心跳"), status.lastHeartbeat),
    "",
    t("## Artifacts", "## 工件"),
    "",
    bullet(t("Last checkpoint", "最近 checkpoint"), status.lastCheckpoint),
    bullet(t("Last summary", "最近 summary"), status.lastSummary),
    "",
    t("## Decision", "## 决策"),
    "",
    bullet(t("Current decision", "当前决策"), status.decision),
  ];

  return `${lines.join("\n")}\n`;
}
|
|
130
|
+
|
|
131
|
+
/**
 * Write the rendered auto-mode status to `.lab/context/auto-status.md`.
 *
 * The parent directory is created if needed, and the file always ends with
 * exactly one trailing newline.
 *
 * @param {string} targetDir - Project root containing the `.lab` directory.
 * @param {object} status - Status fields accepted by `renderAutoStatus`.
 * @param {{lang?: string}} [options] - Language passed on to `renderAutoStatus`.
 * @returns {void}
 */
function writeAutoStatus(targetDir, status, { lang = "en" } = {}) {
  const statusPath = contextFile(targetDir, "auto-status.md");
  const parentDir = path.dirname(statusPath);
  fs.mkdirSync(parentDir, { recursive: true });

  const rendered = renderAutoStatus(status, { lang });
  fs.writeFileSync(statusPath, `${rendered.trimEnd()}\n`);
}
|
|
136
|
+
|
|
137
|
+
/**
 * Render the auto-mode outcome document as markdown.
 *
 * Only the title and the two section headings depend on `lang`; the bullet
 * labels are English in both variants. Falsy text fields render as an empty
 * string, `iterationsCompleted` falls back to "0", and the two boolean flags
 * (`goalReached`, `promotionApplied`) render as "yes" / "no". The result ends
 * with a single trailing newline.
 *
 * @param {object} outcome - Outcome fields to render.
 * @param {{lang?: string}} [options] - `lang: "zh"` selects Chinese headings;
 *   any other value renders English headings.
 * @returns {string} The markdown document.
 */
function renderAutoOutcome(outcome, { lang = "en" } = {}) {
  const headings =
    lang === "zh"
      ? { title: "# 自动结果", goal: "## 目标", result: "## 结果" }
      : { title: "# Auto Outcome", goal: "## Goal", result: "## Outcome" };

  const bullet = (label, value, fallback = "") => `- ${label}: ${value || fallback}`;
  const yesNo = (flag) => (flag ? "yes" : "no");

  const lines = [
    headings.title,
    "",
    headings.goal,
    "",
    bullet("Objective", outcome.objective),
    bullet("Experiment ladder", outcome.experimentLadder),
    bullet("Metric glossary", outcome.metricGlossary),
    bullet("Metric source papers", outcome.metricSourcePapers),
    bullet("Metric implementation source", outcome.metricImplementationSource),
    bullet("Comparison source papers", outcome.comparisonSourcePapers),
    bullet("Comparison implementation source", outcome.comparisonImplementationSource),
    bullet("Deviation from original implementation", outcome.deviationFromOriginalImplementation),
    bullet("Terminal goal type", outcome.terminalGoalType),
    bullet("Terminal goal target", outcome.terminalGoalTarget),
    bullet("Required terminal artifact", outcome.requiredTerminalArtifact),
    "",
    headings.result,
    "",
    bullet("Status", outcome.status),
    `- Goal reached: ${yesNo(outcome.goalReached)}`,
    bullet("Stop reason", outcome.stopReason),
    `- Promotion applied: ${yesNo(outcome.promotionApplied)}`,
    bullet("Final artifact", outcome.finalArtifact),
    bullet("Final rung", outcome.finalRung),
    bullet("Executed stages", outcome.executedStages),
    bullet("Iterations completed", outcome.iterationsCompleted, "0"),
    bullet("Started at", outcome.startedAt),
    bullet("Finished at", outcome.finishedAt),
  ];

  return `${lines.join("\n")}\n`;
}
|
|
200
|
+
|
|
201
|
+
/**
 * Write the rendered auto-mode outcome to `.lab/context/auto-outcome.md`.
 *
 * The parent directory is created if needed, and the file always ends with
 * exactly one trailing newline.
 *
 * @param {string} targetDir - Project root containing the `.lab` directory.
 * @param {object} outcome - Outcome fields accepted by `renderAutoOutcome`.
 * @param {{lang?: string}} [options] - Language passed on to `renderAutoOutcome`.
 * @returns {void}
 */
function writeAutoOutcome(targetDir, outcome, { lang = "en" } = {}) {
  const outcomePath = contextFile(targetDir, "auto-outcome.md");
  const parentDir = path.dirname(outcomePath);
  fs.mkdirSync(parentDir, { recursive: true });

  const rendered = renderAutoOutcome(outcome, { lang });
  fs.writeFileSync(outcomePath, `${rendered.trimEnd()}\n`);
}
|
|
206
|
+
|
|
207
|
+
/**
 * Resolve the configured terminal-artifact path against the project root.
 *
 * @param {string} targetDir - Base directory for relative paths.
 * @param {string} configuredPath - Path as written in the configuration.
 * @returns {{relativePath: string, absolutePath: string}} The trimmed
 *   configured path plus its absolute form; both are empty strings when
 *   `isMeaningful` rejects the configured value.
 */
function resolveRequiredArtifact(targetDir, configuredPath) {
  if (isMeaningful(configuredPath)) {
    const relativePath = configuredPath.trim();
    // An already-absolute path is kept verbatim rather than re-resolved.
    const absolutePath = path.isAbsolute(relativePath)
      ? relativePath
      : path.resolve(targetDir, relativePath);
    return { relativePath, absolutePath };
  }

  return { relativePath: "", absolutePath: "" };
}
|
|
217
|
+
|
|
218
|
+
module.exports = {
|
|
219
|
+
parseAutoMode,
|
|
220
|
+
parseAutoStatus,
|
|
221
|
+
readWorkflowLanguage,
|
|
222
|
+
renderAutoOutcome,
|
|
223
|
+
renderAutoStatus,
|
|
224
|
+
resolveRequiredArtifact,
|
|
225
|
+
writeAutoOutcome,
|
|
226
|
+
writeAutoStatus,
|
|
227
|
+
};
|
package/lib/context.cjs
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
const fs = require("node:fs");
|
|
2
2
|
const path = require("node:path");
|
|
3
|
+
const { parseEvalProtocol } = require("./eval_protocol.cjs");
|
|
3
4
|
|
|
4
5
|
function contextFile(targetDir, name) {
|
|
5
6
|
return path.join(targetDir, ".lab", "context", name);
|
|
@@ -86,7 +87,25 @@ function renderSummary(lang, data) {
|
|
|
86
87
|
- Active stage: ${data.stage || "待补充"}
|
|
87
88
|
- Auto mode: ${data.autoStatus || "未启用"}
|
|
88
89
|
- Auto objective: ${data.autoObjective || "待补充"}
|
|
90
|
+
- Auto rung: ${data.autoCurrentRung || "待补充"}
|
|
91
|
+
- Auto watch target: ${data.autoWatchTarget || "待补充"}
|
|
89
92
|
- Auto decision: ${data.autoDecision || "待补充"}
|
|
93
|
+
- Auto terminal goal: ${joinNonEmpty([data.autoGoalType, data.autoGoalTarget], " | ") || "待补充"}
|
|
94
|
+
- Auto goal reached: ${data.autoGoalReached || "待补充"}
|
|
95
|
+
- Auto stop reason: ${data.autoStopReason || "待补充"}
|
|
96
|
+
- Auto final artifact: ${data.autoFinalArtifact || "待补充"}
|
|
97
|
+
- Auto final rung: ${data.autoFinalRung || "待补充"}
|
|
98
|
+
- Eval objective: ${data.evalObjective || "待补充"}
|
|
99
|
+
- Table plan: ${data.evalTablePlan || "待补充"}
|
|
100
|
+
- Metric glossary: ${data.evalMetricGlossary || "待补充"}
|
|
101
|
+
- Metric source papers: ${data.evalMetricSourcePapers || "待补充"}
|
|
102
|
+
- Metric implementation source: ${data.evalMetricImplementationSource || "待补充"}
|
|
103
|
+
- Comparison source papers: ${data.evalComparisonSourcePapers || "待补充"}
|
|
104
|
+
- Comparison implementation source: ${data.evalComparisonImplementationSource || "待补充"}
|
|
105
|
+
- Deviation from original implementation: ${data.evalDeviationFromOriginalImplementation || "待补充"}
|
|
106
|
+
- Experiment ladder: ${data.evalExperimentLadder || "待补充"}
|
|
107
|
+
- Benchmark ladder: ${data.evalBenchmarkLadder || "待补充"}
|
|
108
|
+
- Promotion gate: ${data.evalPromotionGate || "待补充"}
|
|
90
109
|
- Dataset package: ${data.datasetPackage || "待补充"}
|
|
91
110
|
- Dataset years: ${data.datasetYears || "待补充"}
|
|
92
111
|
- Benchmark role: ${data.benchmarkRole || "待补充"}
|
|
@@ -120,7 +139,25 @@ function renderSummary(lang, data) {
|
|
|
120
139
|
- Active stage: ${data.stage || "TBD"}
|
|
121
140
|
- Auto mode: ${data.autoStatus || "inactive"}
|
|
122
141
|
- Auto objective: ${data.autoObjective || "TBD"}
|
|
142
|
+
- Auto rung: ${data.autoCurrentRung || "TBD"}
|
|
143
|
+
- Auto watch target: ${data.autoWatchTarget || "TBD"}
|
|
123
144
|
- Auto decision: ${data.autoDecision || "TBD"}
|
|
145
|
+
- Auto terminal goal: ${joinNonEmpty([data.autoGoalType, data.autoGoalTarget], " | ") || "TBD"}
|
|
146
|
+
- Auto goal reached: ${data.autoGoalReached || "TBD"}
|
|
147
|
+
- Auto stop reason: ${data.autoStopReason || "TBD"}
|
|
148
|
+
- Auto final artifact: ${data.autoFinalArtifact || "TBD"}
|
|
149
|
+
- Auto final rung: ${data.autoFinalRung || "TBD"}
|
|
150
|
+
- Eval objective: ${data.evalObjective || "TBD"}
|
|
151
|
+
- Table plan: ${data.evalTablePlan || "TBD"}
|
|
152
|
+
- Metric glossary: ${data.evalMetricGlossary || "TBD"}
|
|
153
|
+
- Metric source papers: ${data.evalMetricSourcePapers || "TBD"}
|
|
154
|
+
- Metric implementation source: ${data.evalMetricImplementationSource || "TBD"}
|
|
155
|
+
- Comparison source papers: ${data.evalComparisonSourcePapers || "TBD"}
|
|
156
|
+
- Comparison implementation source: ${data.evalComparisonImplementationSource || "TBD"}
|
|
157
|
+
- Deviation from original implementation: ${data.evalDeviationFromOriginalImplementation || "TBD"}
|
|
158
|
+
- Experiment ladder: ${data.evalExperimentLadder || "TBD"}
|
|
159
|
+
- Benchmark ladder: ${data.evalBenchmarkLadder || "TBD"}
|
|
160
|
+
- Promotion gate: ${data.evalPromotionGate || "TBD"}
|
|
124
161
|
- Dataset package: ${data.datasetPackage || "TBD"}
|
|
125
162
|
- Dataset years: ${data.datasetYears || "TBD"}
|
|
126
163
|
- Benchmark role: ${data.benchmarkRole || "TBD"}
|
|
@@ -209,7 +246,25 @@ ${data.problem || "待补充"}
|
|
|
209
246
|
- Why this is the active path: ${data.why || "当前已批准方向"}
|
|
210
247
|
- Auto mode: ${data.autoStatus || "未启用"}
|
|
211
248
|
- Auto objective: ${data.autoObjective || "待补充"}
|
|
249
|
+
- Auto rung: ${data.autoCurrentRung || "待补充"}
|
|
250
|
+
- Auto watch target: ${data.autoWatchTarget || "待补充"}
|
|
212
251
|
- Auto decision: ${data.autoDecision || "待补充"}
|
|
252
|
+
- Auto terminal goal: ${joinNonEmpty([data.autoGoalType, data.autoGoalTarget], " | ") || "待补充"}
|
|
253
|
+
- Auto goal reached: ${data.autoGoalReached || "待补充"}
|
|
254
|
+
- Auto stop reason: ${data.autoStopReason || "待补充"}
|
|
255
|
+
- Auto final artifact: ${data.autoFinalArtifact || "待补充"}
|
|
256
|
+
- Auto final rung: ${data.autoFinalRung || "待补充"}
|
|
257
|
+
- Eval objective: ${data.evalObjective || "待补充"}
|
|
258
|
+
- Table plan: ${data.evalTablePlan || "待补充"}
|
|
259
|
+
- Metric glossary: ${data.evalMetricGlossary || "待补充"}
|
|
260
|
+
- Metric source papers: ${data.evalMetricSourcePapers || "待补充"}
|
|
261
|
+
- Metric implementation source: ${data.evalMetricImplementationSource || "待补充"}
|
|
262
|
+
- Comparison source papers: ${data.evalComparisonSourcePapers || "待补充"}
|
|
263
|
+
- Comparison implementation source: ${data.evalComparisonImplementationSource || "待补充"}
|
|
264
|
+
- Deviation from original implementation: ${data.evalDeviationFromOriginalImplementation || "待补充"}
|
|
265
|
+
- Experiment ladder: ${data.evalExperimentLadder || "待补充"}
|
|
266
|
+
- Benchmark ladder: ${data.evalBenchmarkLadder || "待补充"}
|
|
267
|
+
- Promotion gate: ${data.evalPromotionGate || "待补充"}
|
|
213
268
|
- Dataset package: ${data.datasetPackage || "待补充"}
|
|
214
269
|
- Dataset years: ${data.datasetYears || "待补充"}
|
|
215
270
|
- Benchmark role: ${data.benchmarkRole || "待补充"}
|
|
@@ -254,7 +309,25 @@ ${data.problem || "TBD"}
|
|
|
254
309
|
- Why this is the active path: ${data.why || "This is the approved direction"}
|
|
255
310
|
- Auto mode: ${data.autoStatus || "inactive"}
|
|
256
311
|
- Auto objective: ${data.autoObjective || "TBD"}
|
|
312
|
+
- Auto rung: ${data.autoCurrentRung || "TBD"}
|
|
313
|
+
- Auto watch target: ${data.autoWatchTarget || "TBD"}
|
|
257
314
|
- Auto decision: ${data.autoDecision || "TBD"}
|
|
315
|
+
- Auto terminal goal: ${joinNonEmpty([data.autoGoalType, data.autoGoalTarget], " | ") || "TBD"}
|
|
316
|
+
- Auto goal reached: ${data.autoGoalReached || "TBD"}
|
|
317
|
+
- Auto stop reason: ${data.autoStopReason || "TBD"}
|
|
318
|
+
- Auto final artifact: ${data.autoFinalArtifact || "TBD"}
|
|
319
|
+
- Auto final rung: ${data.autoFinalRung || "TBD"}
|
|
320
|
+
- Eval objective: ${data.evalObjective || "TBD"}
|
|
321
|
+
- Table plan: ${data.evalTablePlan || "TBD"}
|
|
322
|
+
- Metric glossary: ${data.evalMetricGlossary || "TBD"}
|
|
323
|
+
- Metric source papers: ${data.evalMetricSourcePapers || "TBD"}
|
|
324
|
+
- Metric implementation source: ${data.evalMetricImplementationSource || "TBD"}
|
|
325
|
+
- Comparison source papers: ${data.evalComparisonSourcePapers || "TBD"}
|
|
326
|
+
- Comparison implementation source: ${data.evalComparisonImplementationSource || "TBD"}
|
|
327
|
+
- Deviation from original implementation: ${data.evalDeviationFromOriginalImplementation || "TBD"}
|
|
328
|
+
- Experiment ladder: ${data.evalExperimentLadder || "TBD"}
|
|
329
|
+
- Benchmark ladder: ${data.evalBenchmarkLadder || "TBD"}
|
|
330
|
+
- Promotion gate: ${data.evalPromotionGate || "TBD"}
|
|
258
331
|
- Dataset package: ${data.datasetPackage || "TBD"}
|
|
259
332
|
- Dataset years: ${data.datasetYears || "TBD"}
|
|
260
333
|
- Benchmark role: ${data.benchmarkRole || "TBD"}
|
|
@@ -291,6 +364,8 @@ function buildContextSnapshot(targetDir) {
|
|
|
291
364
|
const dataDecisions = readFileIfExists(contextFile(targetDir, "data-decisions.md"));
|
|
292
365
|
const autoMode = readFileIfExists(contextFile(targetDir, "auto-mode.md"));
|
|
293
366
|
const autoStatus = readFileIfExists(contextFile(targetDir, "auto-status.md"));
|
|
367
|
+
const autoOutcome = readFileIfExists(contextFile(targetDir, "auto-outcome.md"));
|
|
368
|
+
const evalProtocol = parseEvalProtocol(targetDir);
|
|
294
369
|
const classicBenchmarks = labelValue(
|
|
295
370
|
dataDecisions,
|
|
296
371
|
["Classic public benchmarks", "Classic benchmarks"],
|
|
@@ -484,7 +559,26 @@ function buildContextSnapshot(targetDir) {
|
|
|
484
559
|
),
|
|
485
560
|
autoObjective: extractValue(autoMode, ["Objective", "目标"]),
|
|
486
561
|
autoStatus: extractValue(autoStatus, ["Status", "状态"]),
|
|
562
|
+
autoCurrentRung: extractValue(autoStatus, ["Current rung", "当前 rung"]),
|
|
563
|
+
autoWatchTarget: extractValue(autoStatus, ["Watch target", "监视目标"]),
|
|
487
564
|
autoDecision: extractValue(autoStatus, ["Current decision", "当前决策"]),
|
|
565
|
+
autoGoalType: extractValue(autoOutcome, ["Terminal goal type", "终止目标类型"]),
|
|
566
|
+
autoGoalTarget: extractValue(autoOutcome, ["Terminal goal target", "终止目标目标值"]),
|
|
567
|
+
autoGoalReached: extractValue(autoOutcome, ["Goal reached", "目标是否达成"]),
|
|
568
|
+
autoStopReason: extractValue(autoOutcome, ["Stop reason", "停止原因"]),
|
|
569
|
+
autoFinalArtifact: extractValue(autoOutcome, ["Final artifact", "最终工件"]),
|
|
570
|
+
autoFinalRung: extractValue(autoOutcome, ["Final rung", "最终 rung"]),
|
|
571
|
+
evalObjective: evalProtocol.primaryEvaluationObjective,
|
|
572
|
+
evalTablePlan: evalProtocol.tablePlan,
|
|
573
|
+
evalMetricGlossary: evalProtocol.metricGlossary,
|
|
574
|
+
evalMetricSourcePapers: evalProtocol.metricSourcePapers,
|
|
575
|
+
evalMetricImplementationSource: evalProtocol.metricImplementationSource,
|
|
576
|
+
evalComparisonSourcePapers: evalProtocol.comparisonSourcePapers,
|
|
577
|
+
evalComparisonImplementationSource: evalProtocol.comparisonImplementationSource,
|
|
578
|
+
evalDeviationFromOriginalImplementation: evalProtocol.deviationFromOriginalImplementation,
|
|
579
|
+
evalExperimentLadder: evalProtocol.experimentLadder,
|
|
580
|
+
evalBenchmarkLadder: evalProtocol.benchmarkLadder,
|
|
581
|
+
evalPromotionGate: evalProtocol.promotionGate,
|
|
488
582
|
};
|
|
489
583
|
}
|
|
490
584
|
|
|
@@ -0,0 +1,236 @@
|
|
|
1
|
+
const fs = require("node:fs");
|
|
2
|
+
const path = require("node:path");
|
|
3
|
+
|
|
4
|
+
// Values (compared after trim + lowercase) that count as "not filled in".
const PLACEHOLDER_VALUES = new Set(["", "tbd", "none", "待补充", "无"]);

// Stage names an experiment-ladder rung is allowed to declare.
const VALID_LADDER_STAGES = new Set(["run", "iterate", "review", "report", "write"]);

/**
 * Build one field descriptor. The display name doubles as the first accepted
 * label; any further arguments are alternative (translated) labels.
 */
const describeField = (name, key, ...alternativeLabels) => ({
  name,
  key,
  labels: [name, ...alternativeLabels],
});

// Top-level fields of eval-protocol.md: display name, result key, accepted labels.
const EVAL_PROTOCOL_FIELDS = [
  describeField("Primary evaluation objective", "primaryEvaluationObjective", "主评估目标"),
  describeField("Primary metrics", "primaryMetrics", "主指标"),
  describeField("Secondary metrics", "secondaryMetrics", "次级指标"),
  describeField("Required terminal evidence", "requiredTerminalEvidence", "必要终局证据"),
  describeField("Table plan", "tablePlan", "主表计划"),
  describeField("Required claims per table", "requiredClaimsPerTable", "每张表必须支撑的 claims"),
  describeField("Metric glossary", "metricGlossary", "指标释义"),
  describeField("Metric source papers", "metricSourcePapers", "指标来源论文"),
  describeField("Metric implementation source", "metricImplementationSource", "指标实现来源"),
  describeField("Comparison source papers", "comparisonSourcePapers", "对比方法来源论文"),
  describeField("Comparison implementation source", "comparisonImplementationSource", "对比方法实现来源"),
  describeField(
    "Deviation from original implementation",
    "deviationFromOriginalImplementation",
    "与原始实现的偏差"
  ),
  describeField("Benchmark ladder", "benchmarkLadder", "benchmark 阶梯"),
  describeField("Experiment ladder", "experimentLadder", "实验阶梯"),
  describeField("Comparison gate", "comparisonGate", "对比方法 gate"),
  describeField("Promotion gate", "promotionGate", "升格 gate"),
  describeField("Minimum sample sizes", "minimumSampleSizes", "最小样本量"),
  describeField("Required output artifacts", "requiredOutputArtifacts", "必要输出工件"),
];

// Fields read from each "### Rung:" block of the experiment ladder.
const EXPERIMENT_RUNG_FIELDS = [
  describeField("Stage", "stage", "阶段"),
  describeField("Goal", "goal", "目标"),
  describeField("Command", "command", "命令"),
  describeField("Watch", "watch", "监视目标"),
  describeField("Gate", "gate", "gate 命令", "检查命令"),
  describeField("On pass", "onPass", "通过后"),
  describeField("On fail", "onFail", "失败后"),
  describeField("On stop", "onStop", "停止后"),
];
|
|
108
|
+
|
|
109
|
+
/**
 * Build the path of a file inside a project's `.lab/context` directory.
 *
 * @param {string} targetDir - Project root.
 * @param {string} name - File name inside the context directory.
 * @returns {string} `<targetDir>/.lab/context/<name>`, joined with `path.join`.
 */
function contextFile(targetDir, name) {
  const contextDir = path.join(targetDir, ".lab", "context");
  return path.join(contextDir, name);
}
|
|
112
|
+
|
|
113
|
+
/**
 * Read a UTF-8 text file, treating a missing file as empty.
 *
 * @param {string} filePath - File to read.
 * @returns {string} The file contents, or "" when `fs.existsSync` reports
 *   that the path does not exist.
 */
function readFileIfExists(filePath) {
  return fs.existsSync(filePath) ? fs.readFileSync(filePath, "utf8") : "";
}
|
|
119
|
+
|
|
120
|
+
/**
 * Find the value of a `- Label: value` markdown bullet.
 *
 * Labels are tried in order and matched case-insensitively at the start of a
 * line; the first label that yields a non-blank value wins. A bullet whose
 * value is only whitespace (e.g. `- Status: ` with a trailing space) counts
 * as blank, so the remaining alias labels are still tried.
 *
 * @param {string} text - Markdown text to search; non-string input is treated
 *   as empty.
 * @param {string[]} labels - Accepted labels, in priority order.
 * @returns {string} The trimmed value, or "" when no label has a value.
 */
function extractValue(text, labels) {
  const haystack = typeof text === "string" ? text : "";

  for (const label of labels) {
    // Escape regex metacharacters so labels such as "A.b (x)" match literally.
    const escaped = label.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    const regex = new RegExp(`^\\s*-\\s*${escaped}:[ \\t]*([^\\n\\r]+?)[ \\t]*$`, "im");
    const match = haystack.match(regex);
    // The capture group needs at least one character, so a bullet followed by
    // trailing whitespace captures " "; trim before deciding it is a real value.
    const value = match ? match[1].trim() : "";
    if (value) {
      return value;
    }
  }

  return "";
}
|
|
131
|
+
|
|
132
|
+
/**
 * Tell whether a field value is filled in.
 *
 * @param {string} [value] - Raw field value; falsy values are treated as "".
 * @returns {boolean} False when the trimmed, lower-cased value is one of
 *   `PLACEHOLDER_VALUES`, true otherwise.
 */
function isMeaningful(value) {
  const normalized = (value || "").trim().toLowerCase();
  if (PLACEHOLDER_VALUES.has(normalized)) {
    return false;
  }
  return true;
}
|
|
135
|
+
|
|
136
|
+
/**
 * Parse `.lab/context/eval-protocol.md` under `targetDir`.
 *
 * Top-level fields listed in `EVAL_PROTOCOL_FIELDS` are extracted from the
 * whole document. The experiment ladder is split on `### Rung: <id>` headings;
 * each rung covers the text from its heading up to the next rung heading (or
 * the end of the file) and gets the fields listed in `EXPERIMENT_RUNG_FIELDS`.
 *
 * @param {string} targetDir - Project root containing the `.lab` directory.
 * @returns {object} `{ path, text, experimentRungs, ...fields }`, where each
 *   rung is `{ id, block, ...rungFields }` and `stage` is lower-cased. A
 *   heading without an id produces a rung whose `id` is "".
 */
function parseEvalProtocol(targetDir) {
  const filePath = contextFile(targetDir, "eval-protocol.md");
  const text = readFileIfExists(filePath);

  const fields = {};
  for (const field of EVAL_PROTOCOL_FIELDS) {
    fields[field.key] = extractValue(text, field.labels);
  }

  // Only horizontal whitespace is allowed around the id. `\s` also matches
  // newlines, so an empty "### Rung:" heading used to swallow the following
  // line (a field bullet or even the next rung heading) as its id. The capture
  // may be empty so that such a heading still yields a rung with id "".
  const rungHeadingPattern = /^###[ \t]*Rung:[ \t]*(.*?)[ \t]*$/gm;
  const rungHeadings = Array.from(text.matchAll(rungHeadingPattern));

  const experimentRungs = rungHeadings.map((match, index) => {
    const blockStart = match.index || 0;
    const bodyStart = blockStart + match[0].length;
    const nextBlockStart = index + 1 < rungHeadings.length ? rungHeadings[index + 1].index : text.length;
    // Field lookup is restricted to the text after the heading line.
    const rungBody = text.slice(bodyStart, nextBlockStart).trim();

    const rung = {
      id: match[1].trim(),
      block: text.slice(blockStart, nextBlockStart).trim(),
    };
    for (const field of EXPERIMENT_RUNG_FIELDS) {
      rung[field.key] = extractValue(rungBody, field.labels);
    }
    rung.stage = (rung.stage || "").trim().toLowerCase();
    return rung;
  });

  return {
    path: filePath,
    text,
    experimentRungs,
    ...fields,
  };
}
|
|
173
|
+
|
|
174
|
+
/**
 * Validate `.lab/context/eval-protocol.md` and describe what is wrong with it.
 *
 * Nothing is reported when the file does not exist, or when the protocol is
 * not yet "activated" — i.e. no protocol field is filled in, the active stage
 * in `state.md` is not one of run/iterate/review/report/write/auto, and
 * `auto-mode.md` has no meaningful objective. Otherwise missing top-level
 * fields are reported as a single message; rung-level checks run only when no
 * top-level field is missing.
 *
 * @param {string} targetDir - Project root containing the `.lab` directory.
 * @returns {string[]} Human-readable issues; empty when there is nothing to report.
 */
function validateEvalProtocol(targetDir) {
  const protocol = parseEvalProtocol(targetDir);
  if (!fs.existsSync(protocol.path)) {
    return [];
  }

  const stateText = readFileIfExists(contextFile(targetDir, "state.md"));
  const autoModeText = readFileIfExists(contextFile(targetDir, "auto-mode.md"));
  const activeStage = extractValue(stateText, ["Active stage", "当前阶段", "Stage"]).trim().toLowerCase();
  const autoObjective = extractValue(autoModeText, ["Objective", "目标"]);

  const hasAnyProtocolField = EVAL_PROTOCOL_FIELDS.some((field) => isMeaningful(protocol[field.key]));
  const stageNeedsProtocol = ["run", "iterate", "review", "report", "write", "auto"].includes(activeStage);
  if (!hasAnyProtocolField && !stageNeedsProtocol && !isMeaningful(autoObjective)) {
    return [];
  }

  const missing = [];
  for (const field of EVAL_PROTOCOL_FIELDS) {
    if (!isMeaningful(protocol[field.key])) {
      missing.push(field.name);
    }
  }
  if (missing.length > 0) {
    return [`missing evaluation protocol fields: ${missing.join(", ")}`];
  }

  const rungIssues = [];
  const seenRungIds = new Set();
  for (const rung of protocol.experimentRungs) {
    if (!isMeaningful(rung.id)) {
      rungIssues.push("experiment ladder has a rung with an empty id");
      continue;
    }

    if (seenRungIds.has(rung.id)) {
      rungIssues.push(`experiment ladder has duplicate rung id: ${rung.id}`);
    }
    seenRungIds.add(rung.id);

    // "Command" is the one rung field that may be left blank.
    const missingRungFields = EXPERIMENT_RUNG_FIELDS.filter(
      (field) => field.key !== "command" && !isMeaningful(rung[field.key])
    ).map((field) => field.name);
    if (missingRungFields.length > 0) {
      rungIssues.push(`experiment ladder rung ${rung.id} is missing: ${missingRungFields.join(", ")}`);
    }

    if (isMeaningful(rung.stage) && !VALID_LADDER_STAGES.has(rung.stage)) {
      rungIssues.push(`experiment ladder rung ${rung.id} has invalid stage: ${rung.stage}`);
    }
  }

  return rungIssues;
}
|
|
230
|
+
|
|
231
|
+
module.exports = {
|
|
232
|
+
EVAL_PROTOCOL_FIELDS,
|
|
233
|
+
EXPERIMENT_RUNG_FIELDS,
|
|
234
|
+
parseEvalProtocol,
|
|
235
|
+
validateEvalProtocol,
|
|
236
|
+
};
|