superlab 0.1.23 → 0.1.25
This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- package/README.md +3 -2
- package/README.zh-CN.md +3 -2
- package/lib/auto_contracts.cjs +4 -2
- package/lib/auto_runner.cjs +30 -0
- package/lib/auto_state.cjs +30 -0
- package/lib/context.cjs +437 -14
- package/lib/eval_protocol.cjs +75 -0
- package/lib/i18n.cjs +140 -24
- package/lib/install.cjs +2 -0
- package/package-assets/claude/commands/lab.md +2 -2
- package/package-assets/codex/prompts/lab.md +2 -2
- package/package-assets/shared/lab/.managed/scripts/validate_collaborator_report.py +53 -0
- package/package-assets/shared/lab/.managed/templates/artifact-status.md +28 -0
- package/package-assets/shared/lab/.managed/templates/final-report.md +24 -19
- package/package-assets/shared/lab/.managed/templates/review-checklist.md +4 -0
- package/package-assets/shared/lab/context/auto-mode.md +3 -3
- package/package-assets/shared/lab/context/auto-outcome.md +15 -0
- package/package-assets/shared/lab/context/eval-protocol.md +21 -0
- package/package-assets/shared/lab/context/session-brief.md +1 -1
- package/package-assets/shared/lab/context/state.md +19 -13
- package/package-assets/shared/lab/context/workflow-state.md +19 -0
- package/package-assets/shared/lab/system/core.md +4 -2
- package/package-assets/shared/skills/lab/SKILL.md +10 -10
- package/package-assets/shared/skills/lab/stages/auto.md +5 -1
- package/package-assets/shared/skills/lab/stages/iterate.md +4 -0
- package/package-assets/shared/skills/lab/stages/report.md +11 -1
- package/package-assets/shared/skills/lab/stages/review.md +4 -0
- package/package-assets/shared/skills/lab/stages/run.md +4 -0
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -180,7 +180,7 @@ superlab auto stop
|
|
|
180
180
|
|
|
181
181
|
- `run` and `iterate` must change persistent outputs under `results_root`
|
|
182
182
|
- `review` must update canonical review context
|
|
183
|
-
- `report` must write `<deliverables_root>/report.md` and `<deliverables_root>/main-tables.md`
|
|
183
|
+
- `report` must write `<deliverables_root>/report.md`, `<deliverables_root>/main-tables.md`, and `<deliverables_root>/artifact-status.md`
|
|
184
184
|
- `write` must produce LaTeX output under `<deliverables_root>/paper/`
|
|
185
185
|
- a successful promotion must write back into `.lab/context/data-decisions.md`, `.lab/context/decisions.md`, `.lab/context/state.md`, and `.lab/context/session-brief.md`
|
|
186
186
|
- every run must end with `.lab/context/auto-outcome.md`, including why it stopped, whether the terminal goal was reached, and which artifact is the final outcome
|
|
@@ -201,7 +201,7 @@ Level Guide for `/lab:auto`:
|
|
|
201
201
|
Example:
|
|
202
202
|
|
|
203
203
|
```text
|
|
204
|
-
/lab:auto Autonomy level L2. Objective: advance paper layer 3
|
|
204
|
+
/lab:auto Autonomy level L2. Objective: advance paper layer 3 through one bounded protocol improvement. Terminal goal: task-completion. Scope: bounded protocol, tests, one minimal implementation, and one small run. Allowed modifications: configuration, evaluation script, and data-loading logic only.
|
|
205
205
|
```
|
|
206
206
|
|
|
207
207
|
## Version
|
|
@@ -309,6 +309,7 @@ See the source command docs in [commands/codex/lab.md](/Users/zhouhao119/coding/
|
|
|
309
309
|
|
|
310
310
|
- `docs/research/report.md`
|
|
311
311
|
- `docs/research/main-tables.md`
|
|
312
|
+
- `docs/research/artifact-status.md`
|
|
312
313
|
- `docs/research/paper/main.tex`
|
|
313
314
|
- `docs/research/paper/sections/*.tex`
|
|
314
315
|
|
package/README.zh-CN.md
CHANGED
|
@@ -178,7 +178,7 @@ superlab auto stop
|
|
|
178
178
|
|
|
179
179
|
- `run` 和 `iterate` 必须更新 `results_root` 下的持久输出
|
|
180
180
|
- `review` 必须更新规范的审查上下文
|
|
181
|
-
- `report` 必须写出 `<deliverables_root>/report.md` 和 `<deliverables_root>/main-tables.md`
|
|
181
|
+
- `report` 必须写出 `<deliverables_root>/report.md`、`<deliverables_root>/main-tables.md` 和 `<deliverables_root>/artifact-status.md`
|
|
182
182
|
- `write` 必须写出 `<deliverables_root>/paper/` 下的 LaTeX 论文产物
|
|
183
183
|
- promotion 成功后必须写回 `.lab/context/data-decisions.md`、`.lab/context/decisions.md`、`.lab/context/state.md` 和 `.lab/context/session-brief.md`
|
|
184
184
|
- 每次运行都必须写出 `.lab/context/auto-outcome.md`,记录为什么停止、是否达到终止目标,以及哪一个工件是最终结果
|
|
@@ -199,7 +199,7 @@ superlab auto stop
|
|
|
199
199
|
示例:
|
|
200
200
|
|
|
201
201
|
```text
|
|
202
|
-
/lab:auto 自治级别 L2。目标:推进 paper layer 3
|
|
202
|
+
/lab:auto 自治级别 L2。目标:推进 paper layer 3 的一项有边界协议改进。终止条件:完成 bounded protocol、测试、一项最小实现和一轮小规模结果。允许修改:配置、评估脚本、数据加载逻辑。
|
|
203
203
|
```
|
|
204
204
|
|
|
205
205
|
## 版本查询
|
|
@@ -294,6 +294,7 @@ Codex 和 Claude 的命令入口不一样:
|
|
|
294
294
|
|
|
295
295
|
- `docs/research/report.md`
|
|
296
296
|
- `docs/research/main-tables.md`
|
|
297
|
+
- `docs/research/artifact-status.md`
|
|
297
298
|
- `docs/research/paper/main.tex`
|
|
298
299
|
- `docs/research/paper/sections/*.tex`
|
|
299
300
|
|
package/lib/auto_contracts.cjs
CHANGED
|
@@ -34,6 +34,7 @@ const FROZEN_CORE_ALIASES = {
|
|
|
34
34
|
const REVIEW_CONTEXT_FILES = [
|
|
35
35
|
path.join(".lab", "context", "decisions.md"),
|
|
36
36
|
path.join(".lab", "context", "state.md"),
|
|
37
|
+
path.join(".lab", "context", "workflow-state.md"),
|
|
37
38
|
path.join(".lab", "context", "open-questions.md"),
|
|
38
39
|
path.join(".lab", "context", "evidence-index.md"),
|
|
39
40
|
];
|
|
@@ -288,6 +289,7 @@ function stageContractSnapshot(targetDir, stage) {
|
|
|
288
289
|
report: [
|
|
289
290
|
path.join(deliverablesRoot, "report.md"),
|
|
290
291
|
path.join(deliverablesRoot, "main-tables.md"),
|
|
292
|
+
path.join(deliverablesRoot, "artifact-status.md"),
|
|
291
293
|
],
|
|
292
294
|
write: [
|
|
293
295
|
path.join(deliverablesRoot, "paper", "main.tex"),
|
|
@@ -318,7 +320,7 @@ function verifyStageContract({ stage, snapshot }) {
|
|
|
318
320
|
if (stage === "review") {
|
|
319
321
|
if (changedPaths.length === 0) {
|
|
320
322
|
throw new Error(
|
|
321
|
-
"review stage did not update canonical review context (.lab/context/decisions.md, state.md, open-questions.md, or evidence-index.md)"
|
|
323
|
+
"review stage did not update canonical review context (.lab/context/decisions.md, state.md, workflow-state.md, open-questions.md, or evidence-index.md)"
|
|
322
324
|
);
|
|
323
325
|
}
|
|
324
326
|
return;
|
|
@@ -327,7 +329,7 @@ function verifyStageContract({ stage, snapshot }) {
|
|
|
327
329
|
if (stage === "report") {
|
|
328
330
|
const missing = Array.from(snapshot.keys()).filter((absolutePath) => !changedPaths.includes(absolutePath));
|
|
329
331
|
if (missing.length > 0) {
|
|
330
|
-
throw new Error("report stage did not produce
|
|
332
|
+
throw new Error("report stage did not produce report.md, main-tables.md, and artifact-status.md under deliverables_root");
|
|
331
333
|
}
|
|
332
334
|
return;
|
|
333
335
|
}
|
package/lib/auto_runner.cjs
CHANGED
|
@@ -278,6 +278,21 @@ async function startAutoMode({ targetDir, now = new Date() }) {
|
|
|
278
278
|
comparisonSourcePapers: evalProtocol.comparisonSourcePapers,
|
|
279
279
|
comparisonImplementationSource: evalProtocol.comparisonImplementationSource,
|
|
280
280
|
deviationFromOriginalImplementation: evalProtocol.deviationFromOriginalImplementation,
|
|
281
|
+
evaluationSettingSemantics: evalProtocol.evaluationSettingSemantics,
|
|
282
|
+
visibilityAndLeakageRisks: evalProtocol.visibilityAndLeakageRisks,
|
|
283
|
+
anchorAndLabelPolicy: evalProtocol.anchorAndLabelPolicy,
|
|
284
|
+
scaleAndComparabilityPolicy: evalProtocol.scaleAndComparabilityPolicy,
|
|
285
|
+
metricValidityChecks: evalProtocol.metricValidityChecks,
|
|
286
|
+
comparisonValidityChecks: evalProtocol.comparisonValidityChecks,
|
|
287
|
+
statisticalValidityChecks: evalProtocol.statisticalValidityChecks,
|
|
288
|
+
claimBoundary: evalProtocol.claimBoundary,
|
|
289
|
+
integritySelfCheck: evalProtocol.integritySelfCheck,
|
|
290
|
+
anomalySignals: evalProtocol.anomalySignals,
|
|
291
|
+
implementationRealityChecks: evalProtocol.implementationRealityChecks,
|
|
292
|
+
alternativeExplanationsConsidered: evalProtocol.alternativeExplanationsConsidered,
|
|
293
|
+
crossCheckMethod: evalProtocol.crossCheckMethod,
|
|
294
|
+
bestSupportedInterpretation: evalProtocol.bestSupportedInterpretation,
|
|
295
|
+
escalationThreshold: evalProtocol.escalationThreshold,
|
|
281
296
|
};
|
|
282
297
|
|
|
283
298
|
const writeRunningStatus = (overrides = {}) => {
|
|
@@ -768,6 +783,21 @@ function stopAutoMode({ targetDir, now = new Date() }) {
|
|
|
768
783
|
comparisonSourcePapers: evalProtocol.comparisonSourcePapers,
|
|
769
784
|
comparisonImplementationSource: evalProtocol.comparisonImplementationSource,
|
|
770
785
|
deviationFromOriginalImplementation: evalProtocol.deviationFromOriginalImplementation,
|
|
786
|
+
evaluationSettingSemantics: evalProtocol.evaluationSettingSemantics,
|
|
787
|
+
visibilityAndLeakageRisks: evalProtocol.visibilityAndLeakageRisks,
|
|
788
|
+
anchorAndLabelPolicy: evalProtocol.anchorAndLabelPolicy,
|
|
789
|
+
scaleAndComparabilityPolicy: evalProtocol.scaleAndComparabilityPolicy,
|
|
790
|
+
metricValidityChecks: evalProtocol.metricValidityChecks,
|
|
791
|
+
comparisonValidityChecks: evalProtocol.comparisonValidityChecks,
|
|
792
|
+
statisticalValidityChecks: evalProtocol.statisticalValidityChecks,
|
|
793
|
+
claimBoundary: evalProtocol.claimBoundary,
|
|
794
|
+
integritySelfCheck: evalProtocol.integritySelfCheck,
|
|
795
|
+
anomalySignals: evalProtocol.anomalySignals,
|
|
796
|
+
implementationRealityChecks: evalProtocol.implementationRealityChecks,
|
|
797
|
+
alternativeExplanationsConsidered: evalProtocol.alternativeExplanationsConsidered,
|
|
798
|
+
crossCheckMethod: evalProtocol.crossCheckMethod,
|
|
799
|
+
bestSupportedInterpretation: evalProtocol.bestSupportedInterpretation,
|
|
800
|
+
escalationThreshold: evalProtocol.escalationThreshold,
|
|
771
801
|
};
|
|
772
802
|
const status = {
|
|
773
803
|
...existing,
|
package/lib/auto_state.cjs
CHANGED
|
@@ -154,6 +154,21 @@ function renderAutoOutcome(outcome, { lang = "en" } = {}) {
|
|
|
154
154
|
- 对比方法来源论文: ${outcome.comparisonSourcePapers || ""}
|
|
155
155
|
- 对比方法实现来源: ${outcome.comparisonImplementationSource || ""}
|
|
156
156
|
- 与原始实现的偏差: ${outcome.deviationFromOriginalImplementation || ""}
|
|
157
|
+
- 评测设定语义: ${outcome.evaluationSettingSemantics || ""}
|
|
158
|
+
- 可见性与泄漏风险: ${outcome.visibilityAndLeakageRisks || ""}
|
|
159
|
+
- 锚点与标签策略: ${outcome.anchorAndLabelPolicy || ""}
|
|
160
|
+
- 尺度与可比性策略: ${outcome.scaleAndComparabilityPolicy || ""}
|
|
161
|
+
- 指标有效性检查: ${outcome.metricValidityChecks || ""}
|
|
162
|
+
- 对比有效性检查: ${outcome.comparisonValidityChecks || ""}
|
|
163
|
+
- 统计有效性检查: ${outcome.statisticalValidityChecks || ""}
|
|
164
|
+
- 结论边界: ${outcome.claimBoundary || ""}
|
|
165
|
+
- 完整性自检: ${outcome.integritySelfCheck || ""}
|
|
166
|
+
- 异常信号: ${outcome.anomalySignals || ""}
|
|
167
|
+
- 实现层现实检查: ${outcome.implementationRealityChecks || ""}
|
|
168
|
+
- 已考虑的替代解释: ${outcome.alternativeExplanationsConsidered || ""}
|
|
169
|
+
- 交叉验证方法: ${outcome.crossCheckMethod || ""}
|
|
170
|
+
- 当前最站得住的解释: ${outcome.bestSupportedInterpretation || ""}
|
|
171
|
+
- 升级阈值: ${outcome.escalationThreshold || ""}
|
|
157
172
|
- 终止目标类型: ${outcome.terminalGoalType || ""}
|
|
158
173
|
- 终止目标目标值: ${outcome.terminalGoalTarget || ""}
|
|
159
174
|
- 必要终止工件: ${outcome.requiredTerminalArtifact || ""}
|
|
@@ -191,6 +206,21 @@ function renderAutoOutcome(outcome, { lang = "en" } = {}) {
|
|
|
191
206
|
- Comparison source papers: ${outcome.comparisonSourcePapers || ""}
|
|
192
207
|
- Comparison implementation source: ${outcome.comparisonImplementationSource || ""}
|
|
193
208
|
- Deviation from original implementation: ${outcome.deviationFromOriginalImplementation || ""}
|
|
209
|
+
- Evaluation setting semantics: ${outcome.evaluationSettingSemantics || ""}
|
|
210
|
+
- Visibility and leakage risks: ${outcome.visibilityAndLeakageRisks || ""}
|
|
211
|
+
- Anchor and label policy: ${outcome.anchorAndLabelPolicy || ""}
|
|
212
|
+
- Scale and comparability policy: ${outcome.scaleAndComparabilityPolicy || ""}
|
|
213
|
+
- Metric validity checks: ${outcome.metricValidityChecks || ""}
|
|
214
|
+
- Comparison validity checks: ${outcome.comparisonValidityChecks || ""}
|
|
215
|
+
- Statistical validity checks: ${outcome.statisticalValidityChecks || ""}
|
|
216
|
+
- Claim boundary: ${outcome.claimBoundary || ""}
|
|
217
|
+
- Integrity self-check: ${outcome.integritySelfCheck || ""}
|
|
218
|
+
- Anomaly signals: ${outcome.anomalySignals || ""}
|
|
219
|
+
- Implementation reality checks: ${outcome.implementationRealityChecks || ""}
|
|
220
|
+
- Alternative explanations considered: ${outcome.alternativeExplanationsConsidered || ""}
|
|
221
|
+
- Cross-check method: ${outcome.crossCheckMethod || ""}
|
|
222
|
+
- Best-supported interpretation: ${outcome.bestSupportedInterpretation || ""}
|
|
223
|
+
- Escalation threshold: ${outcome.escalationThreshold || ""}
|
|
194
224
|
- Terminal goal type: ${outcome.terminalGoalType || ""}
|
|
195
225
|
- Terminal goal target: ${outcome.terminalGoalTarget || ""}
|
|
196
226
|
- Required terminal artifact: ${outcome.requiredTerminalArtifact || ""}
|