superlab 0.1.18 → 0.1.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/superlab.cjs +8 -0
- package/lib/auto_runner.cjs +7 -5
- package/lib/context.cjs +610 -0
- package/lib/i18n.cjs +40 -0
- package/package-assets/shared/lab/.managed/templates/final-report.md +23 -0
- package/package-assets/shared/lab/context/eval-protocol.md +1 -0
- package/package-assets/shared/lab/context/mission.md +2 -0
- package/package-assets/shared/skills/lab/SKILL.md +8 -3
- package/package-assets/shared/skills/lab/stages/auto.md +4 -0
- package/package-assets/shared/skills/lab/stages/idea.md +3 -0
- package/package-assets/shared/skills/lab/stages/iterate.md +2 -0
- package/package-assets/shared/skills/lab/stages/report.md +10 -0
- package/package-assets/shared/skills/lab/stages/run.md +3 -0
- package/package.json +1 -1
package/bin/superlab.cjs
CHANGED
|
@@ -12,6 +12,8 @@ const {
|
|
|
12
12
|
} = require("../lib/install.cjs");
|
|
13
13
|
const {
|
|
14
14
|
archiveContext,
|
|
15
|
+
collaboratorEvalIssues,
|
|
16
|
+
collaboratorMissionIssues,
|
|
15
17
|
pruneContext,
|
|
16
18
|
refreshContext,
|
|
17
19
|
} = require("../lib/context.cjs");
|
|
@@ -788,6 +790,8 @@ function printDoctor(options) {
|
|
|
788
790
|
const templateIssues = validatePaperTemplateRoot(options.targetDir, config);
|
|
789
791
|
const dataDecisionIssues = validateDataDecisions(options.targetDir);
|
|
790
792
|
const evalProtocolIssues = validateEvalProtocol(options.targetDir);
|
|
793
|
+
const missionContextIssues = collaboratorMissionIssues(options.targetDir);
|
|
794
|
+
const collaboratorProtocolIssues = collaboratorEvalIssues(options.targetDir);
|
|
791
795
|
const rootIssues = validateProjectRoots(options.targetDir, config);
|
|
792
796
|
const autoStatus = getAutoStatus({ targetDir: options.targetDir });
|
|
793
797
|
const autoIssues = autoStatus.issues;
|
|
@@ -806,6 +810,8 @@ function printDoctor(options) {
|
|
|
806
810
|
templateIssues.length > 0 ||
|
|
807
811
|
dataDecisionIssues.length > 0 ||
|
|
808
812
|
evalProtocolIssues.length > 0 ||
|
|
813
|
+
missionContextIssues.length > 0 ||
|
|
814
|
+
collaboratorProtocolIssues.length > 0 ||
|
|
809
815
|
rootIssues.length > 0 ||
|
|
810
816
|
autoIssues.length > 0
|
|
811
817
|
) {
|
|
@@ -820,6 +826,8 @@ function printDoctor(options) {
|
|
|
820
826
|
templateIssues,
|
|
821
827
|
dataDecisionIssues,
|
|
822
828
|
evalProtocolIssues,
|
|
829
|
+
missionContextIssues,
|
|
830
|
+
collaboratorProtocolIssues,
|
|
823
831
|
rootIssues,
|
|
824
832
|
autoIssues
|
|
825
833
|
);
|
package/lib/auto_runner.cjs
CHANGED
|
@@ -253,7 +253,7 @@ async function startAutoMode({ targetDir, now = new Date() }) {
|
|
|
253
253
|
const maxFailures = parseInteger(mode.maxFailures, 0);
|
|
254
254
|
const maxIterations = parseInteger(mode.maxIterations, 1);
|
|
255
255
|
const requiredArtifact = resolveRequiredArtifact(targetDir, mode.requiredTerminalArtifact);
|
|
256
|
-
|
|
256
|
+
let frozenCoreSnapshot = snapshotFrozenCore(targetDir, mode.frozenCore);
|
|
257
257
|
const { loopStages, finalStages } = splitAutoStages(mode.allowedStages);
|
|
258
258
|
const executedStages = [];
|
|
259
259
|
let failureCount = 0;
|
|
@@ -363,13 +363,14 @@ async function startAutoMode({ targetDir, now = new Date() }) {
|
|
|
363
363
|
nextRung,
|
|
364
364
|
decision: rungId ? `completed rung ${rungId}` : `completed stage ${stage}`,
|
|
365
365
|
});
|
|
366
|
-
refreshContext({ targetDir });
|
|
367
|
-
|
|
368
366
|
const frozenCoreChanges = detectFrozenCoreChanges(frozenCoreSnapshot);
|
|
369
367
|
if (frozenCoreChanges.length > 0) {
|
|
370
368
|
failAutoMode(`frozen core changed: ${frozenCoreChanges.join(", ")}`);
|
|
371
369
|
}
|
|
372
370
|
|
|
371
|
+
refreshContext({ targetDir });
|
|
372
|
+
frozenCoreSnapshot = snapshotFrozenCore(targetDir, mode.frozenCore);
|
|
373
|
+
|
|
373
374
|
const stopCheck = await runCheckCommand({
|
|
374
375
|
targetDir,
|
|
375
376
|
label: `stop check after ${rungId || stage}`,
|
|
@@ -437,12 +438,13 @@ async function startAutoMode({ targetDir, now = new Date() }) {
|
|
|
437
438
|
decision: `promotion policy matched after ${label}`,
|
|
438
439
|
});
|
|
439
440
|
promotionApplied = true;
|
|
440
|
-
refreshContext({ targetDir });
|
|
441
|
-
verifyPromotionWriteback(targetDir, promotionSnapshot);
|
|
442
441
|
const frozenCoreChangesAfterPromotion = detectFrozenCoreChanges(frozenCoreSnapshot);
|
|
443
442
|
if (frozenCoreChangesAfterPromotion.length > 0) {
|
|
444
443
|
failAutoMode(`frozen core changed: ${frozenCoreChangesAfterPromotion.join(", ")}`);
|
|
445
444
|
}
|
|
445
|
+
refreshContext({ targetDir });
|
|
446
|
+
verifyPromotionWriteback(targetDir, promotionSnapshot);
|
|
447
|
+
frozenCoreSnapshot = snapshotFrozenCore(targetDir, mode.frozenCore);
|
|
446
448
|
};
|
|
447
449
|
|
|
448
450
|
if (evalProtocol.experimentRungs.length > 0) {
|
package/lib/context.cjs
CHANGED
|
@@ -2,6 +2,64 @@ const fs = require("node:fs");
|
|
|
2
2
|
const path = require("node:path");
|
|
3
3
|
const { parseEvalProtocol } = require("./eval_protocol.cjs");
|
|
4
4
|
|
|
5
|
+
// Normalized values that isMeaningful() treats as "not filled in yet".
const PLACEHOLDER_VALUES = new Set(["", "tbd", "none", "待补充", "无"]);

// mission.md fields checked for collaborator-facing reporting.
// Each pair is [English label, Chinese label]; the English label doubles as the reported field name.
const MISSION_COLLABORATOR_FIELDS = [
  ["One-sentence problem", "一句话问题"],
  ["Why it matters", "为什么重要"],
  ["Primary metric", "主指标"],
  ["Success threshold", "成功阈值"],
  ["Dataset or benchmark scope", "数据集或 benchmark 范围"],
  ["Approved direction", "已批准方向"],
].map(([name, localizedLabel]) => ({ name, labels: [name, localizedLabel] }));

// eval-protocol.md fields checked for collaborator-facing reporting (same pair layout as above).
const EVAL_COLLABORATOR_FIELDS = [
  ["Primary evaluation objective", "主评估目标"],
  ["Primary metrics", "主指标"],
  ["Secondary metrics", "次级指标"],
  ["Table plan", "主表计划"],
  ["Metric glossary", "指标释义"],
  ["Background sources", "背景来源"],
  ["Method and baseline source papers", "方法与基线来源论文"],
  ["Method and baseline implementation source", "方法与基线实现来源"],
  ["Metric source papers", "指标来源论文"],
  ["Required output artifacts", "必要输出工件"],
].map(([name, localizedLabel]) => ({ name, labels: [name, localizedLabel] }));

// Label aliases used by extractReportValue() to look up values in the report, keyed by logical field.
const REPORT_FIELDS = {
  // Problem framing
  problem: ["Research problem in plain language", "研究问题白话解释", "研究问题"],
  whyItMatters: ["Why this problem matters", "为什么这个问题重要"],
  setting: ["What setting or workflow this report is actually about", "这份报告实际对应的场景或流程"],

  // Metrics and terminal evidence
  primaryMetrics: ["Primary metrics", "主指标"],
  secondaryMetrics: ["Secondary metrics", "次级指标"],
  requiredTerminalEvidence: ["Required terminal evidence", "必要终局证据"],

  // Plain-language metric guide
  metricGuidePrimary: ["Primary metric plain-language explanation", "主指标白话解释"],
  metricGuideSecondary: ["Secondary metric plain-language explanation", "次级指标白话解释"],
  metricGuideSupport: ["Health or support metrics and why they are not the main claim", "健康度或支持性指标以及它们为什么不是主 claim"],

  // Background and provenance
  backgroundSources: ["Most important background papers or benchmark references", "最重要的背景论文或 benchmark 参考"],
  backgroundAnchors: ["Why these are the right background anchors", "为什么这些是合适的背景锚点"],
  methodBasis: ["Our method source or implementation basis", "我们的方法来源或实现基础"],
  baselineSourcePapers: ["Baseline and comparison source papers", "基线与对比方法来源论文"],
  baselineImplementationSources: ["Baseline and comparison implementation sources", "基线与对比方法实现来源"],
  metricSourcePapers: ["Metric source papers", "指标来源论文"],
  metricImplementationSource: ["Metric implementation source", "指标实现来源"],
  metricDeviation: ["Deviation from original implementation", "与原始实现的偏差"],

  // Experiment setup and results
  datasets: ["Datasets", "数据集"],
  baselines: ["Baselines", "基线"],
  metrics: ["Metrics", "指标"],
  finalPerformanceSummary: ["Final performance summary", "最终表现总结"],
  tableCoverage: ["Table coverage", "表格覆盖范围"],
};
|
|
62
|
+
|
|
5
63
|
/**
 * Builds the path of a file inside the project's `.lab/context` directory.
 *
 * @param {string} targetDir - Project root directory.
 * @param {string} name - File name inside `.lab/context`.
 * @returns {string} Joined path `<targetDir>/.lab/context/<name>`.
 */
function contextFile(targetDir, name) {
  const segments = [targetDir, ".lab", "context", name];
  return path.join(...segments);
}
|
|
@@ -58,6 +116,278 @@ function joinNonEmpty(parts, separator = "; ") {
|
|
|
58
116
|
return parts.filter(Boolean).join(separator);
|
|
59
117
|
}
|
|
60
118
|
|
|
119
|
+
/**
 * Reports whether a field value carries real content rather than a placeholder.
 *
 * @param {string} [value] - Raw field value; falsy values are treated as the empty string.
 * @returns {boolean} `false` when the trimmed, lower-cased value is in PLACEHOLDER_VALUES.
 */
function isMeaningful(value) {
  const normalized = (value || "").trim().toLowerCase();
  const isPlaceholder = PLACEHOLDER_VALUES.has(normalized);
  return !isPlaceholder;
}
|
|
122
|
+
|
|
123
|
+
/**
 * Loads `.lab/config/workflow.json` for a project on a best-effort basis.
 *
 * A missing, unreadable or malformed file yields an empty config. Valid JSON
 * that is not a plain object (`null`, an array, a number, ...) is also treated
 * as an empty config, so callers can read properties off the result without
 * hitting a TypeError on `null`.
 *
 * @param {string} targetDir - Project root directory.
 * @returns {object} The parsed config object, or `{}` when none is usable.
 */
function readWorkflowConfig(targetDir) {
  const configPath = path.join(targetDir, ".lab", "config", "workflow.json");
  let parsed;
  try {
    // A missing file throws here too, which covers the "no config" case.
    parsed = JSON.parse(fs.readFileSync(configPath, "utf8"));
  } catch {
    // Deliberate best-effort: any read or parse failure means "no config".
    return {};
  }
  const isPlainObject = parsed !== null && typeof parsed === "object" && !Array.isArray(parsed);
  return isPlainObject ? parsed : {};
}
|
|
134
|
+
|
|
135
|
+
/**
 * Resolves a configured path against the project root.
 *
 * @param {string} targetDir - Project root directory.
 * @param {*} configuredPath - Path from configuration; anything but a non-empty string is rejected.
 * @returns {string} Absolute path, or `""` when `configuredPath` is unusable.
 */
function resolveProjectPath(targetDir, configuredPath) {
  const isUsablePath = typeof configuredPath === "string" && configuredPath !== "";
  return isUsablePath ? path.resolve(targetDir, configuredPath) : "";
}
|
|
141
|
+
|
|
142
|
+
/**
 * Computes where the collaborator-facing deliverables live for a project.
 *
 * The root comes from `deliverables_root` in the workflow config and falls
 * back to `docs/research` when that setting is falsy.
 *
 * @param {string} targetDir - Project root directory.
 * @returns {{deliverablesRoot: string, reportPath: string, mainTablesPath: string}}
 */
function getCollaboratorDeliverablePaths(targetDir) {
  const { deliverables_root: configuredRoot } = readWorkflowConfig(targetDir);
  const deliverablesRoot = resolveProjectPath(targetDir, configuredRoot || "docs/research");
  const inRoot = (fileName) => path.join(deliverablesRoot, fileName);
  return {
    deliverablesRoot,
    reportPath: inRoot("report.md"),
    mainTablesPath: inRoot("main-tables.md"),
  };
}
|
|
151
|
+
|
|
152
|
+
/**
 * Tells whether the project already has a collaborator-facing deliverable on disk.
 *
 * @param {string} targetDir - Project root directory.
 * @returns {boolean} `true` when `report.md` or `main-tables.md` exists in the deliverables root.
 */
function hasCollaboratorFacingDeliverables(targetDir) {
  const paths = getCollaboratorDeliverablePaths(targetDir);
  if (fs.existsSync(paths.reportPath)) {
    return true;
  }
  return fs.existsSync(paths.mainTablesPath);
}
|
|
156
|
+
|
|
157
|
+
/**
 * Lists the fields whose extracted value is empty or a placeholder.
 *
 * @param {string} text - Document text handed to extractValue().
 * @param {Array<{name: string, labels: string[]}>} fields - Field descriptors to check.
 * @returns {string[]} Names of the fields that are not meaningfully filled in, in input order.
 */
function missingCollaboratorFields(text, fields) {
  const missingNames = [];
  for (const field of fields) {
    const value = extractValue(text, field.labels);
    if (!isMeaningful(value)) {
      missingNames.push(field.name);
    }
  }
  return missingNames;
}
|
|
160
|
+
|
|
161
|
+
/**
 * Produces doctor-style issues for a mission context that is too thin to back
 * collaborator-facing deliverables.
 *
 * Nothing is reported when no deliverable exists yet or when `mission.md`
 * yields no text.
 *
 * @param {string} targetDir - Project root directory.
 * @returns {string[]} Zero or one issue message naming the unfilled fields.
 */
function collaboratorMissionIssues(targetDir) {
  if (!hasCollaboratorFacingDeliverables(targetDir)) {
    return [];
  }
  const missionText = readFileIfExists(contextFile(targetDir, "mission.md"));
  if (!missionText) {
    return [];
  }
  const gaps = missingCollaboratorFields(missionText, MISSION_COLLABORATOR_FIELDS);
  if (gaps.length === 0) {
    return [];
  }
  return [`mission context is still skeletal for collaborator-facing reporting: ${gaps.join(", ")}`];
}
|
|
174
|
+
|
|
175
|
+
/**
 * Produces doctor-style issues for an evaluation protocol that is too thin to
 * back collaborator-facing deliverables.
 *
 * Nothing is reported when no deliverable exists yet or when
 * `eval-protocol.md` yields no text.
 *
 * @param {string} targetDir - Project root directory.
 * @returns {string[]} Zero or one issue message naming the unfilled fields.
 */
function collaboratorEvalIssues(targetDir) {
  if (!hasCollaboratorFacingDeliverables(targetDir)) {
    return [];
  }
  const protocolText = readFileIfExists(contextFile(targetDir, "eval-protocol.md"));
  if (!protocolText) {
    return [];
  }
  const gaps = missingCollaboratorFields(protocolText, EVAL_COLLABORATOR_FIELDS);
  if (gaps.length === 0) {
    return [];
  }
  return [`evaluation protocol is still skeletal for collaborator-facing reporting: ${gaps.join(", ")}`];
}
|
|
188
|
+
|
|
189
|
+
/**
 * Looks up a logical report field using the label aliases in REPORT_FIELDS.
 *
 * @param {string} reportText - Report text handed to extractValue().
 * @param {string} key - Key into REPORT_FIELDS; unknown keys search with no labels.
 * @returns {*} Whatever extractValue() returns for the resolved labels.
 */
function extractReportValue(reportText, key) {
  const labels = REPORT_FIELDS[key] || [];
  return extractValue(reportText, labels);
}
|
|
192
|
+
|
|
193
|
+
/**
 * Picks the first meaningful value, preferring what is already recorded.
 *
 * @param {string} existingValue - Current value; kept whenever it is meaningful.
 * @param {...string} candidates - Fallbacks, tried in order.
 * @returns {string} The first meaningful value, or `""` when there is none.
 */
function mergePreferred(existingValue, ...candidates) {
  const firstUsable = [existingValue, ...candidates].find((value) => isMeaningful(value));
  return firstUsable === undefined ? "" : firstUsable;
}
|
|
204
|
+
|
|
205
|
+
/**
 * Returns the first non-blank entry of a `;`- or `,`-separated metric list.
 *
 * @param {string} [metrics] - Separated metric names; falsy input is treated as empty.
 * @returns {string} The first trimmed, non-empty entry, or `""` when there is none.
 */
function firstMetric(metrics) {
  for (const piece of (metrics || "").split(/[;,]/)) {
    const name = piece.trim();
    if (name) {
      return name;
    }
  }
  return "";
}
|
|
211
|
+
|
|
212
|
+
/**
 * Renders the full text of `mission.md` from a flat field map.
 *
 * @param {string} lang - `"zh"` selects the Chinese template; anything else the English one.
 * @param {object} fields - Field values; falsy entries render as the language's placeholder
 *   (`待补充` for Chinese, `TBD` for English).
 * @returns {string} Markdown document ending with a newline.
 */
function buildMissionContextText(lang, fields) {
  const isChinese = lang === "zh";
  const placeholder = isChinese ? "待补充" : "TBD";
  const show = (value) => value || placeholder;

  if (isChinese) {
    return `# 研究主线

## 核心问题

- 一句话问题: ${show(fields.problem)}
- 为什么重要: ${show(fields.whyItMatters)}
- 目标失败场景: ${show(fields.targetFailureCase)}

## 成功标准

- 主指标: ${show(fields.primaryMetric)}
- 成功阈值: ${show(fields.successThreshold)}
- 必须对比的 baseline: ${show(fields.requiredBaselineComparison)}
- 最小证据要求: ${show(fields.minimumEvidenceRequirement)}

## 冻结边界

- 数据集或 benchmark 范围: ${show(fields.datasetScope)}
- 切分策略: ${show(fields.splitPolicy)}
- 评估协议: ${show(fields.evaluationProtocol)}
- 硬约束: ${show(fields.hardConstraints)}

## 当前状态

- 已批准方向: ${show(fields.approvedDirection)}
- 当前 owner 或会话: ${show(fields.currentOwner)}
- 最近一次允许更新 mission 的阶段: ${show(fields.latestStage)}
- 回填来源: ${show(fields.hydrationProvenance)}
- 协作者可读状态: ${show(fields.collaboratorReadyStatus)}
`;
  }

  return `# Research Mission

## Core Problem

- One-sentence problem: ${show(fields.problem)}
- Why it matters: ${show(fields.whyItMatters)}
- Target failure case: ${show(fields.targetFailureCase)}

## Success Criteria

- Primary metric: ${show(fields.primaryMetric)}
- Success threshold: ${show(fields.successThreshold)}
- Required baseline comparison: ${show(fields.requiredBaselineComparison)}
- Minimum evidence requirement: ${show(fields.minimumEvidenceRequirement)}

## Frozen Boundaries

- Dataset or benchmark scope: ${show(fields.datasetScope)}
- Split policy: ${show(fields.splitPolicy)}
- Evaluation protocol: ${show(fields.evaluationProtocol)}
- Hard constraints: ${show(fields.hardConstraints)}

## Current Status

- Approved direction: ${show(fields.approvedDirection)}
- Current owner or session: ${show(fields.currentOwner)}
- Latest stage to update this mission: ${show(fields.latestStage)}
- Hydration provenance: ${show(fields.hydrationProvenance)}
- Collaborator-ready status: ${show(fields.collaboratorReadyStatus)}
`;
}
|
|
277
|
+
|
|
278
|
+
/**
 * Renders the full text of `eval-protocol.md` from a flat field map plus rung definitions.
 *
 * @param {string} lang - `"zh"` selects the Chinese template; anything else the English one.
 * @param {object} fields - Field values; falsy entries render as the language's placeholder.
 * @param {Array<object>} [rungs] - Experiment rungs; each becomes a `### Rung: <id>` block.
 *   Blocks are separated by one blank line and appended after the gate ladder.
 * @returns {string} Markdown document ending with a newline.
 */
function buildEvalProtocolText(lang, fields, rungs) {
  const isChinese = lang === "zh";
  const placeholder = isChinese ? "待补充" : "TBD";
  const show = (value) => value || placeholder;

  // [bullet label, rung property] pairs, in rendering order.
  const rungBullets = isChinese
    ? [
        ["阶段", "stage"],
        ["目标", "goal"],
        ["命令", "command"],
        ["监视目标", "watch"],
        ["gate 命令", "gate"],
        ["通过后", "onPass"],
        ["失败后", "onFail"],
        ["停止后", "onStop"],
      ]
    : [
        ["Stage", "stage"],
        ["Goal", "goal"],
        ["Command", "command"],
        ["Watch", "watch"],
        ["Gate", "gate"],
        ["On pass", "onPass"],
        ["On fail", "onFail"],
        ["On stop", "onStop"],
      ];

  const renderRung = (rung) =>
    [
      `### Rung: ${rung.id}`,
      "",
      ...rungBullets.map(([label, key]) => `- ${label}: ${show(rung[key])}`),
    ].join("\n");

  const rungBlocks = (rungs || []).map(renderRung).join("\n\n");

  if (isChinese) {
    return `# 评估协议

用这份文件定义 \`/lab:run\`、\`/lab:iterate\`、\`/lab:auto\` 和 \`/lab:report\` 共用的论文导向评估目标、主表计划、gate 与 benchmark ladder。

## 主评估目标

- 主评估目标: ${show(fields.primaryEvaluationObjective)}
- 主指标: ${show(fields.primaryMetrics)}
- 次级指标: ${show(fields.secondaryMetrics)}
- 必要终局证据: ${show(fields.requiredTerminalEvidence)}

## 主表计划

- 主表计划: ${show(fields.tablePlan)}
- 每张表必须支撑的 claims: ${show(fields.requiredClaimsPerTable)}

## 指标释义

- 指标释义: ${show(fields.metricGlossary)}
- 回填来源: ${show(fields.hydrationProvenance)}
- 背景来源: ${show(fields.backgroundSources)}
- 方法与基线来源论文: ${show(fields.methodAndBaselineSourcePapers)}
- 方法与基线实现来源: ${show(fields.methodAndBaselineImplementationSource)}
- 指标来源论文: ${show(fields.metricSourcePapers)}
- 指标实现来源: ${show(fields.metricImplementationSource)}
- 对比方法来源论文: ${show(fields.comparisonSourcePapers)}
- 对比方法实现来源: ${show(fields.comparisonImplementationSource)}
- 与原始实现的偏差: ${show(fields.deviationFromOriginalImplementation)}

## Gate Ladder

- 实验阶梯: ${show(fields.experimentLadder)}
- benchmark 阶梯: ${show(fields.benchmarkLadder)}
- 对比方法 gate: ${show(fields.comparisonGate)}
- 升格 gate: ${show(fields.promotionGate)}
- 最小样本量: ${show(fields.minimumSampleSizes)}
- 必要输出工件: ${show(fields.requiredOutputArtifacts)}

${rungBlocks}
`;
  }

  return `# Evaluation Protocol

Use this file to define the paper-facing evaluation target, table plan, gates, and benchmark ladder shared by \`/lab:run\`, \`/lab:iterate\`, \`/lab:auto\`, and \`/lab:report\`.

## Primary Evaluation Objective

- Primary evaluation objective: ${show(fields.primaryEvaluationObjective)}
- Primary metrics: ${show(fields.primaryMetrics)}
- Secondary metrics: ${show(fields.secondaryMetrics)}
- Required terminal evidence: ${show(fields.requiredTerminalEvidence)}

## Table Plan

- Table plan: ${show(fields.tablePlan)}
- Required claims per table: ${show(fields.requiredClaimsPerTable)}

## Metric Glossary

- Metric glossary: ${show(fields.metricGlossary)}
- Hydration provenance: ${show(fields.hydrationProvenance)}
- Background sources: ${show(fields.backgroundSources)}
- Method and baseline source papers: ${show(fields.methodAndBaselineSourcePapers)}
- Method and baseline implementation source: ${show(fields.methodAndBaselineImplementationSource)}
- Metric source papers: ${show(fields.metricSourcePapers)}
- Metric implementation source: ${show(fields.metricImplementationSource)}
- Comparison source papers: ${show(fields.comparisonSourcePapers)}
- Comparison implementation source: ${show(fields.comparisonImplementationSource)}
- Deviation from original implementation: ${show(fields.deviationFromOriginalImplementation)}

## Gate Ladder

- Experiment ladder: ${show(fields.experimentLadder)}
- Benchmark ladder: ${show(fields.benchmarkLadder)}
- Comparison gate: ${show(fields.comparisonGate)}
- Promotion gate: ${show(fields.promotionGate)}
- Minimum sample sizes: ${show(fields.minimumSampleSizes)}
- Required output artifacts: ${show(fields.requiredOutputArtifacts)}

${rungBlocks}
`;
}
|
|
390
|
+
|
|
61
391
|
function extractClaim(text) {
|
|
62
392
|
const blocks = text
|
|
63
393
|
.split(/\n(?=\d+\.\s)/)
|
|
@@ -76,6 +406,264 @@ function labelValue(text, englishLabels, chineseLabels = []) {
|
|
|
76
406
|
return extractValue(text, [...englishLabels, ...chineseLabels]);
|
|
77
407
|
}
|
|
78
408
|
|
|
409
|
+
/**
 * Lists the artifacts available to back-fill canonical context.
 *
 * Deliverables are listed (relative to the project root) when they exist on
 * disk; context files are listed when readFileIfExists() returns something
 * truthy for them.
 *
 * @param {string} targetDir - Project root directory.
 * @returns {string[]} Source paths: report, main tables, then context files.
 */
function collectHydrationSources(targetDir) {
  const { reportPath, mainTablesPath } = getCollaboratorDeliverablePaths(targetDir);
  const sources = [];

  for (const deliverablePath of [reportPath, mainTablesPath]) {
    if (fs.existsSync(deliverablePath)) {
      sources.push(path.relative(targetDir, deliverablePath));
    }
  }

  for (const name of ["data-decisions.md", "state.md", "evidence-index.md"]) {
    if (readFileIfExists(contextFile(targetDir, name))) {
      sources.push(`.lab/context/${name}`);
    }
  }

  return sources.filter(Boolean);
}
|
|
419
|
+
|
|
420
|
+
/**
 * Regenerates `.lab/context/mission.md`, back-filling unfilled fields from
 * the report, the evaluation protocol and other context files.
 *
 * Values already present in mission.md always win; fallbacks are tried in the
 * order they are listed for each field. The whole file is re-rendered from
 * the mission template and written only when the text differs from what was
 * read. Nothing happens unless a collaborator-facing deliverable exists.
 *
 * @param {string} targetDir - Project root directory.
 * @returns {boolean} `true` when mission.md was rewritten.
 */
function hydrateMissionContext(targetDir) {
  if (!hasCollaboratorFacingDeliverables(targetDir)) {
    return false;
  }

  const readContext = (name) => readFileIfExists(contextFile(targetDir, name));

  const lang = readWorkflowLanguage(targetDir);
  const missionText = readContext("mission.md");
  const stateText = readContext("state.md");
  const evidenceText = readContext("evidence-index.md");
  const dataDecisions = readContext("data-decisions.md");
  const reportText = readFileIfExists(getCollaboratorDeliverablePaths(targetDir).reportPath);
  const evalProtocol = parseEvalProtocol(targetDir);
  const hydrationSources = collectHydrationSources(targetDir);
  const autoOutcomeText = readContext("auto-outcome.md");

  const fromMission = (...labels) => extractValue(missionText, labels);
  const fromReport = (key) => extractReportValue(reportText, key);

  const fields = {
    problem: mergePreferred(
      fromMission("One-sentence problem", "一句话问题"),
      fromReport("problem"),
      extractValue(stateText, ["Current objective", "当前目标", "Current objective"])
    ),
    whyItMatters: mergePreferred(fromMission("Why it matters", "为什么重要"), fromReport("whyItMatters")),
    targetFailureCase: fromMission("Target failure case", "目标失败场景"),
    primaryMetric: mergePreferred(
      fromMission("Primary metric", "主指标"),
      evalProtocol.primaryMetrics,
      fromReport("primaryMetrics"),
      firstMetric(evalProtocol.primaryMetrics)
    ),
    successThreshold: mergePreferred(
      fromMission("Success threshold", "成功阈值"),
      extractValue(autoOutcomeText, ["Terminal goal target", "终止目标目标值"])
    ),
    requiredBaselineComparison: fromMission("Required baseline comparison", "必须对比的 baseline"),
    minimumEvidenceRequirement: fromMission("Minimum evidence requirement", "最小证据要求"),
    datasetScope: mergePreferred(
      fromMission("Dataset or benchmark scope", "数据集或 benchmark 范围"),
      extractValue(dataDecisions, ["Approved dataset package", "Approved datasets", "已批准数据集包", "已批准数据集"]),
      fromReport("datasets")
    ),
    splitPolicy: fromMission("Split policy", "切分策略"),
    evaluationProtocol: mergePreferred(
      fromMission("Evaluation protocol", "评估协议"),
      ".lab/context/eval-protocol.md"
    ),
    hardConstraints: fromMission("Hard constraints", "硬约束"),
    approvedDirection: mergePreferred(
      fromMission("Approved direction", "已批准方向"),
      extractClaim(evidenceText)
    ),
    currentOwner: fromMission("Current owner or session", "当前 owner 或会话"),
    latestStage: mergePreferred(
      fromMission("Latest stage to update this mission", "最近一次允许更新 mission 的阶段"),
      extractValue(stateText, ["Active stage", "当前阶段", "Stage"])
    ),
  };

  // Joining an empty list already yields "", which renders as the placeholder.
  fields.hydrationProvenance = hydrationSources.join("; ");

  // Readiness is judged on a draft rendered before the status line itself is filled in.
  const draftText = buildMissionContextText(lang, fields);
  const isComplete = missingCollaboratorFields(draftText, MISSION_COLLABORATOR_FIELDS).length === 0;
  fields.collaboratorReadyStatus = isComplete ? "hydrated" : "artifact-anchored interim";

  const nextText = buildMissionContextText(lang, fields);
  if (nextText === missionText) {
    return false;
  }
  writeContextFile(targetDir, "mission.md", nextText);
  return true;
}
|
|
499
|
+
|
|
500
|
+
/**
 * Regenerates `.lab/context/eval-protocol.md`, back-filling unfilled fields
 * from the mission, the report and data decisions.
 *
 * Values already in the parsed protocol always win; fallbacks are tried in
 * the order listed for each field. Ladder and gate fields are carried over
 * unchanged. The file is written only when the rendered text differs from
 * `protocol.text`. Nothing happens unless a collaborator-facing deliverable
 * exists.
 *
 * @param {string} targetDir - Project root directory.
 * @returns {boolean} `true` when eval-protocol.md was rewritten.
 */
function hydrateEvalProtocol(targetDir) {
  if (!hasCollaboratorFacingDeliverables(targetDir)) {
    return false;
  }

  const lang = readWorkflowLanguage(targetDir);
  const protocol = parseEvalProtocol(targetDir);
  const missionText = readFileIfExists(contextFile(targetDir, "mission.md"));
  const { reportPath, mainTablesPath } = getCollaboratorDeliverablePaths(targetDir);
  const reportText = readFileIfExists(reportPath);
  const dataDecisions = readFileIfExists(contextFile(targetDir, "data-decisions.md"));
  const hydrationSources = collectHydrationSources(targetDir);

  const fromReport = (key) => extractReportValue(reportText, key);

  const hasReport = fs.existsSync(reportPath);
  const hasMainTables = fs.existsSync(mainTablesPath);
  const requiredOutputs = [
    hasReport ? path.relative(targetDir, reportPath) : "",
    hasMainTables ? path.relative(targetDir, mainTablesPath) : "",
    ".lab/context/evidence-index.md",
  ]
    .filter(Boolean)
    .join(", ");

  const reportMetricGuide = joinNonEmpty(
    [fromReport("metricGuidePrimary"), fromReport("metricGuideSecondary"), fromReport("metricGuideSupport")],
    " | "
  );

  const fields = {
    primaryEvaluationObjective: mergePreferred(
      protocol.primaryEvaluationObjective,
      extractValue(missionText, ["One-sentence problem", "一句话问题"]),
      fromReport("problem")
    ),
    primaryMetrics: mergePreferred(
      protocol.primaryMetrics,
      extractValue(missionText, ["Primary metric", "主指标"]),
      fromReport("primaryMetrics")
    ),
    secondaryMetrics: mergePreferred(protocol.secondaryMetrics, fromReport("secondaryMetrics")),
    requiredTerminalEvidence: mergePreferred(
      protocol.requiredTerminalEvidence,
      fromReport("requiredTerminalEvidence"),
      requiredOutputs
    ),
    tablePlan: mergePreferred(
      protocol.tablePlan,
      hasMainTables ? `See ${path.relative(targetDir, mainTablesPath)}` : ""
    ),
    requiredClaimsPerTable: protocol.requiredClaimsPerTable,
    metricGlossary: mergePreferred(protocol.metricGlossary, reportMetricGuide),
    backgroundSources: mergePreferred(
      protocol.backgroundSources,
      fromReport("backgroundSources"),
      extractValue(dataDecisions, ["Papers that used the approved datasets", "使用过已批准数据集的论文", "使用过该数据集的论文"])
    ),
    methodAndBaselineSourcePapers: mergePreferred(
      protocol.methodAndBaselineSourcePapers,
      fromReport("baselineSourcePapers")
    ),
    methodAndBaselineImplementationSource: mergePreferred(
      protocol.methodAndBaselineImplementationSource,
      fromReport("baselineImplementationSources")
    ),
    metricSourcePapers: mergePreferred(protocol.metricSourcePapers, fromReport("metricSourcePapers")),
    metricImplementationSource: mergePreferred(
      protocol.metricImplementationSource,
      fromReport("metricImplementationSource")
    ),
    comparisonSourcePapers: mergePreferred(protocol.comparisonSourcePapers, fromReport("baselineSourcePapers")),
    comparisonImplementationSource: mergePreferred(
      protocol.comparisonImplementationSource,
      fromReport("baselineImplementationSources")
    ),
    deviationFromOriginalImplementation: mergePreferred(
      protocol.deviationFromOriginalImplementation,
      fromReport("metricDeviation")
    ),
    benchmarkLadder: protocol.benchmarkLadder,
    experimentLadder: protocol.experimentLadder,
    comparisonGate: protocol.comparisonGate,
    promotionGate: protocol.promotionGate,
    minimumSampleSizes: protocol.minimumSampleSizes,
    requiredOutputArtifacts: mergePreferred(protocol.requiredOutputArtifacts, requiredOutputs),
    // Joining an empty list already yields "", which renders as the placeholder.
    hydrationProvenance: hydrationSources.join("; "),
  };

  const nextText = buildEvalProtocolText(lang, fields, protocol.experimentRungs);
  if (nextText === protocol.text) {
    return false;
  }
  writeContextFile(targetDir, "eval-protocol.md", nextText);
  return true;
}
|
|
605
|
+
|
|
606
|
+
/**
 * Summarizes whether canonical context is complete enough for a
 * collaborator-facing report.
 *
 * @param {string} targetDir - Project root directory.
 * @returns {{mode: string, readiness: string, reason: string, issues: string[]}}
 *   Interim status (mission issues first, then evaluation issues) when any
 *   issue exists; otherwise the collaborator-ready status.
 */
function getCollaboratorReportStatus(targetDir) {
  const issues = [...collaboratorMissionIssues(targetDir), ...collaboratorEvalIssues(targetDir)];

  if (issues.length === 0) {
    return {
      mode: "collaborator-ready",
      readiness: "ready",
      reason: "canonical mission and evaluation context are complete enough for collaborator-facing reporting",
      issues: [],
    };
  }

  return {
    mode: "artifact-anchored interim",
    readiness: "hydrated but incomplete",
    reason: issues.join(" | "),
    issues,
  };
}
|
|
625
|
+
|
|
626
|
+
/**
 * Replaces a markdown section in place, or appends it when absent.
 *
 * The section starts at a line consisting of `heading` and runs up to (but not
 * including) the next line starting with `## `, or to the end of the text.
 *
 * @param {string} text - Full markdown document.
 * @param {string} heading - Exact heading line, e.g. `"## Report Status"`.
 * @param {string[]} bodyLines - Lines placed under the heading.
 * @returns {string} Updated document. A replaced section stays separated from
 *   the following section by one blank line; a trailing section ends with a
 *   single newline.
 */
function upsertSection(text, heading, bodyLines) {
  const sectionText = `${heading}\n\n${bodyLines.join("\n")}`.trimEnd();
  const escapedHeading = heading.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  // JavaScript has no `\Z` anchor (it matches a literal "Z"), so end-of-input
  // is expressed as "no character follows": (?![\s\S]).
  const pattern = new RegExp(`^${escapedHeading}\\s*$[\\s\\S]*?(?=^##\\s|(?![\\s\\S]))`, "m");
  const match = pattern.exec(text);
  if (!match) {
    return `${text.trimEnd()}\n\n${sectionText}\n`;
  }

  // Splice by index rather than String.prototype.replace so that `$&`, `$1`,
  // ... inside the new section text are inserted literally.
  const before = text.slice(0, match.index);
  const after = text.slice(match.index + match[0].length);
  // The match swallows the blank line(s) before the next heading; restore one.
  return after ? `${before}${sectionText}\n\n${after}` : `${before}${sectionText}\n`;
}
|
|
634
|
+
|
|
635
|
+
/**
 * Write the given collaborator status into the report's "Report Status"
 * section.
 *
 * The section heading is "## Report Status" when the report text begins with
 * a level-one heading, otherwise "# Report Status". The file is rewritten
 * only when the resulting text differs from what is on disk.
 *
 * @param {string} targetDir - Project directory used to resolve the report path.
 * @param {{mode: string, readiness: string, reason: string}} status - Status
 *   whose fields are rendered as the section's bullet lines.
 * @returns {boolean} `true` when the report file was rewritten, `false` when
 *   the report does not exist or was already up to date.
 */
function syncCollaboratorReportStatus(targetDir, status) {
  const { reportPath } = getCollaboratorDeliverablePaths(targetDir);
  if (!fs.existsSync(reportPath)) {
    return false;
  }

  const currentText = fs.readFileSync(reportPath, "utf8");

  // No `m` flag: only a title at the very start of the file counts.
  const startsWithTitle = /^#\s/.test(currentText);
  const heading = startsWithTitle ? "## Report Status" : "# Report Status";

  const statusLines = [
    `- Report mode: ${status.mode}`,
    `- Canonical context readiness: ${status.readiness}`,
    `- Why the current mode is appropriate: ${status.reason}`,
  ];
  const updatedText = upsertSection(currentText, heading, statusLines);

  if (updatedText === currentText) {
    return false;
  }

  fs.writeFileSync(reportPath, updatedText.trimEnd() + "\n");
  return true;
}
|
|
653
|
+
|
|
654
|
+
/**
 * Run the hydration steps in order: mission context, evaluation protocol,
 * then the collaborator report status section.
 *
 * The report status is computed after both hydration calls have run and is
 * then synced into the report.
 *
 * @param {string} targetDir - Project directory passed to every step.
 * @returns {{missionChanged: *, evalChanged: *, reportChanged: boolean, reportStatus: object}}
 *   The result of each step plus the status that was synced.
 */
function hydrateCanonicalContext(targetDir) {
  const missionChanged = hydrateMissionContext(targetDir);
  const evalChanged = hydrateEvalProtocol(targetDir);
  const reportStatus = getCollaboratorReportStatus(targetDir);

  return {
    missionChanged,
    evalChanged,
    reportChanged: syncCollaboratorReportStatus(targetDir, reportStatus),
    reportStatus,
  };
}
|
|
666
|
+
|
|
79
667
|
function renderSummary(lang, data) {
|
|
80
668
|
if (lang === "zh") {
|
|
81
669
|
return `# 研究摘要
|
|
@@ -95,6 +683,9 @@ function renderSummary(lang, data) {
|
|
|
95
683
|
- Auto stop reason: ${data.autoStopReason || "待补充"}
|
|
96
684
|
- Auto final artifact: ${data.autoFinalArtifact || "待补充"}
|
|
97
685
|
- Auto final rung: ${data.autoFinalRung || "待补充"}
|
|
686
|
+
- Collaborator report mode: ${data.reportMode || "待补充"}
|
|
687
|
+
- Canonical context readiness: ${data.reportReadiness || "待补充"}
|
|
688
|
+
- Why this report mode is active: ${data.reportModeReason || "待补充"}
|
|
98
689
|
- Eval objective: ${data.evalObjective || "待补充"}
|
|
99
690
|
- Primary metrics: ${data.evalPrimaryMetrics || "待补充"}
|
|
100
691
|
- Secondary metrics: ${data.evalSecondaryMetrics || "待补充"}
|
|
@@ -153,6 +744,9 @@ function renderSummary(lang, data) {
|
|
|
153
744
|
- Auto stop reason: ${data.autoStopReason || "TBD"}
|
|
154
745
|
- Auto final artifact: ${data.autoFinalArtifact || "TBD"}
|
|
155
746
|
- Auto final rung: ${data.autoFinalRung || "TBD"}
|
|
747
|
+
- Collaborator report mode: ${data.reportMode || "TBD"}
|
|
748
|
+
- Canonical context readiness: ${data.reportReadiness || "TBD"}
|
|
749
|
+
- Why this report mode is active: ${data.reportModeReason || "TBD"}
|
|
156
750
|
- Eval objective: ${data.evalObjective || "TBD"}
|
|
157
751
|
- Primary metrics: ${data.evalPrimaryMetrics || "TBD"}
|
|
158
752
|
- Secondary metrics: ${data.evalSecondaryMetrics || "TBD"}
|
|
@@ -266,6 +860,9 @@ ${data.problem || "待补充"}
|
|
|
266
860
|
- Auto stop reason: ${data.autoStopReason || "待补充"}
|
|
267
861
|
- Auto final artifact: ${data.autoFinalArtifact || "待补充"}
|
|
268
862
|
- Auto final rung: ${data.autoFinalRung || "待补充"}
|
|
863
|
+
- Collaborator report mode: ${data.reportMode || "待补充"}
|
|
864
|
+
- Canonical context readiness: ${data.reportReadiness || "待补充"}
|
|
865
|
+
- Why this report mode is active: ${data.reportModeReason || "待补充"}
|
|
269
866
|
- Eval objective: ${data.evalObjective || "待补充"}
|
|
270
867
|
- Primary metrics: ${data.evalPrimaryMetrics || "待补充"}
|
|
271
868
|
- Secondary metrics: ${data.evalSecondaryMetrics || "待补充"}
|
|
@@ -335,6 +932,9 @@ ${data.problem || "TBD"}
|
|
|
335
932
|
- Auto stop reason: ${data.autoStopReason || "TBD"}
|
|
336
933
|
- Auto final artifact: ${data.autoFinalArtifact || "TBD"}
|
|
337
934
|
- Auto final rung: ${data.autoFinalRung || "TBD"}
|
|
935
|
+
- Collaborator report mode: ${data.reportMode || "TBD"}
|
|
936
|
+
- Canonical context readiness: ${data.reportReadiness || "TBD"}
|
|
937
|
+
- Why this report mode is active: ${data.reportModeReason || "TBD"}
|
|
338
938
|
- Eval objective: ${data.evalObjective || "TBD"}
|
|
339
939
|
- Primary metrics: ${data.evalPrimaryMetrics || "TBD"}
|
|
340
940
|
- Secondary metrics: ${data.evalSecondaryMetrics || "TBD"}
|
|
@@ -381,6 +981,7 @@ ${data.problem || "TBD"}
|
|
|
381
981
|
}
|
|
382
982
|
|
|
383
983
|
function buildContextSnapshot(targetDir) {
|
|
984
|
+
const reportStatus = getCollaboratorReportStatus(targetDir);
|
|
384
985
|
const mission = readFileIfExists(contextFile(targetDir, "mission.md"));
|
|
385
986
|
const state = readFileIfExists(contextFile(targetDir, "state.md"));
|
|
386
987
|
const evidence = readFileIfExists(contextFile(targetDir, "evidence-index.md"));
|
|
@@ -592,6 +1193,9 @@ function buildContextSnapshot(targetDir) {
|
|
|
592
1193
|
autoStopReason: extractValue(autoOutcome, ["Stop reason", "停止原因"]),
|
|
593
1194
|
autoFinalArtifact: extractValue(autoOutcome, ["Final artifact", "最终工件"]),
|
|
594
1195
|
autoFinalRung: extractValue(autoOutcome, ["Final rung", "最终 rung"]),
|
|
1196
|
+
reportMode: reportStatus.mode,
|
|
1197
|
+
reportReadiness: reportStatus.readiness,
|
|
1198
|
+
reportModeReason: reportStatus.reason,
|
|
595
1199
|
evalObjective: evalProtocol.primaryEvaluationObjective,
|
|
596
1200
|
evalPrimaryMetrics: evalProtocol.primaryMetrics,
|
|
597
1201
|
evalSecondaryMetrics: evalProtocol.secondaryMetrics,
|
|
@@ -619,6 +1223,7 @@ function writeContextFile(targetDir, name, content) {
|
|
|
619
1223
|
}
|
|
620
1224
|
|
|
621
1225
|
function refreshContext({ targetDir }) {
|
|
1226
|
+
hydrateCanonicalContext(targetDir);
|
|
622
1227
|
const lang = readWorkflowLanguage(targetDir);
|
|
623
1228
|
const snapshot = buildContextSnapshot(targetDir);
|
|
624
1229
|
writeContextFile(targetDir, "summary.md", renderSummary(lang, snapshot));
|
|
@@ -695,6 +1300,11 @@ function archiveContext({ targetDir, now = new Date() }) {
|
|
|
695
1300
|
|
|
696
1301
|
module.exports = {
|
|
697
1302
|
archiveContext,
|
|
1303
|
+
collaboratorEvalIssues,
|
|
1304
|
+
collaboratorMissionIssues,
|
|
1305
|
+
getCollaboratorReportStatus,
|
|
1306
|
+
hasCollaboratorFacingDeliverables,
|
|
1307
|
+
hydrateCanonicalContext,
|
|
698
1308
|
pruneContext,
|
|
699
1309
|
refreshContext,
|
|
700
1310
|
};
|
package/lib/i18n.cjs
CHANGED
|
@@ -289,7 +289,10 @@ const ZH_SKILL_FILES = {
|
|
|
289
289
|
|
|
290
290
|
## 必要输出
|
|
291
291
|
|
|
292
|
+
- 报告状态:collaborator-ready 或 artifact-anchored interim
|
|
292
293
|
- 给用户看的总结
|
|
294
|
+
- 问题与背景的白话说明
|
|
295
|
+
- 数据集场景说明
|
|
293
296
|
- 方法概述
|
|
294
297
|
- 选定指标摘要
|
|
295
298
|
- 指标白话释义
|
|
@@ -301,6 +304,7 @@ const ZH_SKILL_FILES = {
|
|
|
301
304
|
- 失败尝试
|
|
302
305
|
- 局限性
|
|
303
306
|
- 下一步
|
|
307
|
+
- 单独列出的工件状态,而不是混进已验证结果
|
|
304
308
|
|
|
305
309
|
## 上下文读取
|
|
306
310
|
|
|
@@ -312,6 +316,8 @@ const ZH_SKILL_FILES = {
|
|
|
312
316
|
|
|
313
317
|
## 上下文写回
|
|
314
318
|
|
|
319
|
+
- \`.lab/context/mission.md\`
|
|
320
|
+
- \`.lab/context/eval-protocol.md\`
|
|
315
321
|
- \`.lab/context/state.md\`
|
|
316
322
|
- \`.lab/context/evidence-index.md\`
|
|
317
323
|
|
|
@@ -324,7 +330,11 @@ const ZH_SKILL_FILES = {
|
|
|
324
330
|
- 必须把已批准的主指标、次级指标和必要终局证据明确写进 \`report.md\` 与受管的 \`main-tables.md\`。
|
|
325
331
|
- 必须用白话解释选定的主指标和次级指标:每个指标在衡量什么、越高还是越低更好、它是主结果指标还是健康度/支持性指标。
|
|
326
332
|
- 如果出现 coverage、completeness、confidence 或类似健康度指标,必须明确说明这类指标回答的是“实验是否跑稳、证据是否完整”,而不是主要科学效应本身。
|
|
333
|
+
- 在起草报告前,先检查 \`.lab/context/mission.md\` 和 \`.lab/context/eval-protocol.md\` 是否仍是模板空壳。
|
|
334
|
+
- 如果 canonical context 还是空壳,要先根据 frozen result artifacts、data-decisions、evidence-index 和已批准上下文回填“最小可信版本”,再写报告。
|
|
335
|
+
- 如果回填后仍缺少协作者可读所需的关键字段,就必须把输出降级成 \`artifact-anchored interim report\`,不能冒充最终协作者报告。
|
|
327
336
|
- 如果报告依赖了对原始指标或原始实现的偏差,必须明确写出这个偏差。
|
|
337
|
+
- workflow 工件状态、rerun id 或 LaTeX 骨架状态不能混进“已验证主结果”;这些内容必须单列到工件状态部分。
|
|
328
338
|
- 如果 workflow language 是中文,\`report.md\` 和 \`<deliverables_root>/main-tables.md\` 也应使用中文,除非文件路径、代码标识符或字面指标名必须保持原样。
|
|
329
339
|
- 解释优先保守,不要写成营销文案。
|
|
330
340
|
- 要给 \`/lab:write\` 留下清晰 handoff,尤其是 section draft 可以直接引用的证据链接。
|
|
@@ -691,6 +701,12 @@ const ZH_SKILL_FILES = {
|
|
|
691
701
|
[path.join(".lab", ".managed", "templates", "final-report.md")]:
|
|
692
702
|
`# 最终报告
|
|
693
703
|
|
|
704
|
+
## 报告状态
|
|
705
|
+
|
|
706
|
+
- 报告模式:collaborator-ready 或 artifact-anchored interim
|
|
707
|
+
- canonical context 完整度:
|
|
708
|
+
- 为什么当前只能用这个模式:
|
|
709
|
+
|
|
694
710
|
## 给用户看的总结
|
|
695
711
|
|
|
696
712
|
- 一句话结论:
|
|
@@ -698,6 +714,18 @@ const ZH_SKILL_FILES = {
|
|
|
698
714
|
- 还没有被证明的内容:
|
|
699
715
|
- 当前最大报告风险:
|
|
700
716
|
|
|
717
|
+
## 问题与背景
|
|
718
|
+
|
|
719
|
+
- 这项研究在解决什么问题:
|
|
720
|
+
- 为什么这个问题重要:
|
|
721
|
+
- 当前报告到底覆盖了什么 setting 或 workflow:
|
|
722
|
+
|
|
723
|
+
## 数据集场景说明
|
|
724
|
+
|
|
725
|
+
- 数据集或 benchmark 1 代表什么真实场景:
|
|
726
|
+
- 数据集或 benchmark 2 代表什么真实场景:
|
|
727
|
+
- 数据集或 benchmark 3 代表什么真实场景:
|
|
728
|
+
|
|
701
729
|
## 选定指标
|
|
702
730
|
|
|
703
731
|
- 主指标:
|
|
@@ -740,6 +768,11 @@ const ZH_SKILL_FILES = {
|
|
|
740
768
|
- 最终表现摘要:
|
|
741
769
|
- 主表覆盖情况:
|
|
742
770
|
|
|
771
|
+
## 工件状态
|
|
772
|
+
|
|
773
|
+
- 已就绪的交付物或工作流工件:
|
|
774
|
+
- 这些工件状态为什么不是科学结论:
|
|
775
|
+
|
|
743
776
|
## 主要结果
|
|
744
777
|
|
|
745
778
|
- 主要发现 1:
|
|
@@ -953,6 +986,7 @@ const ZH_SKILL_FILES = {
|
|
|
953
986
|
- 一句话问题:
|
|
954
987
|
- 为什么重要:
|
|
955
988
|
- 目标失败场景:
|
|
989
|
+
- 回填来源:
|
|
956
990
|
|
|
957
991
|
## 成功标准
|
|
958
992
|
|
|
@@ -973,6 +1007,7 @@ const ZH_SKILL_FILES = {
|
|
|
973
1007
|
- 已批准方向:
|
|
974
1008
|
- 当前 owner 或会话:
|
|
975
1009
|
- 最近更新该 mission 的 stage:
|
|
1010
|
+
- 协作者可读状态:
|
|
976
1011
|
`,
|
|
977
1012
|
[path.join(".lab", "context", "state.md")]:
|
|
978
1013
|
`# 工作流状态
|
|
@@ -1966,6 +2001,7 @@ ZH_CONTENT[path.join(".lab", "context", "eval-protocol.md")] = `# 评估协议
|
|
|
1966
2001
|
## 指标释义
|
|
1967
2002
|
|
|
1968
2003
|
- 指标释义:
|
|
2004
|
+
- 回填来源:
|
|
1969
2005
|
- 背景来源:
|
|
1970
2006
|
- 方法与基线来源论文:
|
|
1971
2007
|
- 方法与基线实现来源:
|
|
@@ -2021,10 +2057,12 @@ ZH_CONTENT[path.join(".codex", "skills", "lab", "stages", "auto.md")] = `# \`/la
|
|
|
2021
2057
|
|
|
2022
2058
|
## 上下文写回
|
|
2023
2059
|
|
|
2060
|
+
- \`.lab/context/mission.md\`
|
|
2024
2061
|
- \`.lab/context/state.md\`
|
|
2025
2062
|
- \`.lab/context/decisions.md\`
|
|
2026
2063
|
- \`.lab/context/data-decisions.md\`
|
|
2027
2064
|
- \`.lab/context/evidence-index.md\`
|
|
2065
|
+
- \`.lab/context/eval-protocol.md\`
|
|
2028
2066
|
- \`.lab/context/summary.md\`
|
|
2029
2067
|
- \`.lab/context/session-brief.md\`
|
|
2030
2068
|
- \`.lab/context/auto-status.md\`
|
|
@@ -2053,6 +2091,8 @@ ZH_CONTENT[path.join(".codex", "skills", "lab", "stages", "auto.md")] = `# \`/la
|
|
|
2053
2091
|
- \`review\` 更新规范审查上下文
|
|
2054
2092
|
- \`report\` 写出 \`<deliverables_root>/report.md\`
|
|
2055
2093
|
- \`write\` 写出 \`<deliverables_root>/paper/\` 下的 LaTeX 产物
|
|
2094
|
+
- 如果即将进入 \`report\`,而 \`.lab/context/mission.md\` 或 \`.lab/context/eval-protocol.md\` 仍是模板空壳,就先根据冻结工件和已批准上下文回填最小可信版本。
|
|
2095
|
+
- 如果回填后仍缺少协作者可读所需的关键字段,就必须强制生成 \`artifact-anchored interim report\`,不能冒充最终协作者报告。
|
|
2056
2096
|
- promotion 成功后,必须写回 \`data-decisions.md\`、\`decisions.md\`、\`state.md\` 和 \`session-brief.md\`。
|
|
2057
2097
|
- 如果某个指标或对比 claim 在评估协议里没有带来源的定义,就不能拿它做 stop 或 promotion 判断。
|
|
2058
2098
|
|
|
@@ -1,5 +1,11 @@
|
|
|
1
1
|
# Final Report
|
|
2
2
|
|
|
3
|
+
## Report Status
|
|
4
|
+
|
|
5
|
+
- Report mode: collaborator-ready or artifact-anchored interim
|
|
6
|
+
- Canonical context readiness:
|
|
7
|
+
- Why the current mode is appropriate:
|
|
8
|
+
|
|
3
9
|
## Reader Summary
|
|
4
10
|
|
|
5
11
|
- One-sentence conclusion:
|
|
@@ -7,6 +13,18 @@
|
|
|
7
13
|
- What is still unproven:
|
|
8
14
|
- Biggest reporting risk:
|
|
9
15
|
|
|
16
|
+
## Problem and Background
|
|
17
|
+
|
|
18
|
+
- Research problem in plain language:
|
|
19
|
+
- Why this problem matters:
|
|
20
|
+
- What setting or workflow this report is actually about:
|
|
21
|
+
|
|
22
|
+
## Dataset Scene Notes
|
|
23
|
+
|
|
24
|
+
- Dataset or benchmark 1 and what real-world setting it represents:
|
|
25
|
+
- Dataset or benchmark 2 and what real-world setting it represents:
|
|
26
|
+
- Dataset or benchmark 3 and what real-world setting it represents:
|
|
27
|
+
|
|
10
28
|
## Selected Metrics
|
|
11
29
|
|
|
12
30
|
- Primary metrics:
|
|
@@ -56,6 +74,11 @@
|
|
|
56
74
|
- Final performance summary:
|
|
57
75
|
- Table coverage:
|
|
58
76
|
|
|
77
|
+
## Artifact Status
|
|
78
|
+
|
|
79
|
+
- Deliverables or workflow artifacts that are ready:
|
|
80
|
+
- Artifact status notes that are not scientific findings:
|
|
81
|
+
|
|
59
82
|
## Main Results
|
|
60
83
|
|
|
61
84
|
Summarize validated iteration outcomes.
|
|
@@ -17,6 +17,7 @@ Use this file to define the paper-facing evaluation objective, table plan, gates
|
|
|
17
17
|
## Metric Glossary
|
|
18
18
|
|
|
19
19
|
- Metric glossary:
|
|
20
|
+
- Hydration provenance:
|
|
20
21
|
- Background sources:
|
|
21
22
|
- Method and baseline source papers:
|
|
22
23
|
- Method and baseline implementation source:
|
|
@@ -5,6 +5,7 @@
|
|
|
5
5
|
- One-sentence problem:
|
|
6
6
|
- Why it matters:
|
|
7
7
|
- Target failure case:
|
|
8
|
+
- Hydration provenance:
|
|
8
9
|
|
|
9
10
|
## Success Criteria
|
|
10
11
|
|
|
@@ -25,3 +26,4 @@
|
|
|
25
26
|
- Approved direction:
|
|
26
27
|
- Current owner or session:
|
|
27
28
|
- Last stage that updated this mission:
|
|
29
|
+
- Collaborator-ready status:
|
|
@@ -46,6 +46,7 @@ Use this skill when the user invokes `/lab:*` or asks for the structured researc
|
|
|
46
46
|
- Keep an explicit approval gate before `/lab:spec`.
|
|
47
47
|
- Write idea artifacts with the template in `.lab/.managed/templates/idea.md`.
|
|
48
48
|
- Update `.lab/context/mission.md`, `.lab/context/decisions.md`, and `.lab/context/open-questions.md` after convergence.
|
|
49
|
+
- Do not leave `.lab/context/mission.md` as a template shell once the problem statement and approved direction are known.
|
|
49
50
|
- Do not implement code in this stage.
|
|
50
51
|
|
|
51
52
|
### `/lab:data`
|
|
@@ -111,7 +112,8 @@ Use this skill when the user invokes `/lab:*` or asks for the structured researc
|
|
|
111
112
|
- Normalize the result with `.lab/.managed/scripts/eval_report.py`.
|
|
112
113
|
- Validate normalized output with `.lab/.managed/scripts/validate_results.py`.
|
|
113
114
|
- Read `.lab/context/eval-protocol.md` before choosing the smallest run so the first experiment already targets the approved tables, metrics, and gates.
|
|
114
|
-
- Update `.lab/context/state.md
|
|
115
|
+
- Update `.lab/context/state.md`, `.lab/context/evidence-index.md`, and `.lab/context/eval-protocol.md` after the run.
|
|
116
|
+
- If the evaluation protocol is still skeletal, initialize the smallest trustworthy source-backed version before treating the run as the protocol anchor.
|
|
115
117
|
|
|
116
118
|
### `/lab:iterate`
|
|
117
119
|
|
|
@@ -131,7 +133,8 @@ Use this skill when the user invokes `/lab:*` or asks for the structured researc
|
|
|
131
133
|
- Keep metric definitions, baseline behavior, and comparison implementations anchored to the source-backed evaluation protocol before changing thresholds, gates, or ladder transitions.
|
|
132
134
|
- Switch to diagnostic mode if risk increases for two consecutive rounds.
|
|
133
135
|
- Write round reports with `.lab/.managed/templates/iteration-report.md`.
|
|
134
|
-
- Update `.lab/context/state.md`, `.lab/context/decisions.md`, `.lab/context/evidence-index.md`,
|
|
136
|
+
- Update `.lab/context/state.md`, `.lab/context/decisions.md`, `.lab/context/evidence-index.md`, `.lab/context/open-questions.md`, and `.lab/context/eval-protocol.md` each round as needed.
|
|
137
|
+
- Keep `.lab/context/eval-protocol.md` synchronized with accepted ladder changes, benchmark scope, and source-backed implementation deviations.
|
|
135
138
|
- Stop at threshold success or iteration cap, and record blockers plus next-best actions when the campaign ends without success.
|
|
136
139
|
|
|
137
140
|
### `/lab:review`
|
|
@@ -153,7 +156,9 @@ Use this skill when the user invokes `/lab:*` or asks for the structured researc
|
|
|
153
156
|
- Aggregate them with `.lab/.managed/scripts/summarize_iterations.py`.
|
|
154
157
|
- Write the final document with `.lab/.managed/templates/final-report.md` and the managed table summary with `.lab/.managed/templates/main-tables.md`.
|
|
155
158
|
- Keep failed attempts and limitations visible.
|
|
156
|
-
- Update `.lab/context/state.md
|
|
159
|
+
- Update `.lab/context/mission.md`, `.lab/context/eval-protocol.md`, `.lab/context/state.md`, and `.lab/context/evidence-index.md` with report-level handoff notes.
|
|
160
|
+
- If canonical context is still skeletal, hydrate the smallest trustworthy version from frozen artifacts before finalizing the report.
|
|
161
|
+
- If collaborator-critical fields remain missing after hydration, downgrade to an `artifact-anchored interim report` instead of presenting a final collaborator-ready report.
|
|
157
162
|
|
|
158
163
|
### `/lab:write`
|
|
159
164
|
|
|
@@ -24,6 +24,8 @@
|
|
|
24
24
|
|
|
25
25
|
## Context Write Set
|
|
26
26
|
|
|
27
|
+
- `.lab/context/mission.md`
|
|
28
|
+
- `.lab/context/eval-protocol.md`
|
|
27
29
|
- `.lab/context/state.md`
|
|
28
30
|
- `.lab/context/decisions.md`
|
|
29
31
|
- `.lab/context/data-decisions.md`
|
|
@@ -63,6 +65,8 @@
|
|
|
63
65
|
- keep the session alive while the current rung is running
|
|
64
66
|
- write the current rung, watch target, and next rung to `.lab/context/auto-status.md`
|
|
65
67
|
- Reuse the existing `/lab:run`, `/lab:iterate`, `/lab:review`, `/lab:report`, and optional `/lab:write` contracts instead of inventing a parallel workflow.
|
|
68
|
+
- If the loop is about to reach `report` while `.lab/context/mission.md` or `.lab/context/eval-protocol.md` is still skeletal, hydrate the smallest trustworthy canonical version from frozen artifacts and approved context before drafting the report.
|
|
69
|
+
- If hydration still leaves collaborator-critical fields blank, force `report` to emit an `artifact-anchored interim report` instead of a collaborator-ready final report.
|
|
66
70
|
- Enforce stage contracts, not just exit codes:
|
|
67
71
|
- `run` and `iterate` must change persistent outputs under `results_root`
|
|
68
72
|
- `review` must update canonical review context
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
## Required Output
|
|
4
4
|
|
|
5
5
|
- one-sentence problem statement
|
|
6
|
+
- why the problem matters in plain language
|
|
6
7
|
- failure case
|
|
7
8
|
- idea classification
|
|
8
9
|
- contribution category
|
|
@@ -18,6 +19,7 @@
|
|
|
18
19
|
- critique before convergence
|
|
19
20
|
- minimum viable experiment
|
|
20
21
|
- explicit approval gate before `/lab:spec`
|
|
22
|
+
- canonical mission context updated with the approved problem, importance, failure case, and direction
|
|
21
23
|
|
|
22
24
|
## Evidence Discipline
|
|
23
25
|
|
|
@@ -60,3 +62,4 @@
|
|
|
60
62
|
- State why the target problem matters before talking about the method.
|
|
61
63
|
- Compare against existing methods explicitly, not by vague novelty language.
|
|
62
64
|
- The three meaningful points should each fit in one direct sentence.
|
|
65
|
+
- Do not leave `.lab/context/mission.md` as an empty template after convergence; write the approved problem, why it matters, the current benchmark scope, and the approved direction back into canonical context.
|
|
@@ -29,6 +29,7 @@ Declare and keep fixed:
|
|
|
29
29
|
- `.lab/context/decisions.md`
|
|
30
30
|
- `.lab/context/evidence-index.md`
|
|
31
31
|
- `.lab/context/open-questions.md`
|
|
32
|
+
- `.lab/context/eval-protocol.md`
|
|
32
33
|
|
|
33
34
|
## Per-Round Output
|
|
34
35
|
|
|
@@ -62,6 +63,7 @@ If the loop stops without success, record:
|
|
|
62
63
|
- Do not accumulate long-lived results under `.lab/changes/<change-id>/runs`.
|
|
63
64
|
- Do not change metric definitions, baseline semantics, or comparison implementations unless the approved evaluation protocol records both their sources and any deviations.
|
|
64
65
|
- When you change ladders, sample sizes, or promotion gates, keep the resulting logic anchored to the source-backed evaluation protocol instead of ad-hoc chat reasoning.
|
|
66
|
+
- Keep `.lab/context/eval-protocol.md` synchronized with the active benchmark scope, ladder gates, source-backed metric definitions, and any accepted implementation deviations instead of leaving it as a stale template.
|
|
65
67
|
|
|
66
68
|
## Interaction Contract
|
|
67
69
|
|
|
@@ -2,7 +2,10 @@
|
|
|
2
2
|
|
|
3
3
|
## Required Output
|
|
4
4
|
|
|
5
|
+
- report status: collaborator-ready or artifact-anchored interim
|
|
5
6
|
- reader summary for the user
|
|
7
|
+
- problem and background in plain language
|
|
8
|
+
- dataset scene notes in plain language
|
|
6
9
|
- method overview
|
|
7
10
|
- selected metrics summary
|
|
8
11
|
- plain-language metric guide
|
|
@@ -17,6 +20,7 @@
|
|
|
17
20
|
- failed attempts
|
|
18
21
|
- limitations
|
|
19
22
|
- next steps
|
|
23
|
+
- artifact status kept separate from validated findings
|
|
20
24
|
|
|
21
25
|
## Context Read Set
|
|
22
26
|
|
|
@@ -30,6 +34,8 @@
|
|
|
30
34
|
|
|
31
35
|
## Context Write Set
|
|
32
36
|
|
|
37
|
+
- `.lab/context/mission.md`
|
|
38
|
+
- `.lab/context/eval-protocol.md`
|
|
33
39
|
- `.lab/context/state.md`
|
|
34
40
|
- `.lab/context/evidence-index.md`
|
|
35
41
|
|
|
@@ -45,6 +51,10 @@
|
|
|
45
51
|
- Pull the core background references, method or baseline references, and metric references out of the approved evaluation protocol instead of hiding them in `.lab/context/*`.
|
|
46
52
|
- Report only the few references a collaborator needs to orient themselves quickly; do not turn `report.md` into a full bibliography dump.
|
|
47
53
|
- If the report depends on a deviation from an original metric or implementation, state that deviation explicitly instead of smoothing it over.
|
|
54
|
+
- Before drafting the report, inspect `.lab/context/mission.md` and `.lab/context/eval-protocol.md` for skeletal template fields.
|
|
55
|
+
- If either canonical context file is still skeletal, hydrate the smallest trustworthy version from frozen result artifacts, dataset decisions, evidence-index, and prior approved context, and write that back before finalizing the report.
|
|
56
|
+
- If collaborator-critical fields still remain missing after hydration, downgrade the output to an `artifact-anchored interim report` instead of presenting it as a final collaborator-ready report.
|
|
57
|
+
- Do not mix workflow deliverable status, rerun ids, or manuscript skeleton status into validated scientific findings; keep those in a separate artifact-status section.
|
|
48
58
|
- If `.lab/config/workflow.json` sets the workflow language to Chinese, write `report.md` and `<deliverables_root>/main-tables.md` in Chinese unless a file path, code identifier, or literal metric name must remain unchanged.
|
|
49
59
|
- Prefer conservative interpretation over marketing language.
|
|
50
60
|
- Leave a clear handoff path into `/lab:write` with evidence links that section drafts can cite.
|
|
@@ -6,6 +6,7 @@
|
|
|
6
6
|
- run registry entry
|
|
7
7
|
- normalized evaluation summary
|
|
8
8
|
- validation result for the normalized summary
|
|
9
|
+
- canonical evaluation context initialized or refined when the active protocol is still skeletal
|
|
9
10
|
|
|
10
11
|
## Context Read Set
|
|
11
12
|
|
|
@@ -19,6 +20,7 @@
|
|
|
19
20
|
|
|
20
21
|
- `.lab/context/state.md`
|
|
21
22
|
- `.lab/context/evidence-index.md`
|
|
23
|
+
- `.lab/context/eval-protocol.md`
|
|
22
24
|
|
|
23
25
|
## Constraints
|
|
24
26
|
|
|
@@ -26,6 +28,7 @@
|
|
|
26
28
|
- Fail fast on data, environment, or metric wiring problems.
|
|
27
29
|
- Tie the run to the approved evaluation protocol, not just an ad-hoc chat goal.
|
|
28
30
|
- Do not invent metric definitions, baseline behavior, or comparison implementations from memory; anchor them to the approved evaluation protocol and its recorded sources.
|
|
31
|
+
- If `.lab/context/eval-protocol.md` is still skeletal, write the smallest trustworthy version of the current evaluation objective, metric set, ladder, and source-backed implementation notes before treating the run as the new protocol anchor.
|
|
29
32
|
- Record the exact launch command and output location.
|
|
30
33
|
- Write durable run outputs, logs, and checkpoints under `results_root`.
|
|
31
34
|
- Write figures or plots under `figures_root`.
|