superlab 0.1.17 → 0.1.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/bin/superlab.cjs CHANGED
@@ -12,6 +12,8 @@ const {
12
12
  } = require("../lib/install.cjs");
13
13
  const {
14
14
  archiveContext,
15
+ collaboratorEvalIssues,
16
+ collaboratorMissionIssues,
15
17
  pruneContext,
16
18
  refreshContext,
17
19
  } = require("../lib/context.cjs");
@@ -788,6 +790,8 @@ function printDoctor(options) {
788
790
  const templateIssues = validatePaperTemplateRoot(options.targetDir, config);
789
791
  const dataDecisionIssues = validateDataDecisions(options.targetDir);
790
792
  const evalProtocolIssues = validateEvalProtocol(options.targetDir);
793
+ const missionContextIssues = collaboratorMissionIssues(options.targetDir);
794
+ const collaboratorProtocolIssues = collaboratorEvalIssues(options.targetDir);
791
795
  const rootIssues = validateProjectRoots(options.targetDir, config);
792
796
  const autoStatus = getAutoStatus({ targetDir: options.targetDir });
793
797
  const autoIssues = autoStatus.issues;
@@ -806,6 +810,8 @@ function printDoctor(options) {
806
810
  templateIssues.length > 0 ||
807
811
  dataDecisionIssues.length > 0 ||
808
812
  evalProtocolIssues.length > 0 ||
813
+ missionContextIssues.length > 0 ||
814
+ collaboratorProtocolIssues.length > 0 ||
809
815
  rootIssues.length > 0 ||
810
816
  autoIssues.length > 0
811
817
  ) {
@@ -820,6 +826,8 @@ function printDoctor(options) {
820
826
  templateIssues,
821
827
  dataDecisionIssues,
822
828
  evalProtocolIssues,
829
+ missionContextIssues,
830
+ collaboratorProtocolIssues,
823
831
  rootIssues,
824
832
  autoIssues
825
833
  );
@@ -253,7 +253,7 @@ async function startAutoMode({ targetDir, now = new Date() }) {
253
253
  const maxFailures = parseInteger(mode.maxFailures, 0);
254
254
  const maxIterations = parseInteger(mode.maxIterations, 1);
255
255
  const requiredArtifact = resolveRequiredArtifact(targetDir, mode.requiredTerminalArtifact);
256
- const frozenCoreSnapshot = snapshotFrozenCore(targetDir, mode.frozenCore);
256
+ let frozenCoreSnapshot = snapshotFrozenCore(targetDir, mode.frozenCore);
257
257
  const { loopStages, finalStages } = splitAutoStages(mode.allowedStages);
258
258
  const executedStages = [];
259
259
  let failureCount = 0;
@@ -270,6 +270,9 @@ async function startAutoMode({ targetDir, now = new Date() }) {
270
270
  requiredTerminalEvidence: evalProtocol.requiredTerminalEvidence,
271
271
  experimentLadder: evalProtocol.experimentLadder,
272
272
  metricGlossary: evalProtocol.metricGlossary,
273
+ backgroundSources: evalProtocol.backgroundSources,
274
+ methodAndBaselineSourcePapers: evalProtocol.methodAndBaselineSourcePapers,
275
+ methodAndBaselineImplementationSource: evalProtocol.methodAndBaselineImplementationSource,
273
276
  metricSourcePapers: evalProtocol.metricSourcePapers,
274
277
  metricImplementationSource: evalProtocol.metricImplementationSource,
275
278
  comparisonSourcePapers: evalProtocol.comparisonSourcePapers,
@@ -360,13 +363,14 @@ async function startAutoMode({ targetDir, now = new Date() }) {
360
363
  nextRung,
361
364
  decision: rungId ? `completed rung ${rungId}` : `completed stage ${stage}`,
362
365
  });
363
- refreshContext({ targetDir });
364
-
365
366
  const frozenCoreChanges = detectFrozenCoreChanges(frozenCoreSnapshot);
366
367
  if (frozenCoreChanges.length > 0) {
367
368
  failAutoMode(`frozen core changed: ${frozenCoreChanges.join(", ")}`);
368
369
  }
369
370
 
371
+ refreshContext({ targetDir });
372
+ frozenCoreSnapshot = snapshotFrozenCore(targetDir, mode.frozenCore);
373
+
370
374
  const stopCheck = await runCheckCommand({
371
375
  targetDir,
372
376
  label: `stop check after ${rungId || stage}`,
@@ -434,12 +438,13 @@ async function startAutoMode({ targetDir, now = new Date() }) {
434
438
  decision: `promotion policy matched after ${label}`,
435
439
  });
436
440
  promotionApplied = true;
437
- refreshContext({ targetDir });
438
- verifyPromotionWriteback(targetDir, promotionSnapshot);
439
441
  const frozenCoreChangesAfterPromotion = detectFrozenCoreChanges(frozenCoreSnapshot);
440
442
  if (frozenCoreChangesAfterPromotion.length > 0) {
441
443
  failAutoMode(`frozen core changed: ${frozenCoreChangesAfterPromotion.join(", ")}`);
442
444
  }
445
+ refreshContext({ targetDir });
446
+ verifyPromotionWriteback(targetDir, promotionSnapshot);
447
+ frozenCoreSnapshot = snapshotFrozenCore(targetDir, mode.frozenCore);
443
448
  };
444
449
 
445
450
  if (evalProtocol.experimentRungs.length > 0) {
@@ -755,6 +760,9 @@ function stopAutoMode({ targetDir, now = new Date() }) {
755
760
  requiredTerminalEvidence: evalProtocol.requiredTerminalEvidence,
756
761
  experimentLadder: evalProtocol.experimentLadder,
757
762
  metricGlossary: evalProtocol.metricGlossary,
763
+ backgroundSources: evalProtocol.backgroundSources,
764
+ methodAndBaselineSourcePapers: evalProtocol.methodAndBaselineSourcePapers,
765
+ methodAndBaselineImplementationSource: evalProtocol.methodAndBaselineImplementationSource,
758
766
  metricSourcePapers: evalProtocol.metricSourcePapers,
759
767
  metricImplementationSource: evalProtocol.metricImplementationSource,
760
768
  comparisonSourcePapers: evalProtocol.comparisonSourcePapers,
@@ -146,6 +146,9 @@ function renderAutoOutcome(outcome, { lang = "en" } = {}) {
146
146
  - 必要终局证据: ${outcome.requiredTerminalEvidence || ""}
147
147
  - 实验阶梯: ${outcome.experimentLadder || ""}
148
148
  - 指标释义: ${outcome.metricGlossary || ""}
149
+ - 背景来源: ${outcome.backgroundSources || ""}
150
+ - 方法与基线来源论文: ${outcome.methodAndBaselineSourcePapers || ""}
151
+ - 方法与基线实现来源: ${outcome.methodAndBaselineImplementationSource || ""}
149
152
  - 指标来源论文: ${outcome.metricSourcePapers || ""}
150
153
  - 指标实现来源: ${outcome.metricImplementationSource || ""}
151
154
  - 对比方法来源论文: ${outcome.comparisonSourcePapers || ""}
@@ -180,6 +183,9 @@ function renderAutoOutcome(outcome, { lang = "en" } = {}) {
180
183
  - Required terminal evidence: ${outcome.requiredTerminalEvidence || ""}
181
184
  - Experiment ladder: ${outcome.experimentLadder || ""}
182
185
  - Metric glossary: ${outcome.metricGlossary || ""}
186
+ - Background sources: ${outcome.backgroundSources || ""}
187
+ - Method and baseline source papers: ${outcome.methodAndBaselineSourcePapers || ""}
188
+ - Method and baseline implementation source: ${outcome.methodAndBaselineImplementationSource || ""}
183
189
  - Metric source papers: ${outcome.metricSourcePapers || ""}
184
190
  - Metric implementation source: ${outcome.metricImplementationSource || ""}
185
191
  - Comparison source papers: ${outcome.comparisonSourcePapers || ""}
package/lib/context.cjs CHANGED
@@ -2,6 +2,64 @@ const fs = require("node:fs");
2
2
  const path = require("node:path");
3
3
  const { parseEvalProtocol } = require("./eval_protocol.cjs");
4
4
 
5
// Normalized (trimmed, lower-cased) values that count as "not filled in yet".
const PLACEHOLDER_VALUES = new Set(["", "tbd", "none", "待补充", "无"]);

// mission.md fields checked for collaborator-facing reporting.
// Each entry is [English label, Chinese label]; the English label doubles as the field name.
const MISSION_COLLABORATOR_FIELDS = [
  ["One-sentence problem", "一句话问题"],
  ["Why it matters", "为什么重要"],
  ["Primary metric", "主指标"],
  ["Success threshold", "成功阈值"],
  ["Dataset or benchmark scope", "数据集或 benchmark 范围"],
  ["Approved direction", "已批准方向"],
].map(([english, chinese]) => ({ name: english, labels: [english, chinese] }));

// eval-protocol.md fields checked for collaborator-facing reporting (same layout as above).
const EVAL_COLLABORATOR_FIELDS = [
  ["Primary evaluation objective", "主评估目标"],
  ["Primary metrics", "主指标"],
  ["Secondary metrics", "次级指标"],
  ["Table plan", "主表计划"],
  ["Metric glossary", "指标释义"],
  ["Background sources", "背景来源"],
  ["Method and baseline source papers", "方法与基线来源论文"],
  ["Method and baseline implementation source", "方法与基线实现来源"],
  ["Metric source papers", "指标来源论文"],
  ["Required output artifacts", "必要输出工件"],
].map(([english, chinese]) => ({ name: english, labels: [english, chinese] }));

// Lookup key -> accepted labels (English first, then Chinese variants) used when
// pulling a value out of the collaborator-facing report.
const REPORT_FIELDS = {
  problem: ["Research problem in plain language", "研究问题白话解释", "研究问题"],
  whyItMatters: ["Why this problem matters", "为什么这个问题重要"],
  setting: [
    "What setting or workflow this report is actually about",
    "这份报告实际对应的场景或流程",
  ],
  primaryMetrics: ["Primary metrics", "主指标"],
  secondaryMetrics: ["Secondary metrics", "次级指标"],
  requiredTerminalEvidence: ["Required terminal evidence", "必要终局证据"],
  metricGuidePrimary: ["Primary metric plain-language explanation", "主指标白话解释"],
  metricGuideSecondary: ["Secondary metric plain-language explanation", "次级指标白话解释"],
  metricGuideSupport: [
    "Health or support metrics and why they are not the main claim",
    "健康度或支持性指标以及它们为什么不是主 claim",
  ],
  backgroundSources: [
    "Most important background papers or benchmark references",
    "最重要的背景论文或 benchmark 参考",
  ],
  backgroundAnchors: ["Why these are the right background anchors", "为什么这些是合适的背景锚点"],
  methodBasis: ["Our method source or implementation basis", "我们的方法来源或实现基础"],
  baselineSourcePapers: ["Baseline and comparison source papers", "基线与对比方法来源论文"],
  baselineImplementationSources: [
    "Baseline and comparison implementation sources",
    "基线与对比方法实现来源",
  ],
  metricSourcePapers: ["Metric source papers", "指标来源论文"],
  metricImplementationSource: ["Metric implementation source", "指标实现来源"],
  metricDeviation: ["Deviation from original implementation", "与原始实现的偏差"],
  datasets: ["Datasets", "数据集"],
  baselines: ["Baselines", "基线"],
  metrics: ["Metrics", "指标"],
  finalPerformanceSummary: ["Final performance summary", "最终表现总结"],
  tableCoverage: ["Table coverage", "表格覆盖范围"],
};
62
+
5
63
  function contextFile(targetDir, name) {
6
64
  return path.join(targetDir, ".lab", "context", name);
7
65
  }
@@ -58,6 +116,278 @@ function joinNonEmpty(parts, separator = "; ") {
58
116
  return parts.filter(Boolean).join(separator);
59
117
  }
60
118
 
119
/**
 * Tell whether a field value carries real content rather than a placeholder.
 *
 * The value is trimmed and lower-cased before being looked up in
 * PLACEHOLDER_VALUES; falsy values are treated as the empty string.
 *
 * @param {string} [value] - Raw field value.
 * @returns {boolean} `false` for placeholders, `true` otherwise.
 */
function isMeaningful(value) {
  const normalized = (value || "").trim().toLowerCase();
  const isPlaceholder = PLACEHOLDER_VALUES.has(normalized);
  return !isPlaceholder;
}
122
+
123
/**
 * Load `.lab/config/workflow.json` for a project on a best-effort basis.
 *
 * A missing, unreadable or malformed file yields an empty object. So does
 * valid JSON that is not a plain object (`null`, a scalar, an array), so
 * callers can read properties off the result without guarding against `null`.
 *
 * @param {string} targetDir - Project root directory.
 * @returns {object} The parsed configuration object, or `{}`.
 */
function readWorkflowConfig(targetDir) {
  const configPath = path.join(targetDir, ".lab", "config", "workflow.json");
  let parsed;
  try {
    // A missing file throws here too, so no separate existence check is needed.
    parsed = JSON.parse(fs.readFileSync(configPath, "utf8"));
  } catch {
    return {};
  }
  const isPlainObject = parsed !== null && typeof parsed === "object" && !Array.isArray(parsed);
  return isPlainObject ? parsed : {};
}
134
+
135
/**
 * Resolve a configured path against the project root.
 *
 * @param {string} targetDir - Project root directory.
 * @param {*} configuredPath - Path taken from configuration.
 * @returns {string} The resolved path, or "" when `configuredPath` is not a
 *   non-empty string.
 */
function resolveProjectPath(targetDir, configuredPath) {
  const usable = typeof configuredPath === "string" && configuredPath.length > 0;
  return usable ? path.resolve(targetDir, configuredPath) : "";
}
141
+
142
/**
 * Compute where the collaborator-facing deliverables live for a project.
 *
 * The root comes from `deliverables_root` in the workflow config and falls
 * back to "docs/research" when that setting is falsy.
 *
 * @param {string} targetDir - Project root directory.
 * @returns {{deliverablesRoot: string, reportPath: string, mainTablesPath: string}}
 */
function getCollaboratorDeliverablePaths(targetDir) {
  const { deliverables_root: configuredRoot } = readWorkflowConfig(targetDir);
  const deliverablesRoot = resolveProjectPath(targetDir, configuredRoot || "docs/research");
  const reportPath = path.join(deliverablesRoot, "report.md");
  const mainTablesPath = path.join(deliverablesRoot, "main-tables.md");
  return { deliverablesRoot, reportPath, mainTablesPath };
}
151
+
152
/**
 * Report whether the project already has a collaborator-facing report or
 * main-tables file on disk.
 *
 * @param {string} targetDir - Project root directory.
 * @returns {boolean} `true` when either deliverable exists.
 */
function hasCollaboratorFacingDeliverables(targetDir) {
  const paths = getCollaboratorDeliverablePaths(targetDir);
  if (fs.existsSync(paths.reportPath)) {
    return true;
  }
  return fs.existsSync(paths.mainTablesPath);
}
156
+
157
/**
 * List the names of the fields whose extracted value is still a placeholder.
 *
 * @param {string} text - Document text handed to `extractValue`.
 * @param {Array<{name: string, labels: string[]}>} fields - Field descriptors.
 * @returns {string[]} Names of the fields without meaningful content, in input order.
 */
function missingCollaboratorFields(text, fields) {
  const missingNames = [];
  for (const field of fields) {
    const value = extractValue(text, field.labels);
    if (!isMeaningful(value)) {
      missingNames.push(field.name);
    }
  }
  return missingNames;
}
160
+
161
/**
 * Check mission.md for fields that collaborator-facing reporting needs.
 *
 * Nothing is reported when no collaborator-facing deliverable exists or when
 * mission.md yields no content.
 *
 * @param {string} targetDir - Project root directory.
 * @returns {string[]} Zero or one issue message naming the missing fields.
 */
function collaboratorMissionIssues(targetDir) {
  if (!hasCollaboratorFacingDeliverables(targetDir)) {
    return [];
  }
  const missionText = readFileIfExists(contextFile(targetDir, "mission.md"));
  if (!missionText) {
    return [];
  }
  const missingNames = missingCollaboratorFields(missionText, MISSION_COLLABORATOR_FIELDS);
  if (missingNames.length === 0) {
    return [];
  }
  return [`mission context is still skeletal for collaborator-facing reporting: ${missingNames.join(", ")}`];
}
174
+
175
/**
 * Check eval-protocol.md for fields that collaborator-facing reporting needs.
 *
 * Nothing is reported when no collaborator-facing deliverable exists or when
 * eval-protocol.md yields no content.
 *
 * @param {string} targetDir - Project root directory.
 * @returns {string[]} Zero or one issue message naming the missing fields.
 */
function collaboratorEvalIssues(targetDir) {
  if (!hasCollaboratorFacingDeliverables(targetDir)) {
    return [];
  }
  const protocolText = readFileIfExists(contextFile(targetDir, "eval-protocol.md"));
  if (!protocolText) {
    return [];
  }
  const missingNames = missingCollaboratorFields(protocolText, EVAL_COLLABORATOR_FIELDS);
  if (missingNames.length === 0) {
    return [];
  }
  return [`evaluation protocol is still skeletal for collaborator-facing reporting: ${missingNames.join(", ")}`];
}
188
+
189
/**
 * Extract a report field by its REPORT_FIELDS key.
 *
 * @param {string} reportText - Text of the collaborator-facing report.
 * @param {string} key - Key into REPORT_FIELDS; unknown keys use an empty label list.
 * @returns {*} Whatever `extractValue` returns for the resolved labels.
 */
function extractReportValue(reportText, key) {
  const labels = REPORT_FIELDS[key] || [];
  return extractValue(reportText, labels);
}
192
+
193
/**
 * Pick the first meaningful value, giving priority to the existing one.
 *
 * @param {string} existingValue - Value already present in the canonical context.
 * @param {...string} candidates - Fallback values, in priority order.
 * @returns {string} The first value accepted by `isMeaningful`, or "" when none is.
 */
function mergePreferred(existingValue, ...candidates) {
  const ordered = [existingValue, ...candidates];
  const winner = ordered.find((entry) => isMeaningful(entry));
  return winner === undefined ? "" : winner;
}
204
+
205
/**
 * Return the first non-blank entry of a `;`- or `,`-separated metric list.
 *
 * @param {string} [metrics] - Separated list; falsy input is treated as "".
 * @returns {string} The first trimmed, non-empty entry, or "" when there is none.
 */
function firstMetric(metrics) {
  const pieces = (metrics || "").split(/[;,]/);
  for (const piece of pieces) {
    const trimmed = piece.trim();
    if (trimmed) {
      return trimmed;
    }
  }
  return "";
}
211
+
212
/**
 * Render the full text of mission.md from a flat field map.
 *
 * @param {string} lang - "zh" selects the Chinese template; any other value the English one.
 * @param {object} fields - Field values; falsy entries are rendered as the
 *   language's placeholder ("待补充" for Chinese, "TBD" for English).
 * @returns {string} Markdown document ending with a newline.
 */
function buildMissionContextText(lang, fields) {
  const useChinese = lang === "zh";
  const placeholder = useChinese ? "待补充" : "TBD";
  const fill = (value) => value || placeholder;

  if (!useChinese) {
    return `# Research Mission

## Core Problem

- One-sentence problem: ${fill(fields.problem)}
- Why it matters: ${fill(fields.whyItMatters)}
- Target failure case: ${fill(fields.targetFailureCase)}

## Success Criteria

- Primary metric: ${fill(fields.primaryMetric)}
- Success threshold: ${fill(fields.successThreshold)}
- Required baseline comparison: ${fill(fields.requiredBaselineComparison)}
- Minimum evidence requirement: ${fill(fields.minimumEvidenceRequirement)}

## Frozen Boundaries

- Dataset or benchmark scope: ${fill(fields.datasetScope)}
- Split policy: ${fill(fields.splitPolicy)}
- Evaluation protocol: ${fill(fields.evaluationProtocol)}
- Hard constraints: ${fill(fields.hardConstraints)}

## Current Status

- Approved direction: ${fill(fields.approvedDirection)}
- Current owner or session: ${fill(fields.currentOwner)}
- Latest stage to update this mission: ${fill(fields.latestStage)}
- Hydration provenance: ${fill(fields.hydrationProvenance)}
- Collaborator-ready status: ${fill(fields.collaboratorReadyStatus)}
`;
  }

  return `# 研究主线

## 核心问题

- 一句话问题: ${fill(fields.problem)}
- 为什么重要: ${fill(fields.whyItMatters)}
- 目标失败场景: ${fill(fields.targetFailureCase)}

## 成功标准

- 主指标: ${fill(fields.primaryMetric)}
- 成功阈值: ${fill(fields.successThreshold)}
- 必须对比的 baseline: ${fill(fields.requiredBaselineComparison)}
- 最小证据要求: ${fill(fields.minimumEvidenceRequirement)}

## 冻结边界

- 数据集或 benchmark 范围: ${fill(fields.datasetScope)}
- 切分策略: ${fill(fields.splitPolicy)}
- 评估协议: ${fill(fields.evaluationProtocol)}
- 硬约束: ${fill(fields.hardConstraints)}

## 当前状态

- 已批准方向: ${fill(fields.approvedDirection)}
- 当前 owner 或会话: ${fill(fields.currentOwner)}
- 最近一次允许更新 mission 的阶段: ${fill(fields.latestStage)}
- 回填来源: ${fill(fields.hydrationProvenance)}
- 协作者可读状态: ${fill(fields.collaboratorReadyStatus)}
`;
}
277
+
278
/**
 * Render the full text of eval-protocol.md from a flat field map plus rungs.
 *
 * @param {string} lang - "zh" selects the Chinese template; any other value the English one.
 * @param {object} fields - Field values; falsy entries are rendered as the
 *   language's placeholder ("待补充" for Chinese, "TBD" for English).
 * @param {Array<object>} [rungs] - Experiment rungs. Each is rendered as a
 *   "### Rung: <id>" block; a missing list is treated as empty.
 * @returns {string} Markdown document; rung blocks are separated by a blank
 *   line and the document ends with a newline.
 */
function buildEvalProtocolText(lang, fields, rungs) {
  const useChinese = lang === "zh";
  const placeholder = useChinese ? "待补充" : "TBD";
  const fill = (value) => value || placeholder;

  // [rung property, Chinese label, English label] in rendering order.
  const rungRows = [
    ["stage", "阶段", "Stage"],
    ["goal", "目标", "Goal"],
    ["command", "命令", "Command"],
    ["watch", "监视目标", "Watch"],
    ["gate", "gate 命令", "Gate"],
    ["onPass", "通过后", "On pass"],
    ["onFail", "失败后", "On fail"],
    ["onStop", "停止后", "On stop"],
  ];
  const renderRung = (rung) => {
    const bullets = rungRows.map(
      ([key, chineseLabel, englishLabel]) => `- ${useChinese ? chineseLabel : englishLabel}: ${fill(rung[key])}`
    );
    return `### Rung: ${rung.id}\n\n${bullets.join("\n")}`;
  };
  const rungBlocks = (rungs || []).map(renderRung).join("\n\n");

  if (!useChinese) {
    return `# Evaluation Protocol

Use this file to define the paper-facing evaluation target, table plan, gates, and benchmark ladder shared by \`/lab:run\`, \`/lab:iterate\`, \`/lab:auto\`, and \`/lab:report\`.

## Primary Evaluation Objective

- Primary evaluation objective: ${fill(fields.primaryEvaluationObjective)}
- Primary metrics: ${fill(fields.primaryMetrics)}
- Secondary metrics: ${fill(fields.secondaryMetrics)}
- Required terminal evidence: ${fill(fields.requiredTerminalEvidence)}

## Table Plan

- Table plan: ${fill(fields.tablePlan)}
- Required claims per table: ${fill(fields.requiredClaimsPerTable)}

## Metric Glossary

- Metric glossary: ${fill(fields.metricGlossary)}
- Hydration provenance: ${fill(fields.hydrationProvenance)}
- Background sources: ${fill(fields.backgroundSources)}
- Method and baseline source papers: ${fill(fields.methodAndBaselineSourcePapers)}
- Method and baseline implementation source: ${fill(fields.methodAndBaselineImplementationSource)}
- Metric source papers: ${fill(fields.metricSourcePapers)}
- Metric implementation source: ${fill(fields.metricImplementationSource)}
- Comparison source papers: ${fill(fields.comparisonSourcePapers)}
- Comparison implementation source: ${fill(fields.comparisonImplementationSource)}
- Deviation from original implementation: ${fill(fields.deviationFromOriginalImplementation)}

## Gate Ladder

- Experiment ladder: ${fill(fields.experimentLadder)}
- Benchmark ladder: ${fill(fields.benchmarkLadder)}
- Comparison gate: ${fill(fields.comparisonGate)}
- Promotion gate: ${fill(fields.promotionGate)}
- Minimum sample sizes: ${fill(fields.minimumSampleSizes)}
- Required output artifacts: ${fill(fields.requiredOutputArtifacts)}

${rungBlocks}
`;
  }

  return `# 评估协议

用这份文件定义 \`/lab:run\`、\`/lab:iterate\`、\`/lab:auto\` 和 \`/lab:report\` 共用的论文导向评估目标、主表计划、gate 与 benchmark ladder。

## 主评估目标

- 主评估目标: ${fill(fields.primaryEvaluationObjective)}
- 主指标: ${fill(fields.primaryMetrics)}
- 次级指标: ${fill(fields.secondaryMetrics)}
- 必要终局证据: ${fill(fields.requiredTerminalEvidence)}

## 主表计划

- 主表计划: ${fill(fields.tablePlan)}
- 每张表必须支撑的 claims: ${fill(fields.requiredClaimsPerTable)}

## 指标释义

- 指标释义: ${fill(fields.metricGlossary)}
- 回填来源: ${fill(fields.hydrationProvenance)}
- 背景来源: ${fill(fields.backgroundSources)}
- 方法与基线来源论文: ${fill(fields.methodAndBaselineSourcePapers)}
- 方法与基线实现来源: ${fill(fields.methodAndBaselineImplementationSource)}
- 指标来源论文: ${fill(fields.metricSourcePapers)}
- 指标实现来源: ${fill(fields.metricImplementationSource)}
- 对比方法来源论文: ${fill(fields.comparisonSourcePapers)}
- 对比方法实现来源: ${fill(fields.comparisonImplementationSource)}
- 与原始实现的偏差: ${fill(fields.deviationFromOriginalImplementation)}

## Gate Ladder

- 实验阶梯: ${fill(fields.experimentLadder)}
- benchmark 阶梯: ${fill(fields.benchmarkLadder)}
- 对比方法 gate: ${fill(fields.comparisonGate)}
- 升格 gate: ${fill(fields.promotionGate)}
- 最小样本量: ${fill(fields.minimumSampleSizes)}
- 必要输出工件: ${fill(fields.requiredOutputArtifacts)}

${rungBlocks}
`;
}
390
+
61
391
  function extractClaim(text) {
62
392
  const blocks = text
63
393
  .split(/\n(?=\d+\.\s)/)
@@ -76,6 +406,264 @@ function labelValue(text, englishLabels, chineseLabels = []) {
76
406
  return extractValue(text, [...englishLabels, ...chineseLabels]);
77
407
  }
78
408
 
409
/**
 * List the artifacts that hydration can draw on, as project-relative paths.
 *
 * The report and main-tables files are included when they exist on disk; the
 * three context files are included when `readFileIfExists` yields content.
 *
 * @param {string} targetDir - Project root directory.
 * @returns {string[]} Paths of the available sources, in a fixed order.
 */
function collectHydrationSources(targetDir) {
  const { reportPath, mainTablesPath } = getCollaboratorDeliverablePaths(targetDir);
  const sources = [];

  for (const deliverable of [reportPath, mainTablesPath]) {
    if (fs.existsSync(deliverable)) {
      sources.push(path.relative(targetDir, deliverable));
    }
  }
  for (const name of ["data-decisions.md", "state.md", "evidence-index.md"]) {
    if (readFileIfExists(contextFile(targetDir, name))) {
      sources.push(`.lab/context/${name}`);
    }
  }

  // Drop empty entries, e.g. an empty relative path.
  return sources.filter(Boolean);
}
419
+
420
/**
 * Backfill `.lab/context/mission.md` from the other project artifacts.
 *
 * Runs only when a collaborator-facing deliverable exists. For each field, an
 * existing meaningful value in mission.md wins. Otherwise the first meaningful
 * fallback is used, drawn from the report, the evaluation protocol, state.md,
 * data-decisions.md, evidence-index.md or auto-outcome.md. The file is
 * rewritten only when the rendered text differs from what was read.
 *
 * @param {string} targetDir - Project root directory.
 * @returns {boolean} `true` when mission.md was rewritten, `false` otherwise.
 */
function hydrateMissionContext(targetDir) {
  if (!hasCollaboratorFacingDeliverables(targetDir)) {
    return false;
  }

  const lang = readWorkflowLanguage(targetDir);
  const missionText = readFileIfExists(contextFile(targetDir, "mission.md"));
  const stateText = readFileIfExists(contextFile(targetDir, "state.md"));
  const evidenceText = readFileIfExists(contextFile(targetDir, "evidence-index.md"));
  const dataDecisions = readFileIfExists(contextFile(targetDir, "data-decisions.md"));
  const autoOutcomeText = readFileIfExists(contextFile(targetDir, "auto-outcome.md"));
  const reportText = readFileIfExists(getCollaboratorDeliverablePaths(targetDir).reportPath);
  const evalProtocol = parseEvalProtocol(targetDir);
  const hydrationSources = collectHydrationSources(targetDir);

  const fields = {
    problem: mergePreferred(
      extractValue(missionText, ["One-sentence problem", "一句话问题"]),
      extractReportValue(reportText, "problem"),
      extractValue(stateText, ["Current objective", "当前目标"])
    ),
    whyItMatters: mergePreferred(
      extractValue(missionText, ["Why it matters", "为什么重要"]),
      extractReportValue(reportText, "whyItMatters")
    ),
    targetFailureCase: extractValue(missionText, ["Target failure case", "目标失败场景"]),
    // No first-metric fallback: whenever the full protocol value is not
    // meaningful, its first entry is not meaningful either.
    primaryMetric: mergePreferred(
      extractValue(missionText, ["Primary metric", "主指标"]),
      evalProtocol.primaryMetrics,
      extractReportValue(reportText, "primaryMetrics")
    ),
    successThreshold: mergePreferred(
      extractValue(missionText, ["Success threshold", "成功阈值"]),
      extractValue(autoOutcomeText, ["Terminal goal target", "终止目标目标值"])
    ),
    requiredBaselineComparison: extractValue(
      missionText,
      ["Required baseline comparison", "必须对比的 baseline"]
    ),
    minimumEvidenceRequirement: extractValue(
      missionText,
      ["Minimum evidence requirement", "最小证据要求"]
    ),
    datasetScope: mergePreferred(
      extractValue(missionText, ["Dataset or benchmark scope", "数据集或 benchmark 范围"]),
      extractValue(dataDecisions, ["Approved dataset package", "Approved datasets", "已批准数据集包", "已批准数据集"]),
      extractReportValue(reportText, "datasets")
    ),
    splitPolicy: extractValue(missionText, ["Split policy", "切分策略"]),
    evaluationProtocol: mergePreferred(
      extractValue(missionText, ["Evaluation protocol", "评估协议"]),
      ".lab/context/eval-protocol.md"
    ),
    hardConstraints: extractValue(missionText, ["Hard constraints", "硬约束"]),
    approvedDirection: mergePreferred(
      extractValue(missionText, ["Approved direction", "已批准方向"]),
      extractClaim(evidenceText)
    ),
    currentOwner: extractValue(missionText, ["Current owner or session", "当前 owner 或会话"]),
    latestStage: mergePreferred(
      extractValue(missionText, ["Latest stage to update this mission", "最近一次允许更新 mission 的阶段"]),
      extractValue(stateText, ["Active stage", "当前阶段", "Stage"])
    ),
  };

  // An empty source list joins to "", which renders as the placeholder.
  fields.hydrationProvenance = hydrationSources.join("; ");

  // Render once without the status to see whether any collaborator field is
  // still a placeholder, then record the outcome as the status itself.
  const draftText = buildMissionContextText(lang, fields);
  const stillMissing = missingCollaboratorFields(draftText, MISSION_COLLABORATOR_FIELDS);
  fields.collaboratorReadyStatus = stillMissing.length === 0 ? "hydrated" : "artifact-anchored interim";

  const nextText = buildMissionContextText(lang, fields);
  if (nextText === missionText) {
    return false;
  }
  writeContextFile(targetDir, "mission.md", nextText);
  return true;
}
499
+
500
/**
 * Backfill `.lab/context/eval-protocol.md` from the other project artifacts.
 *
 * Runs only when a collaborator-facing deliverable exists. Values already in
 * the parsed protocol win; otherwise the first meaningful fallback from the
 * report, mission.md, data-decisions.md or the deliverable paths is used. The
 * file is rewritten only when the rendered text differs from `protocol.text`.
 *
 * @param {string} targetDir - Project root directory.
 * @returns {boolean} `true` when eval-protocol.md was rewritten, `false` otherwise.
 */
function hydrateEvalProtocol(targetDir) {
  if (!hasCollaboratorFacingDeliverables(targetDir)) {
    return false;
  }

  const lang = readWorkflowLanguage(targetDir);
  const protocol = parseEvalProtocol(targetDir);
  const missionText = readFileIfExists(contextFile(targetDir, "mission.md"));
  const { reportPath, mainTablesPath } = getCollaboratorDeliverablePaths(targetDir);
  const reportText = readFileIfExists(reportPath);
  const dataDecisions = readFileIfExists(contextFile(targetDir, "data-decisions.md"));
  const hydrationSources = collectHydrationSources(targetDir);

  const fromReport = (key) => extractReportValue(reportText, key);
  const relativeIfPresent = (filePath) => (fs.existsSync(filePath) ? path.relative(targetDir, filePath) : "");

  // Deliverables that exist plus the evidence index, as a comma-separated list.
  const requiredOutputs = [
    relativeIfPresent(reportPath),
    relativeIfPresent(mainTablesPath),
    ".lab/context/evidence-index.md",
  ]
    .filter(Boolean)
    .join(", ");

  const tablePlanFallback = fs.existsSync(mainTablesPath)
    ? `See ${path.relative(targetDir, mainTablesPath)}`
    : "";
  const glossaryFallback = joinNonEmpty(
    [
      fromReport("metricGuidePrimary"),
      fromReport("metricGuideSecondary"),
      fromReport("metricGuideSupport"),
    ],
    " | "
  );
  const datasetPapers = extractValue(dataDecisions, [
    "Papers that used the approved datasets",
    "使用过已批准数据集的论文",
    "使用过该数据集的论文",
  ]);

  const fields = {
    primaryEvaluationObjective: mergePreferred(
      protocol.primaryEvaluationObjective,
      extractValue(missionText, ["One-sentence problem", "一句话问题"]),
      fromReport("problem")
    ),
    primaryMetrics: mergePreferred(
      protocol.primaryMetrics,
      extractValue(missionText, ["Primary metric", "主指标"]),
      fromReport("primaryMetrics")
    ),
    secondaryMetrics: mergePreferred(protocol.secondaryMetrics, fromReport("secondaryMetrics")),
    requiredTerminalEvidence: mergePreferred(
      protocol.requiredTerminalEvidence,
      fromReport("requiredTerminalEvidence"),
      requiredOutputs
    ),
    tablePlan: mergePreferred(protocol.tablePlan, tablePlanFallback),
    requiredClaimsPerTable: protocol.requiredClaimsPerTable,
    metricGlossary: mergePreferred(protocol.metricGlossary, glossaryFallback),
    backgroundSources: mergePreferred(
      protocol.backgroundSources,
      fromReport("backgroundSources"),
      datasetPapers
    ),
    methodAndBaselineSourcePapers: mergePreferred(
      protocol.methodAndBaselineSourcePapers,
      fromReport("baselineSourcePapers")
    ),
    methodAndBaselineImplementationSource: mergePreferred(
      protocol.methodAndBaselineImplementationSource,
      fromReport("baselineImplementationSources")
    ),
    metricSourcePapers: mergePreferred(protocol.metricSourcePapers, fromReport("metricSourcePapers")),
    metricImplementationSource: mergePreferred(
      protocol.metricImplementationSource,
      fromReport("metricImplementationSource")
    ),
    comparisonSourcePapers: mergePreferred(
      protocol.comparisonSourcePapers,
      fromReport("baselineSourcePapers")
    ),
    comparisonImplementationSource: mergePreferred(
      protocol.comparisonImplementationSource,
      fromReport("baselineImplementationSources")
    ),
    deviationFromOriginalImplementation: mergePreferred(
      protocol.deviationFromOriginalImplementation,
      fromReport("metricDeviation")
    ),
    benchmarkLadder: protocol.benchmarkLadder,
    experimentLadder: protocol.experimentLadder,
    comparisonGate: protocol.comparisonGate,
    promotionGate: protocol.promotionGate,
    minimumSampleSizes: protocol.minimumSampleSizes,
    requiredOutputArtifacts: mergePreferred(protocol.requiredOutputArtifacts, requiredOutputs),
    hydrationProvenance: hydrationSources.length > 0 ? hydrationSources.join("; ") : "",
  };

  const nextText = buildEvalProtocolText(lang, fields, protocol.experimentRungs);
  if (nextText === protocol.text) {
    return false;
  }
  writeContextFile(targetDir, "eval-protocol.md", nextText);
  return true;
}
605
+
606
/**
 * Summarize whether the canonical context is complete enough for a
 * collaborator-facing report.
 *
 * @param {string} targetDir - Project root directory.
 * @returns {{mode: string, readiness: string, reason: string, issues: string[]}}
 *   "collaborator-ready" when neither the mission nor the evaluation check
 *   reports an issue; otherwise "artifact-anchored interim", with the issues
 *   joined by " | " as the reason.
 */
function getCollaboratorReportStatus(targetDir) {
  const issues = [
    ...collaboratorMissionIssues(targetDir),
    ...collaboratorEvalIssues(targetDir),
  ];

  if (issues.length === 0) {
    return {
      mode: "collaborator-ready",
      readiness: "ready",
      reason: "canonical mission and evaluation context are complete enough for collaborator-facing reporting",
      issues: [],
    };
  }

  return {
    mode: "artifact-anchored interim",
    readiness: "hydrated but incomplete",
    reason: issues.join(" | "),
    issues,
  };
}
625
+
626
/**
 * Insert or replace a markdown section identified by its heading line.
 *
 * An existing section runs from its heading up to the next line starting with
 * "## " or to the end of the text. If no such section exists, the new one is
 * appended after a blank line.
 *
 * Fixes over the previous version:
 * - End of input is matched with `(?![\s\S])`. JavaScript has no `\Z` anchor
 *   (it matches a literal "Z"), so a section at the end of the document never
 *   matched and a duplicate was appended on every call.
 * - A replaced section keeps a blank line before the following heading
 *   instead of gluing that heading onto the last body line.
 * - The replacement goes through a replacer function, so `$&`, `$1`, ... in
 *   the body are inserted literally.
 *
 * @param {string} text - Full markdown document.
 * @param {string} heading - Exact heading line, e.g. "## Report Status".
 * @param {string[]} bodyLines - Lines that make up the section body.
 * @returns {string} The updated document.
 */
function upsertSection(text, heading, bodyLines) {
  const sectionText = `${heading}\n\n${bodyLines.join("\n")}`.trimEnd();
  const escapedHeading = heading.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const pattern = new RegExp(`^${escapedHeading}\\s*$[\\s\\S]*?(?=^##\\s|(?![\\s\\S]))`, "m");

  if (!pattern.test(text)) {
    return `${text.trimEnd()}\n\n${sectionText}\n`;
  }

  return text.replace(pattern, (matched, offset) => {
    // The match swallows the whitespace after the section, so restore the
    // separator: a blank line before a following heading, one newline at the end.
    const hasFollowingContent = offset + matched.length < text.length;
    return hasFollowingContent ? `${sectionText}\n\n` : `${sectionText}\n`;
  });
}
634
+
635
/**
 * Write the current report status into the collaborator-facing report.
 *
 * The status goes into a "Report Status" section: level 2 when the report
 * text begins with a level-1 heading, level 1 otherwise. The file is written
 * only when the upserted text differs from what was read.
 *
 * @param {string} targetDir - Project root directory.
 * @param {{mode: string, readiness: string, reason: string}} status - Status to record.
 * @returns {boolean} `true` when the report file was rewritten; `false` when
 *   the report is missing or already up to date.
 */
function syncCollaboratorReportStatus(targetDir, status) {
  const { reportPath } = getCollaboratorDeliverablePaths(targetDir);
  if (!fs.existsSync(reportPath)) {
    return false;
  }

  const currentText = fs.readFileSync(reportPath, "utf8");
  const beginsWithTitle = /^#\s/.test(currentText);
  const heading = beginsWithTitle ? "## Report Status" : "# Report Status";
  const statusLines = [
    `- Report mode: ${status.mode}`,
    `- Canonical context readiness: ${status.readiness}`,
    `- Why the current mode is appropriate: ${status.reason}`,
  ];

  const updatedText = upsertSection(currentText, heading, statusLines);
  if (updatedText === currentText) {
    return false;
  }
  fs.writeFileSync(reportPath, updatedText.trimEnd() + "\n");
  return true;
}
653
+
654
/**
 * Run the full hydration pass: mission, then evaluation protocol, then the
 * status section of the collaborator-facing report.
 *
 * The status is computed after both hydration steps so it reflects the
 * freshly written context files.
 *
 * @param {string} targetDir - Project root directory.
 * @returns {{missionChanged: boolean, evalChanged: boolean, reportChanged: boolean, reportStatus: object}}
 *   Which files were rewritten, plus the computed report status.
 */
function hydrateCanonicalContext(targetDir) {
  const result = {
    missionChanged: hydrateMissionContext(targetDir),
    evalChanged: hydrateEvalProtocol(targetDir),
    reportChanged: false,
    reportStatus: null,
  };
  result.reportStatus = getCollaboratorReportStatus(targetDir);
  result.reportChanged = syncCollaboratorReportStatus(targetDir, result.reportStatus);
  return result;
}
666
+
79
667
  function renderSummary(lang, data) {
80
668
  if (lang === "zh") {
81
669
  return `# 研究摘要
@@ -95,12 +683,18 @@ function renderSummary(lang, data) {
95
683
  - Auto stop reason: ${data.autoStopReason || "待补充"}
96
684
  - Auto final artifact: ${data.autoFinalArtifact || "待补充"}
97
685
  - Auto final rung: ${data.autoFinalRung || "待补充"}
686
+ - Collaborator report mode: ${data.reportMode || "待补充"}
687
+ - Canonical context readiness: ${data.reportReadiness || "待补充"}
688
+ - Why this report mode is active: ${data.reportModeReason || "待补充"}
98
689
  - Eval objective: ${data.evalObjective || "待补充"}
99
690
  - Primary metrics: ${data.evalPrimaryMetrics || "待补充"}
100
691
  - Secondary metrics: ${data.evalSecondaryMetrics || "待补充"}
101
692
  - Required terminal evidence: ${data.evalRequiredTerminalEvidence || "待补充"}
102
693
  - Table plan: ${data.evalTablePlan || "待补充"}
103
694
  - Metric glossary: ${data.evalMetricGlossary || "待补充"}
695
+ - Background sources: ${data.evalBackgroundSources || "待补充"}
696
+ - Method and baseline source papers: ${data.evalMethodAndBaselineSourcePapers || "待补充"}
697
+ - Method and baseline implementation source: ${data.evalMethodAndBaselineImplementationSource || "待补充"}
104
698
  - Metric source papers: ${data.evalMetricSourcePapers || "待补充"}
105
699
  - Metric implementation source: ${data.evalMetricImplementationSource || "待补充"}
106
700
  - Comparison source papers: ${data.evalComparisonSourcePapers || "待补充"}
@@ -150,12 +744,18 @@ function renderSummary(lang, data) {
150
744
  - Auto stop reason: ${data.autoStopReason || "TBD"}
151
745
  - Auto final artifact: ${data.autoFinalArtifact || "TBD"}
152
746
  - Auto final rung: ${data.autoFinalRung || "TBD"}
747
+ - Collaborator report mode: ${data.reportMode || "TBD"}
748
+ - Canonical context readiness: ${data.reportReadiness || "TBD"}
749
+ - Why this report mode is active: ${data.reportModeReason || "TBD"}
153
750
  - Eval objective: ${data.evalObjective || "TBD"}
154
751
  - Primary metrics: ${data.evalPrimaryMetrics || "TBD"}
155
752
  - Secondary metrics: ${data.evalSecondaryMetrics || "TBD"}
156
753
  - Required terminal evidence: ${data.evalRequiredTerminalEvidence || "TBD"}
157
754
  - Table plan: ${data.evalTablePlan || "TBD"}
158
755
  - Metric glossary: ${data.evalMetricGlossary || "TBD"}
756
+ - Background sources: ${data.evalBackgroundSources || "TBD"}
757
+ - Method and baseline source papers: ${data.evalMethodAndBaselineSourcePapers || "TBD"}
758
+ - Method and baseline implementation source: ${data.evalMethodAndBaselineImplementationSource || "TBD"}
159
759
  - Metric source papers: ${data.evalMetricSourcePapers || "TBD"}
160
760
  - Metric implementation source: ${data.evalMetricImplementationSource || "TBD"}
161
761
  - Comparison source papers: ${data.evalComparisonSourcePapers || "TBD"}
@@ -260,12 +860,18 @@ ${data.problem || "待补充"}
260
860
  - Auto stop reason: ${data.autoStopReason || "待补充"}
261
861
  - Auto final artifact: ${data.autoFinalArtifact || "待补充"}
262
862
  - Auto final rung: ${data.autoFinalRung || "待补充"}
863
+ - Collaborator report mode: ${data.reportMode || "待补充"}
864
+ - Canonical context readiness: ${data.reportReadiness || "待补充"}
865
+ - Why this report mode is active: ${data.reportModeReason || "待补充"}
263
866
  - Eval objective: ${data.evalObjective || "待补充"}
264
867
  - Primary metrics: ${data.evalPrimaryMetrics || "待补充"}
265
868
  - Secondary metrics: ${data.evalSecondaryMetrics || "待补充"}
266
869
  - Required terminal evidence: ${data.evalRequiredTerminalEvidence || "待补充"}
267
870
  - Table plan: ${data.evalTablePlan || "待补充"}
268
871
  - Metric glossary: ${data.evalMetricGlossary || "待补充"}
872
+ - Background sources: ${data.evalBackgroundSources || "待补充"}
873
+ - Method and baseline source papers: ${data.evalMethodAndBaselineSourcePapers || "待补充"}
874
+ - Method and baseline implementation source: ${data.evalMethodAndBaselineImplementationSource || "待补充"}
269
875
  - Metric source papers: ${data.evalMetricSourcePapers || "待补充"}
270
876
  - Metric implementation source: ${data.evalMetricImplementationSource || "待补充"}
271
877
  - Comparison source papers: ${data.evalComparisonSourcePapers || "待补充"}
@@ -326,12 +932,18 @@ ${data.problem || "TBD"}
326
932
  - Auto stop reason: ${data.autoStopReason || "TBD"}
327
933
  - Auto final artifact: ${data.autoFinalArtifact || "TBD"}
328
934
  - Auto final rung: ${data.autoFinalRung || "TBD"}
935
+ - Collaborator report mode: ${data.reportMode || "TBD"}
936
+ - Canonical context readiness: ${data.reportReadiness || "TBD"}
937
+ - Why this report mode is active: ${data.reportModeReason || "TBD"}
329
938
  - Eval objective: ${data.evalObjective || "TBD"}
330
939
  - Primary metrics: ${data.evalPrimaryMetrics || "TBD"}
331
940
  - Secondary metrics: ${data.evalSecondaryMetrics || "TBD"}
332
941
  - Required terminal evidence: ${data.evalRequiredTerminalEvidence || "TBD"}
333
942
  - Table plan: ${data.evalTablePlan || "TBD"}
334
943
  - Metric glossary: ${data.evalMetricGlossary || "TBD"}
944
+ - Background sources: ${data.evalBackgroundSources || "TBD"}
945
+ - Method and baseline source papers: ${data.evalMethodAndBaselineSourcePapers || "TBD"}
946
+ - Method and baseline implementation source: ${data.evalMethodAndBaselineImplementationSource || "TBD"}
335
947
  - Metric source papers: ${data.evalMetricSourcePapers || "TBD"}
336
948
  - Metric implementation source: ${data.evalMetricImplementationSource || "TBD"}
337
949
  - Comparison source papers: ${data.evalComparisonSourcePapers || "TBD"}
@@ -369,6 +981,7 @@ ${data.problem || "TBD"}
369
981
  }
370
982
 
371
983
  function buildContextSnapshot(targetDir) {
984
+ const reportStatus = getCollaboratorReportStatus(targetDir);
372
985
  const mission = readFileIfExists(contextFile(targetDir, "mission.md"));
373
986
  const state = readFileIfExists(contextFile(targetDir, "state.md"));
374
987
  const evidence = readFileIfExists(contextFile(targetDir, "evidence-index.md"));
@@ -580,12 +1193,18 @@ function buildContextSnapshot(targetDir) {
580
1193
  autoStopReason: extractValue(autoOutcome, ["Stop reason", "停止原因"]),
581
1194
  autoFinalArtifact: extractValue(autoOutcome, ["Final artifact", "最终工件"]),
582
1195
  autoFinalRung: extractValue(autoOutcome, ["Final rung", "最终 rung"]),
1196
+ reportMode: reportStatus.mode,
1197
+ reportReadiness: reportStatus.readiness,
1198
+ reportModeReason: reportStatus.reason,
583
1199
  evalObjective: evalProtocol.primaryEvaluationObjective,
584
1200
  evalPrimaryMetrics: evalProtocol.primaryMetrics,
585
1201
  evalSecondaryMetrics: evalProtocol.secondaryMetrics,
586
1202
  evalRequiredTerminalEvidence: evalProtocol.requiredTerminalEvidence,
587
1203
  evalTablePlan: evalProtocol.tablePlan,
588
1204
  evalMetricGlossary: evalProtocol.metricGlossary,
1205
+ evalBackgroundSources: evalProtocol.backgroundSources,
1206
+ evalMethodAndBaselineSourcePapers: evalProtocol.methodAndBaselineSourcePapers,
1207
+ evalMethodAndBaselineImplementationSource: evalProtocol.methodAndBaselineImplementationSource,
589
1208
  evalMetricSourcePapers: evalProtocol.metricSourcePapers,
590
1209
  evalMetricImplementationSource: evalProtocol.metricImplementationSource,
591
1210
  evalComparisonSourcePapers: evalProtocol.comparisonSourcePapers,
@@ -604,6 +1223,7 @@ function writeContextFile(targetDir, name, content) {
604
1223
  }
605
1224
 
606
1225
  function refreshContext({ targetDir }) {
1226
+ hydrateCanonicalContext(targetDir);
607
1227
  const lang = readWorkflowLanguage(targetDir);
608
1228
  const snapshot = buildContextSnapshot(targetDir);
609
1229
  writeContextFile(targetDir, "summary.md", renderSummary(lang, snapshot));
@@ -680,6 +1300,11 @@ function archiveContext({ targetDir, now = new Date() }) {
680
1300
 
681
1301
  module.exports = {
682
1302
  archiveContext,
1303
+ collaboratorEvalIssues,
1304
+ collaboratorMissionIssues,
1305
+ getCollaboratorReportStatus,
1306
+ hasCollaboratorFacingDeliverables,
1307
+ hydrateCanonicalContext,
683
1308
  pruneContext,
684
1309
  refreshContext,
685
1310
  };
@@ -39,6 +39,21 @@ const EVAL_PROTOCOL_FIELDS = [
39
39
  key: "metricGlossary",
40
40
  labels: ["Metric glossary", "指标释义"],
41
41
  },
42
+ {
43
+ name: "Background sources",
44
+ key: "backgroundSources",
45
+ labels: ["Background sources", "背景来源"],
46
+ },
47
+ {
48
+ name: "Method and baseline source papers",
49
+ key: "methodAndBaselineSourcePapers",
50
+ labels: ["Method and baseline source papers", "方法与基线来源论文"],
51
+ },
52
+ {
53
+ name: "Method and baseline implementation source",
54
+ key: "methodAndBaselineImplementationSource",
55
+ labels: ["Method and baseline implementation source", "方法与基线实现来源"],
56
+ },
42
57
  {
43
58
  name: "Metric source papers",
44
59
  key: "metricSourcePapers",
package/lib/i18n.cjs CHANGED
@@ -289,7 +289,10 @@ const ZH_SKILL_FILES = {
289
289
 
290
290
  ## 必要输出
291
291
 
292
+ - 报告状态:collaborator-ready 或 artifact-anchored interim
292
293
  - 给用户看的总结
294
+ - 问题与背景的白话说明
295
+ - 数据集场景说明
293
296
  - 方法概述
294
297
  - 选定指标摘要
295
298
  - 指标白话释义
@@ -301,6 +304,7 @@ const ZH_SKILL_FILES = {
301
304
  - 失败尝试
302
305
  - 局限性
303
306
  - 下一步
307
+ - 单独列出的工件状态,而不是混进已验证结果
304
308
 
305
309
  ## 上下文读取
306
310
 
@@ -312,6 +316,8 @@ const ZH_SKILL_FILES = {
312
316
 
313
317
  ## 上下文写回
314
318
 
319
+ - \`.lab/context/mission.md\`
320
+ - \`.lab/context/eval-protocol.md\`
315
321
  - \`.lab/context/state.md\`
316
322
  - \`.lab/context/evidence-index.md\`
317
323
 
@@ -324,7 +330,11 @@ const ZH_SKILL_FILES = {
324
330
  - 必须把已批准的主指标、次级指标和必要终局证据明确写进 \`report.md\` 与受管的 \`main-tables.md\`。
325
331
  - 必须用白话解释选定的主指标和次级指标:每个指标在衡量什么、越高还是越低更好、它是主结果指标还是健康度/支持性指标。
326
332
  - 如果出现 coverage、completeness、confidence 或类似健康度指标,必须明确说明这类指标回答的是“实验是否跑稳、证据是否完整”,而不是主要科学效应本身。
333
+ - 在起草报告前,先检查 \`.lab/context/mission.md\` 和 \`.lab/context/eval-protocol.md\` 是否仍是模板空壳。
334
+ - 如果 canonical context 还是空壳,要先根据 frozen result artifacts、data-decisions、evidence-index 和已批准上下文回填“最小可信版本”,再写报告。
335
+ - 如果回填后仍缺少协作者可读所需的关键字段,就必须把输出降级成 \`artifact-anchored interim report\`,不能冒充最终协作者报告。
327
336
  - 如果报告依赖了对原始指标或原始实现的偏差,必须明确写出这个偏差。
337
+ - workflow 工件状态、rerun id 或 LaTeX 骨架状态不能混进“已验证主结果”;这些内容必须单列到工件状态部分。
328
338
  - 如果 workflow language 是中文,\`report.md\` 和 \`<deliverables_root>/main-tables.md\` 也应使用中文,除非文件路径、代码标识符或字面指标名必须保持原样。
329
339
  - 解释优先保守,不要写成营销文案。
330
340
  - 要给 \`/lab:write\` 留下清晰 handoff,尤其是 section draft 可以直接引用的证据链接。
@@ -691,6 +701,12 @@ const ZH_SKILL_FILES = {
691
701
  [path.join(".lab", ".managed", "templates", "final-report.md")]:
692
702
  `# 最终报告
693
703
 
704
+ ## 报告状态
705
+
706
+ - 报告模式:collaborator-ready 或 artifact-anchored interim
707
+ - canonical context 完整度:
708
+ - 为什么当前只能用这个模式:
709
+
694
710
  ## 给用户看的总结
695
711
 
696
712
  - 一句话结论:
@@ -698,6 +714,18 @@ const ZH_SKILL_FILES = {
698
714
  - 还没有被证明的内容:
699
715
  - 当前最大报告风险:
700
716
 
717
+ ## 问题与背景
718
+
719
+ - 这项研究在解决什么问题:
720
+ - 为什么这个问题重要:
721
+ - 当前报告到底覆盖了什么 setting 或 workflow:
722
+
723
+ ## 数据集场景说明
724
+
725
+ - 数据集或 benchmark 1 代表什么真实场景:
726
+ - 数据集或 benchmark 2 代表什么真实场景:
727
+ - 数据集或 benchmark 3 代表什么真实场景:
728
+
701
729
  ## 选定指标
702
730
 
703
731
  - 主指标:
@@ -710,6 +738,23 @@ const ZH_SKILL_FILES = {
710
738
  - 次级指标在衡量什么:
711
739
  - 健康度/支持性指标在衡量什么,为什么它们不是主结论:
712
740
 
741
+ ## 背景来源
742
+
743
+ - 最关键的背景论文或 benchmark 参考:
744
+ - 为什么这些来源足以锚定当前问题:
745
+
746
+ ## 方法与基线来源
747
+
748
+ - 我们的方法来源或实现基础:
749
+ - baseline 与 comparison 的来源论文:
750
+ - baseline 与 comparison 的实现来源:
751
+
752
+ ## 指标来源
753
+
754
+ - 指标来源论文:
755
+ - 指标实现来源:
756
+ - 与原始实现的偏差:
757
+
713
758
  ## 怎么看主表
714
759
 
715
760
  - Table 1 负责回答什么:
@@ -723,6 +768,11 @@ const ZH_SKILL_FILES = {
723
768
  - 最终表现摘要:
724
769
  - 主表覆盖情况:
725
770
 
771
+ ## 工件状态
772
+
773
+ - 已就绪的交付物或工作流工件:
774
+ - 这些工件状态为什么不是科学结论:
775
+
726
776
  ## 主要结果
727
777
 
728
778
  - 主要发现 1:
@@ -936,6 +986,7 @@ const ZH_SKILL_FILES = {
936
986
  - 一句话问题:
937
987
  - 为什么重要:
938
988
  - 目标失败场景:
989
+ - 回填来源:
939
990
 
940
991
  ## 成功标准
941
992
 
@@ -956,6 +1007,7 @@ const ZH_SKILL_FILES = {
956
1007
  - 已批准方向:
957
1008
  - 当前 owner 或会话:
958
1009
  - 最近更新该 mission 的 stage:
1010
+ - 协作者可读状态:
959
1011
  `,
960
1012
  [path.join(".lab", "context", "state.md")]:
961
1013
  `# 工作流状态
@@ -1949,6 +2001,10 @@ ZH_CONTENT[path.join(".lab", "context", "eval-protocol.md")] = `# 评估协议
1949
2001
  ## 指标释义
1950
2002
 
1951
2003
  - 指标释义:
2004
+ - 回填来源:
2005
+ - 背景来源:
2006
+ - 方法与基线来源论文:
2007
+ - 方法与基线实现来源:
1952
2008
  - 指标来源论文:
1953
2009
  - 指标实现来源:
1954
2010
  - 对比方法来源论文:
@@ -2001,10 +2057,12 @@ ZH_CONTENT[path.join(".codex", "skills", "lab", "stages", "auto.md")] = `# \`/la
2001
2057
 
2002
2058
  ## 上下文写回
2003
2059
 
2060
+ - \`.lab/context/mission.md\`
2004
2061
  - \`.lab/context/state.md\`
2005
2062
  - \`.lab/context/decisions.md\`
2006
2063
  - \`.lab/context/data-decisions.md\`
2007
2064
  - \`.lab/context/evidence-index.md\`
2065
+ - \`.lab/context/eval-protocol.md\`
2008
2066
  - \`.lab/context/summary.md\`
2009
2067
  - \`.lab/context/session-brief.md\`
2010
2068
  - \`.lab/context/auto-status.md\`
@@ -2033,6 +2091,8 @@ ZH_CONTENT[path.join(".codex", "skills", "lab", "stages", "auto.md")] = `# \`/la
2033
2091
  - \`review\` 更新规范审查上下文
2034
2092
  - \`report\` 写出 \`<deliverables_root>/report.md\`
2035
2093
  - \`write\` 写出 \`<deliverables_root>/paper/\` 下的 LaTeX 产物
2094
+ - 如果即将进入 \`report\`,而 \`.lab/context/mission.md\` 或 \`.lab/context/eval-protocol.md\` 仍是模板空壳,就先根据冻结工件和已批准上下文回填最小可信版本。
2095
+ - 如果回填后仍缺少协作者可读所需的关键字段,就必须强制生成 \`artifact-anchored interim report\`,不能冒充最终协作者报告。
2036
2096
  - promotion 成功后,必须写回 \`data-decisions.md\`、\`decisions.md\`、\`state.md\` 和 \`session-brief.md\`。
2037
2097
  - 如果某个指标或对比 claim 在评估协议里没有带来源的定义,就不能拿它做 stop 或 promotion 判断。
2038
2098
 
@@ -1,5 +1,11 @@
1
1
  # Final Report
2
2
 
3
+ ## Report Status
4
+
5
+ - Report mode: collaborator-ready or artifact-anchored interim
6
+ - Canonical context readiness:
7
+ - Why the current mode is appropriate:
8
+
3
9
  ## Reader Summary
4
10
 
5
11
  - One-sentence conclusion:
@@ -7,6 +13,18 @@
7
13
  - What is still unproven:
8
14
  - Biggest reporting risk:
9
15
 
16
+ ## Problem and Background
17
+
18
+ - Research problem in plain language:
19
+ - Why this problem matters:
20
+ - What setting or workflow this report is actually about:
21
+
22
+ ## Dataset Scene Notes
23
+
24
+ - Dataset or benchmark 1 and what real-world setting it represents:
25
+ - Dataset or benchmark 2 and what real-world setting it represents:
26
+ - Dataset or benchmark 3 and what real-world setting it represents:
27
+
10
28
  ## Selected Metrics
11
29
 
12
30
  - Primary metrics:
@@ -19,6 +37,23 @@
19
37
  - Secondary metric plain-language explanation:
20
38
  - Health or support metrics and why they are not the main claim:
21
39
 
40
+ ## Background Sources
41
+
42
+ - Most important background papers or benchmark references:
43
+ - Why these are the right background anchors:
44
+
45
+ ## Method and Baseline Sources
46
+
47
+ - Our method source or implementation basis:
48
+ - Baseline and comparison source papers:
49
+ - Baseline and comparison implementation sources:
50
+
51
+ ## Metric Sources
52
+
53
+ - Metric source papers:
54
+ - Metric implementation source:
55
+ - Deviation from original implementation:
56
+
22
57
  ## Experiment Setup
23
58
 
24
59
  - Datasets:
@@ -39,6 +74,11 @@
39
74
  - Final performance summary:
40
75
  - Table coverage:
41
76
 
77
+ ## Artifact Status
78
+
79
+ - Deliverables or workflow artifacts that are ready:
80
+ - Artifact status notes that are not scientific findings:
81
+
42
82
  ## Main Results
43
83
 
44
84
  Summarize validated iteration outcomes.
@@ -17,6 +17,10 @@ Use this file to define the paper-facing evaluation objective, table plan, gates
17
17
  ## Metric Glossary
18
18
 
19
19
  - Metric glossary:
20
+ - Hydration provenance:
21
+ - Background sources:
22
+ - Method and baseline source papers:
23
+ - Method and baseline implementation source:
20
24
  - Metric source papers:
21
25
  - Metric implementation source:
22
26
  - Comparison source papers:
@@ -5,6 +5,7 @@
5
5
  - One-sentence problem:
6
6
  - Why it matters:
7
7
  - Target failure case:
8
+ - Hydration provenance:
8
9
 
9
10
  ## Success Criteria
10
11
 
@@ -25,3 +26,4 @@
25
26
  - Approved direction:
26
27
  - Current owner or session:
27
28
  - Last stage that updated this mission:
29
+ - Collaborator-ready status:
@@ -46,6 +46,7 @@ Use this skill when the user invokes `/lab:*` or asks for the structured researc
46
46
  - Keep an explicit approval gate before `/lab:spec`.
47
47
  - Write idea artifacts with the template in `.lab/.managed/templates/idea.md`.
48
48
  - Update `.lab/context/mission.md`, `.lab/context/decisions.md`, and `.lab/context/open-questions.md` after convergence.
49
+ - Do not leave `.lab/context/mission.md` as a template shell once the problem statement and approved direction are known.
49
50
  - Do not implement code in this stage.
50
51
 
51
52
  ### `/lab:data`
@@ -111,7 +112,8 @@ Use this skill when the user invokes `/lab:*` or asks for the structured researc
111
112
  - Normalize the result with `.lab/.managed/scripts/eval_report.py`.
112
113
  - Validate normalized output with `.lab/.managed/scripts/validate_results.py`.
113
114
  - Read `.lab/context/eval-protocol.md` before choosing the smallest run so the first experiment already targets the approved tables, metrics, and gates.
114
- - Update `.lab/context/state.md` and `.lab/context/evidence-index.md` after the run.
115
+ - Update `.lab/context/state.md`, `.lab/context/evidence-index.md`, and `.lab/context/eval-protocol.md` after the run.
116
+ - If the evaluation protocol is still skeletal, initialize the smallest trustworthy source-backed version before treating the run as the protocol anchor.
115
117
 
116
118
  ### `/lab:iterate`
117
119
 
@@ -131,7 +133,8 @@ Use this skill when the user invokes `/lab:*` or asks for the structured researc
131
133
  - Keep metric definitions, baseline behavior, and comparison implementations anchored to the source-backed evaluation protocol before changing thresholds, gates, or ladder transitions.
132
134
  - Switch to diagnostic mode if risk increases for two consecutive rounds.
133
135
  - Write round reports with `.lab/.managed/templates/iteration-report.md`.
134
- - Update `.lab/context/state.md`, `.lab/context/decisions.md`, `.lab/context/evidence-index.md`, and `.lab/context/open-questions.md` each round as needed.
136
+ - Update `.lab/context/state.md`, `.lab/context/decisions.md`, `.lab/context/evidence-index.md`, `.lab/context/open-questions.md`, and `.lab/context/eval-protocol.md` each round as needed.
137
+ - Keep `.lab/context/eval-protocol.md` synchronized with accepted ladder changes, benchmark scope, and source-backed implementation deviations.
135
138
  - Stop at threshold success or iteration cap, and record blockers plus next-best actions when the campaign ends without success.
136
139
 
137
140
  ### `/lab:review`
@@ -153,7 +156,9 @@ Use this skill when the user invokes `/lab:*` or asks for the structured researc
153
156
  - Aggregate them with `.lab/.managed/scripts/summarize_iterations.py`.
154
157
  - Write the final document with `.lab/.managed/templates/final-report.md` and the managed table summary with `.lab/.managed/templates/main-tables.md`.
155
158
  - Keep failed attempts and limitations visible.
156
- - Update `.lab/context/state.md` and `.lab/context/evidence-index.md` with report-level handoff notes.
159
+ - Update `.lab/context/mission.md`, `.lab/context/eval-protocol.md`, `.lab/context/state.md`, and `.lab/context/evidence-index.md` with report-level handoff notes.
160
+ - If canonical context is still skeletal, hydrate the smallest trustworthy version from frozen artifacts before finalizing the report.
161
+ - If collaborator-critical fields remain missing after hydration, downgrade to an `artifact-anchored interim report` instead of presenting a final collaborator-ready report.
157
162
 
158
163
  ### `/lab:write`
159
164
 
@@ -24,6 +24,8 @@
24
24
 
25
25
  ## Context Write Set
26
26
 
27
+ - `.lab/context/mission.md`
28
+ - `.lab/context/eval-protocol.md`
27
29
  - `.lab/context/state.md`
28
30
  - `.lab/context/decisions.md`
29
31
  - `.lab/context/data-decisions.md`
@@ -63,6 +65,8 @@
63
65
  - keep the session alive while the current rung is running
64
66
  - write the current rung, watch target, and next rung to `.lab/context/auto-status.md`
65
67
  - Reuse the existing `/lab:run`, `/lab:iterate`, `/lab:review`, `/lab:report`, and optional `/lab:write` contracts instead of inventing a parallel workflow.
68
+ - If the loop is about to reach `report` while `.lab/context/mission.md` or `.lab/context/eval-protocol.md` is still skeletal, hydrate the smallest trustworthy canonical version from frozen artifacts and approved context before drafting the report.
69
+ - If hydration still leaves collaborator-critical fields blank, force `report` to emit an `artifact-anchored interim report` instead of a collaborator-ready final report.
66
70
  - Enforce stage contracts, not just exit codes:
67
71
  - `run` and `iterate` must change persistent outputs under `results_root`
68
72
  - `review` must update canonical review context
@@ -3,6 +3,7 @@
3
3
  ## Required Output
4
4
 
5
5
  - one-sentence problem statement
6
+ - why the problem matters in plain language
6
7
  - failure case
7
8
  - idea classification
8
9
  - contribution category
@@ -18,6 +19,7 @@
18
19
  - critique before convergence
19
20
  - minimum viable experiment
20
21
  - explicit approval gate before `/lab:spec`
22
+ - canonical mission context updated with the approved problem, importance, failure case, and direction
21
23
 
22
24
  ## Evidence Discipline
23
25
 
@@ -60,3 +62,4 @@
60
62
  - State why the target problem matters before talking about the method.
61
63
  - Compare against existing methods explicitly, not by vague novelty language.
62
64
  - The three meaningful points should each fit in one direct sentence.
65
+ - Do not leave `.lab/context/mission.md` as an empty template after convergence; write the approved problem, why it matters, the current benchmark scope, and the approved direction back into canonical context.
@@ -29,6 +29,7 @@ Declare and keep fixed:
29
29
  - `.lab/context/decisions.md`
30
30
  - `.lab/context/evidence-index.md`
31
31
  - `.lab/context/open-questions.md`
32
+ - `.lab/context/eval-protocol.md`
32
33
 
33
34
  ## Per-Round Output
34
35
 
@@ -62,6 +63,7 @@ If the loop stops without success, record:
62
63
  - Do not accumulate long-lived results under `.lab/changes/<change-id>/runs`.
63
64
  - Do not change metric definitions, baseline semantics, or comparison implementations unless the approved evaluation protocol records both their sources and any deviations.
64
65
  - When you change ladders, sample sizes, or promotion gates, keep the resulting logic anchored to the source-backed evaluation protocol instead of ad-hoc chat reasoning.
66
+ - Keep `.lab/context/eval-protocol.md` synchronized with the active benchmark scope, ladder gates, source-backed metric definitions, and any accepted implementation deviations instead of leaving it as a stale template.
65
67
 
66
68
  ## Interaction Contract
67
69
 
@@ -2,10 +2,16 @@
2
2
 
3
3
  ## Required Output
4
4
 
5
+ - report status: collaborator-ready or artifact-anchored interim
5
6
  - reader summary for the user
7
+ - problem and background in plain language
8
+ - dataset scene notes in plain language
6
9
  - method overview
7
10
  - selected metrics summary
8
11
  - plain-language metric guide
12
+ - background sources
13
+ - method and baseline sources
14
+ - metric sources
9
15
  - experiment setup
10
16
  - validated main results
11
17
  - managed main tables artifact under `<deliverables_root>/main-tables.md`
@@ -14,6 +20,7 @@
14
20
  - failed attempts
15
21
  - limitations
16
22
  - next steps
23
+ - artifact status kept separate from validated findings
17
24
 
18
25
  ## Context Read Set
19
26
 
@@ -27,6 +34,8 @@
27
34
 
28
35
  ## Context Write Set
29
36
 
37
+ - `.lab/context/mission.md`
38
+ - `.lab/context/eval-protocol.md`
30
39
  - `.lab/context/state.md`
31
40
  - `.lab/context/evidence-index.md`
32
41
 
@@ -39,7 +48,13 @@
39
48
  - Carry the approved `Primary metrics`, `Secondary metrics`, and `Required terminal evidence` into both the report and the managed main-tables artifact.
40
49
  - Explain the selected primary and secondary metrics in plain language for the user: what each metric measures, whether higher or lower is better, and whether it is a main result metric or only a health/support metric.
41
50
  - If coverage, completeness, confidence, or similar health metrics appear, explicitly say that they describe experimental reliability rather than the main scientific effect.
51
+ - Pull the core background references, method or baseline references, and metric references out of the approved evaluation protocol instead of hiding them in `.lab/context/*`.
52
+ - Report only the few references a collaborator needs to orient themselves quickly; do not turn `report.md` into a full bibliography dump.
42
53
  - If the report depends on a deviation from an original metric or implementation, state that deviation explicitly instead of smoothing it over.
54
+ - Before drafting the report, inspect `.lab/context/mission.md` and `.lab/context/eval-protocol.md` for skeletal template fields.
55
+ - If either canonical context file is still skeletal, hydrate the smallest trustworthy version from frozen result artifacts, dataset decisions, evidence-index, and prior approved context, and write that back before finalizing the report.
56
+ - If collaborator-critical fields still remain missing after hydration, downgrade the output to an `artifact-anchored interim report` instead of presenting it as a final collaborator-ready report.
57
+ - Do not mix workflow deliverable status, rerun ids, or manuscript skeleton status into validated scientific findings; keep those in a separate artifact-status section.
43
58
  - If `.lab/config/workflow.json` sets the workflow language to Chinese, write `report.md` and `<deliverables_root>/main-tables.md` in Chinese unless a file path, code identifier, or literal metric name must remain unchanged.
44
59
  - Prefer conservative interpretation over marketing language.
45
60
  - Leave a clear handoff path into `/lab:write` with evidence links that section drafts can cite.
@@ -6,6 +6,7 @@
6
6
  - run registry entry
7
7
  - normalized evaluation summary
8
8
  - validation result for the normalized summary
9
+ - canonical evaluation context initialized or refined when the active protocol is still skeletal
9
10
 
10
11
  ## Context Read Set
11
12
 
@@ -19,6 +20,7 @@
19
20
 
20
21
  - `.lab/context/state.md`
21
22
  - `.lab/context/evidence-index.md`
23
+ - `.lab/context/eval-protocol.md`
22
24
 
23
25
  ## Constraints
24
26
 
@@ -26,6 +28,7 @@
26
28
  - Fail fast on data, environment, or metric wiring problems.
27
29
  - Tie the run to the approved evaluation protocol, not just an ad-hoc chat goal.
28
30
  - Do not invent metric definitions, baseline behavior, or comparison implementations from memory; anchor them to the approved evaluation protocol and its recorded sources.
31
+ - If `.lab/context/eval-protocol.md` is still skeletal, write the smallest trustworthy version of the current evaluation objective, metric set, ladder, and source-backed implementation notes before treating the run as the new protocol anchor.
29
32
  - Record the exact launch command and output location.
30
33
  - Write durable run outputs, logs, and checkpoints under `results_root`.
31
34
  - Write figures or plots under `figures_root`.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "superlab",
3
- "version": "0.1.17",
3
+ "version": "0.1.19",
4
4
  "description": "Strict /lab research workflow installer for Codex and Claude",
5
5
  "keywords": [
6
6
  "codex",