superlab 0.1.18 → 0.1.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/bin/superlab.cjs CHANGED
@@ -12,6 +12,8 @@ const {
12
12
  } = require("../lib/install.cjs");
13
13
  const {
14
14
  archiveContext,
15
+ collaboratorEvalIssues,
16
+ collaboratorMissionIssues,
15
17
  pruneContext,
16
18
  refreshContext,
17
19
  } = require("../lib/context.cjs");
@@ -788,6 +790,8 @@ function printDoctor(options) {
788
790
  const templateIssues = validatePaperTemplateRoot(options.targetDir, config);
789
791
  const dataDecisionIssues = validateDataDecisions(options.targetDir);
790
792
  const evalProtocolIssues = validateEvalProtocol(options.targetDir);
793
+ const missionContextIssues = collaboratorMissionIssues(options.targetDir);
794
+ const collaboratorProtocolIssues = collaboratorEvalIssues(options.targetDir);
791
795
  const rootIssues = validateProjectRoots(options.targetDir, config);
792
796
  const autoStatus = getAutoStatus({ targetDir: options.targetDir });
793
797
  const autoIssues = autoStatus.issues;
@@ -806,6 +810,8 @@ function printDoctor(options) {
806
810
  templateIssues.length > 0 ||
807
811
  dataDecisionIssues.length > 0 ||
808
812
  evalProtocolIssues.length > 0 ||
813
+ missionContextIssues.length > 0 ||
814
+ collaboratorProtocolIssues.length > 0 ||
809
815
  rootIssues.length > 0 ||
810
816
  autoIssues.length > 0
811
817
  ) {
@@ -820,6 +826,8 @@ function printDoctor(options) {
820
826
  templateIssues,
821
827
  dataDecisionIssues,
822
828
  evalProtocolIssues,
829
+ missionContextIssues,
830
+ collaboratorProtocolIssues,
823
831
  rootIssues,
824
832
  autoIssues
825
833
  );
@@ -253,7 +253,7 @@ async function startAutoMode({ targetDir, now = new Date() }) {
253
253
  const maxFailures = parseInteger(mode.maxFailures, 0);
254
254
  const maxIterations = parseInteger(mode.maxIterations, 1);
255
255
  const requiredArtifact = resolveRequiredArtifact(targetDir, mode.requiredTerminalArtifact);
256
- const frozenCoreSnapshot = snapshotFrozenCore(targetDir, mode.frozenCore);
256
+ let frozenCoreSnapshot = snapshotFrozenCore(targetDir, mode.frozenCore);
257
257
  const { loopStages, finalStages } = splitAutoStages(mode.allowedStages);
258
258
  const executedStages = [];
259
259
  let failureCount = 0;
@@ -363,13 +363,14 @@ async function startAutoMode({ targetDir, now = new Date() }) {
363
363
  nextRung,
364
364
  decision: rungId ? `completed rung ${rungId}` : `completed stage ${stage}`,
365
365
  });
366
- refreshContext({ targetDir });
367
-
368
366
  const frozenCoreChanges = detectFrozenCoreChanges(frozenCoreSnapshot);
369
367
  if (frozenCoreChanges.length > 0) {
370
368
  failAutoMode(`frozen core changed: ${frozenCoreChanges.join(", ")}`);
371
369
  }
372
370
 
371
+ refreshContext({ targetDir });
372
+ frozenCoreSnapshot = snapshotFrozenCore(targetDir, mode.frozenCore);
373
+
373
374
  const stopCheck = await runCheckCommand({
374
375
  targetDir,
375
376
  label: `stop check after ${rungId || stage}`,
@@ -437,12 +438,13 @@ async function startAutoMode({ targetDir, now = new Date() }) {
437
438
  decision: `promotion policy matched after ${label}`,
438
439
  });
439
440
  promotionApplied = true;
440
- refreshContext({ targetDir });
441
- verifyPromotionWriteback(targetDir, promotionSnapshot);
442
441
  const frozenCoreChangesAfterPromotion = detectFrozenCoreChanges(frozenCoreSnapshot);
443
442
  if (frozenCoreChangesAfterPromotion.length > 0) {
444
443
  failAutoMode(`frozen core changed: ${frozenCoreChangesAfterPromotion.join(", ")}`);
445
444
  }
445
+ refreshContext({ targetDir });
446
+ verifyPromotionWriteback(targetDir, promotionSnapshot);
447
+ frozenCoreSnapshot = snapshotFrozenCore(targetDir, mode.frozenCore);
446
448
  };
447
449
 
448
450
  if (evalProtocol.experimentRungs.length > 0) {
package/lib/context.cjs CHANGED
@@ -2,6 +2,69 @@ const fs = require("node:fs");
2
2
  const path = require("node:path");
3
3
  const { parseEvalProtocol } = require("./eval_protocol.cjs");
4
4
 
5
// Values treated as "not filled in yet". isMeaningful() compares the trimmed,
// lower-cased field value against this set.
const PLACEHOLDER_VALUES = new Set(["", "tbd", "none", "待补充", "无"]);
// Mission fields checked by collaboratorMissionIssues(). `name` is what gets
// reported as missing; `labels` are the English/Chinese labels looked up in mission.md.
const MISSION_COLLABORATOR_FIELDS = [
  { name: "One-sentence problem", labels: ["One-sentence problem", "一句话问题"] },
  { name: "Why it matters", labels: ["Why it matters", "为什么重要"] },
  { name: "Primary metric", labels: ["Primary metric", "主指标"] },
  { name: "Success threshold", labels: ["Success threshold", "成功阈值"] },
  { name: "Dataset or benchmark scope", labels: ["Dataset or benchmark scope", "数据集或 benchmark 范围"] },
  { name: "Approved direction", labels: ["Approved direction", "已批准方向"] },
];
// Evaluation-protocol fields checked by collaboratorEvalIssues(); same shape as above,
// looked up in eval-protocol.md.
const EVAL_COLLABORATOR_FIELDS = [
  { name: "Primary evaluation objective", labels: ["Primary evaluation objective", "主评估目标"] },
  { name: "Primary metrics", labels: ["Primary metrics", "主指标"] },
  { name: "Secondary metrics", labels: ["Secondary metrics", "次级指标"] },
  { name: "Table plan", labels: ["Table plan", "主表计划"] },
  { name: "Metric glossary", labels: ["Metric glossary", "指标释义"] },
  { name: "Background sources", labels: ["Background sources", "背景来源"] },
  {
    name: "Method and baseline source papers",
    labels: ["Method and baseline source papers", "方法与基线来源论文"],
  },
  {
    name: "Method and baseline implementation source",
    labels: ["Method and baseline implementation source", "方法与基线实现来源"],
  },
  { name: "Metric source papers", labels: ["Metric source papers", "指标来源论文"] },
  { name: "Required output artifacts", labels: ["Required output artifacts", "必要输出工件"] },
];
// Label lists used by extractReportValue(): key -> candidate labels (English and
// Chinese) to search for in the collaborator-facing report text.
const REPORT_FIELDS = {
  problem: ["Research problem in plain language", "研究问题白话解释", "研究问题"],
  whyItMatters: ["Why this problem matters", "为什么这个问题重要"],
  setting: ["What setting or workflow this report is actually about", "这份报告实际对应的场景或流程"],
  primaryMetrics: ["Primary metrics", "主指标"],
  secondaryMetrics: ["Secondary metrics", "次级指标"],
  requiredTerminalEvidence: ["Required terminal evidence", "必要终局证据"],
  metricGuidePrimary: ["Primary metric plain-language explanation", "主指标白话解释"],
  metricGuideSecondary: ["Secondary metric plain-language explanation", "次级指标白话解释"],
  metricGuideSupport: [
    "Health or support metrics and why they are not the main claim",
    "健康度或支持性指标以及它们为什么不是主 claim",
  ],
  backgroundSources: ["Most important background papers or benchmark references", "最重要的背景论文或 benchmark 参考"],
  backgroundAnchors: ["Why these are the right background anchors", "为什么这些是合适的背景锚点"],
  methodBasis: ["Our method source or implementation basis", "我们的方法来源或实现基础"],
  baselineSourcePapers: ["Baseline and comparison source papers", "基线与对比方法来源论文"],
  baselineImplementationSources: [
    "Baseline and comparison implementation sources",
    "基线与对比方法实现来源",
  ],
  metricSourcePapers: ["Metric source papers", "指标来源论文"],
  metricImplementationSource: ["Metric implementation source", "指标实现来源"],
  metricDeviation: ["Deviation from original implementation", "与原始实现的偏差"],
  datasets: ["Datasets", "数据集"],
  baselines: ["Baselines", "基线"],
  metrics: ["Metrics", "指标"],
  finalPerformanceSummary: ["Final performance summary", "最终表现总结"],
  tableCoverage: ["Table coverage", "表格覆盖范围"],
};
// Label lists for values read from terminology-lock.md by buildContextSnapshot().
// NOTE(review): the trailing-colon variant "Contribution bullets:" looks like a
// workaround for how extractValue matches labels — confirm before removing it.
const TERMINOLOGY_FIELDS = {
  methodName: ["Method name", "方法名"],
  shortName: ["Short name or acronym", "简称或缩写"],
  contributionBullets: ["Contribution bullets", "贡献 bullets", "Contribution bullets:"],
};
67
+
5
68
/**
 * Builds the path of a file stored under `<targetDir>/.lab/context`.
 *
 * @param {string} targetDir - Project root directory.
 * @param {string} name - File name inside the context directory.
 * @returns {string} The joined path.
 */
function contextFile(targetDir, name) {
  const segments = [targetDir, ".lab", "context", name];
  return path.join(...segments);
}
@@ -58,6 +121,278 @@ function joinNonEmpty(parts, separator = "; ") {
58
121
  return parts.filter(Boolean).join(separator);
59
122
  }
60
123
 
124
/**
 * Tells whether a field value carries real content rather than a placeholder.
 *
 * @param {string} [value] - Raw field value; falsy values are treated as "".
 * @returns {boolean} False when the trimmed, lower-cased value is listed in
 *   PLACEHOLDER_VALUES, true otherwise.
 */
function isMeaningful(value) {
  const normalized = (value || "").trim().toLowerCase();
  return PLACEHOLDER_VALUES.has(normalized) === false;
}
127
+
128
/**
 * Loads `<targetDir>/.lab/config/workflow.json` on a best-effort basis.
 *
 * @param {string} targetDir - Project root directory.
 * @returns {object} The parsed configuration object, or `{}` when the file is
 *   missing, unreadable, not valid JSON, or does not contain a JSON object.
 */
function readWorkflowConfig(targetDir) {
  const configPath = path.join(targetDir, ".lab", "config", "workflow.json");
  let parsed;
  try {
    // A single guarded read covers the "file missing" case too, so there is no
    // window between an existence check and the read.
    parsed = JSON.parse(fs.readFileSync(configPath, "utf8"));
  } catch {
    // Deliberate best-effort: a missing or malformed config means "use defaults".
    return {};
  }
  // JSON.parse accepts `null`, numbers, strings and arrays; callers read
  // properties off the result, so anything that is not a plain object maps to {}.
  const isPlainObject = parsed !== null && typeof parsed === "object" && !Array.isArray(parsed);
  return isPlainObject ? parsed : {};
}
139
+
140
/**
 * Resolves a configured path against the project root.
 *
 * @param {string} targetDir - Project root directory.
 * @param {*} configuredPath - Path taken from configuration.
 * @returns {string} The resolved absolute path, or "" when `configuredPath`
 *   is not a non-empty string.
 */
function resolveProjectPath(targetDir, configuredPath) {
  const usable = typeof configuredPath === "string" && configuredPath !== "";
  return usable ? path.resolve(targetDir, configuredPath) : "";
}
146
+
147
/**
 * Computes where the collaborator-facing deliverables live.
 *
 * The root comes from `deliverables_root` in the workflow config and falls back
 * to `docs/research` under the project root.
 *
 * @param {string} targetDir - Project root directory.
 * @returns {{deliverablesRoot: string, reportPath: string, mainTablesPath: string}}
 *   Resolved root plus the paths of `report.md` and `main-tables.md` inside it.
 */
function getCollaboratorDeliverablePaths(targetDir) {
  const config = readWorkflowConfig(targetDir);
  // resolveProjectPath returns "" for anything that is not a non-empty string.
  // Falling back here (rather than with `||` on the raw config value) keeps a
  // malformed setting such as a number from producing cwd-relative paths.
  const deliverablesRoot =
    resolveProjectPath(targetDir, config.deliverables_root) ||
    resolveProjectPath(targetDir, "docs/research");
  return {
    deliverablesRoot,
    reportPath: path.join(deliverablesRoot, "report.md"),
    mainTablesPath: path.join(deliverablesRoot, "main-tables.md"),
  };
}
156
+
157
/**
 * Reports whether the project already has a collaborator-facing deliverable.
 *
 * @param {string} targetDir - Project root directory.
 * @returns {boolean} True when the report file or the main-tables file exists.
 */
function hasCollaboratorFacingDeliverables(targetDir) {
  const deliverables = getCollaboratorDeliverablePaths(targetDir);
  // Report first, main tables second; `some` stops at the first hit.
  return [deliverables.reportPath, deliverables.mainTablesPath].some((candidate) => fs.existsSync(candidate));
}
161
+
162
/**
 * Lists the fields whose value in `text` is absent or still a placeholder.
 *
 * @param {string} text - Document text to inspect.
 * @param {Array<{name: string, labels: string[]}>} fields - Field descriptors.
 * @returns {string[]} Names of the fields without a meaningful value, in input order.
 */
function missingCollaboratorFields(text, fields) {
  const missingNames = [];
  for (const { name, labels } of fields) {
    if (!isMeaningful(extractValue(text, labels))) {
      missingNames.push(name);
    }
  }
  return missingNames;
}
165
+
166
/**
 * Checks whether mission.md is filled in well enough for collaborator-facing reporting.
 *
 * @param {string} targetDir - Project root directory.
 * @returns {string[]} Empty when there are no collaborator-facing deliverables,
 *   when mission.md yields no text, or when every required field is meaningful;
 *   otherwise a single message listing the missing fields.
 */
function collaboratorMissionIssues(targetDir) {
  if (!hasCollaboratorFacingDeliverables(targetDir)) {
    return [];
  }
  const missionText = readFileIfExists(contextFile(targetDir, "mission.md"));
  if (!missionText) {
    return [];
  }
  const gaps = missingCollaboratorFields(missionText, MISSION_COLLABORATOR_FIELDS);
  if (gaps.length === 0) {
    return [];
  }
  return [`mission context is still skeletal for collaborator-facing reporting: ${gaps.join(", ")}`];
}
179
+
180
/**
 * Checks whether eval-protocol.md is filled in well enough for collaborator-facing reporting.
 *
 * @param {string} targetDir - Project root directory.
 * @returns {string[]} Empty when there are no collaborator-facing deliverables,
 *   when eval-protocol.md yields no text, or when every required field is
 *   meaningful; otherwise a single message listing the missing fields.
 */
function collaboratorEvalIssues(targetDir) {
  if (!hasCollaboratorFacingDeliverables(targetDir)) {
    return [];
  }
  const protocolText = readFileIfExists(contextFile(targetDir, "eval-protocol.md"));
  if (!protocolText) {
    return [];
  }
  const gaps = missingCollaboratorFields(protocolText, EVAL_COLLABORATOR_FIELDS);
  if (gaps.length === 0) {
    return [];
  }
  return [`evaluation protocol is still skeletal for collaborator-facing reporting: ${gaps.join(", ")}`];
}
193
+
194
/**
 * Looks up a report field by its REPORT_FIELDS key.
 *
 * @param {string} reportText - Text of the collaborator-facing report.
 * @param {string} key - Key into REPORT_FIELDS; unknown keys search with no labels.
 * @returns {*} Whatever extractValue returns for the key's label list.
 */
function extractReportValue(reportText, key) {
  const labels = REPORT_FIELDS[key] || [];
  return extractValue(reportText, labels);
}
197
+
198
/**
 * Picks the first meaningful value, preferring the existing one.
 *
 * @param {string} existingValue - Value already present in the target document.
 * @param {...string} candidates - Fallbacks, in priority order.
 * @returns {string} `existingValue` when meaningful, otherwise the first
 *   meaningful candidate, otherwise "".
 */
function mergePreferred(existingValue, ...candidates) {
  const winner = [existingValue, ...candidates].find((option) => isMeaningful(option));
  // `find` yields undefined when nothing qualifies; a meaningful value is never undefined.
  return winner === undefined ? "" : winner;
}
209
+
210
/**
 * Returns the first non-empty entry of a `;`- or `,`-separated metric list.
 *
 * @param {string} [metrics] - Separated list; falsy input is treated as "".
 * @returns {string} The first trimmed non-empty entry, or "" when there is none.
 */
function firstMetric(metrics) {
  for (const piece of (metrics || "").split(/[;,]/)) {
    const candidate = piece.trim();
    if (candidate) {
      return candidate;
    }
  }
  return "";
}
216
+
217
/**
 * Renders the mission context document as markdown.
 *
 * @param {string} lang - "zh" selects the Chinese layout; anything else selects English.
 * @param {object} fields - Field values keyed by name; falsy values are rendered
 *   as the language's placeholder ("待补充" or "TBD").
 * @returns {string} The full document text, ending with a newline.
 */
function buildMissionContextText(lang, fields) {
  const isChinese = lang === "zh";
  const placeholder = isChinese ? "待补充" : "TBD";
  // Each row is [Chinese label, English label, key in `fields`].
  const layout = [
    {
      zh: "核心问题",
      en: "Core Problem",
      rows: [
        ["一句话问题", "One-sentence problem", "problem"],
        ["为什么重要", "Why it matters", "whyItMatters"],
        ["目标失败场景", "Target failure case", "targetFailureCase"],
      ],
    },
    {
      zh: "成功标准",
      en: "Success Criteria",
      rows: [
        ["主指标", "Primary metric", "primaryMetric"],
        ["成功阈值", "Success threshold", "successThreshold"],
        ["必须对比的 baseline", "Required baseline comparison", "requiredBaselineComparison"],
        ["最小证据要求", "Minimum evidence requirement", "minimumEvidenceRequirement"],
      ],
    },
    {
      zh: "冻结边界",
      en: "Frozen Boundaries",
      rows: [
        ["数据集或 benchmark 范围", "Dataset or benchmark scope", "datasetScope"],
        ["切分策略", "Split policy", "splitPolicy"],
        ["评估协议", "Evaluation protocol", "evaluationProtocol"],
        ["硬约束", "Hard constraints", "hardConstraints"],
      ],
    },
    {
      zh: "当前状态",
      en: "Current Status",
      rows: [
        ["已批准方向", "Approved direction", "approvedDirection"],
        ["当前 owner 或会话", "Current owner or session", "currentOwner"],
        ["最近一次允许更新 mission 的阶段", "Latest stage to update this mission", "latestStage"],
        ["回填来源", "Hydration provenance", "hydrationProvenance"],
        ["协作者可读状态", "Collaborator-ready status", "collaboratorReadyStatus"],
      ],
    },
  ];

  const lines = [isChinese ? "# 研究主线" : "# Research Mission"];
  for (const section of layout) {
    // Blank line, heading, blank line, then one bullet per row.
    lines.push("", `## ${isChinese ? section.zh : section.en}`, "");
    for (const [zhLabel, enLabel, key] of section.rows) {
      lines.push(`- ${isChinese ? zhLabel : enLabel}: ${fields[key] || placeholder}`);
    }
  }
  return `${lines.join("\n")}\n`;
}
282
+
283
/**
 * Renders the evaluation protocol document as markdown.
 *
 * @param {string} lang - "zh" selects the Chinese layout; anything else selects English.
 * @param {object} fields - Field values keyed by name; falsy values are rendered
 *   as the language's placeholder ("待补充" or "TBD").
 * @param {Array<object>} [rungs] - Experiment rungs; each is rendered as a
 *   `### Rung: <id>` block after the Gate Ladder bullets.
 * @returns {string} The full document text. With no rungs the rung area is
 *   empty, so the text ends with blank lines after the last bullet.
 */
function buildEvalProtocolText(lang, fields, rungs) {
  const isChinese = lang === "zh";
  const placeholder = isChinese ? "待补充" : "TBD";
  const pick = (zhText, enText) => (isChinese ? zhText : enText);

  // Each row is [Chinese label, English label, property on the rung].
  const rungRows = [
    ["阶段", "Stage", "stage"],
    ["目标", "Goal", "goal"],
    ["命令", "Command", "command"],
    ["监视目标", "Watch", "watch"],
    ["gate 命令", "Gate", "gate"],
    ["通过后", "On pass", "onPass"],
    ["失败后", "On fail", "onFail"],
    ["停止后", "On stop", "onStop"],
  ];
  const rungBlocks = (rungs || [])
    .map((rung) => {
      const bullets = rungRows.map(
        ([zhLabel, enLabel, key]) => `- ${pick(zhLabel, enLabel)}: ${rung[key] || placeholder}`
      );
      return [`### Rung: ${rung.id}`, "", ...bullets].join("\n");
    })
    .join("\n\n");

  // Each row is [Chinese label, English label, key in `fields`].
  const layout = [
    {
      zh: "主评估目标",
      en: "Primary Evaluation Objective",
      rows: [
        ["主评估目标", "Primary evaluation objective", "primaryEvaluationObjective"],
        ["主指标", "Primary metrics", "primaryMetrics"],
        ["次级指标", "Secondary metrics", "secondaryMetrics"],
        ["必要终局证据", "Required terminal evidence", "requiredTerminalEvidence"],
      ],
    },
    {
      zh: "主表计划",
      en: "Table Plan",
      rows: [
        ["主表计划", "Table plan", "tablePlan"],
        ["每张表必须支撑的 claims", "Required claims per table", "requiredClaimsPerTable"],
      ],
    },
    {
      zh: "指标释义",
      en: "Metric Glossary",
      rows: [
        ["指标释义", "Metric glossary", "metricGlossary"],
        ["回填来源", "Hydration provenance", "hydrationProvenance"],
        ["背景来源", "Background sources", "backgroundSources"],
        ["方法与基线来源论文", "Method and baseline source papers", "methodAndBaselineSourcePapers"],
        ["方法与基线实现来源", "Method and baseline implementation source", "methodAndBaselineImplementationSource"],
        ["指标来源论文", "Metric source papers", "metricSourcePapers"],
        ["指标实现来源", "Metric implementation source", "metricImplementationSource"],
        ["对比方法来源论文", "Comparison source papers", "comparisonSourcePapers"],
        ["对比方法实现来源", "Comparison implementation source", "comparisonImplementationSource"],
        ["与原始实现的偏差", "Deviation from original implementation", "deviationFromOriginalImplementation"],
      ],
    },
    {
      // The Chinese layout keeps the English heading for this section.
      zh: "Gate Ladder",
      en: "Gate Ladder",
      rows: [
        ["实验阶梯", "Experiment ladder", "experimentLadder"],
        ["benchmark 阶梯", "Benchmark ladder", "benchmarkLadder"],
        ["对比方法 gate", "Comparison gate", "comparisonGate"],
        ["升格 gate", "Promotion gate", "promotionGate"],
        ["最小样本量", "Minimum sample sizes", "minimumSampleSizes"],
        ["必要输出工件", "Required output artifacts", "requiredOutputArtifacts"],
      ],
    },
  ];

  const lines = [
    pick("# 评估协议", "# Evaluation Protocol"),
    "",
    pick(
      "用这份文件定义 `/lab:run`、`/lab:iterate`、`/lab:auto` 和 `/lab:report` 共用的论文导向评估目标、主表计划、gate 与 benchmark ladder。",
      "Use this file to define the paper-facing evaluation target, table plan, gates, and benchmark ladder shared by `/lab:run`, `/lab:iterate`, `/lab:auto`, and `/lab:report`."
    ),
  ];
  for (const section of layout) {
    lines.push("", `## ${pick(section.zh, section.en)}`, "");
    for (const [zhLabel, enLabel, key] of section.rows) {
      lines.push(`- ${pick(zhLabel, enLabel)}: ${fields[key] || placeholder}`);
    }
  }
  // The rung area always follows a blank line, even when it is empty.
  lines.push("", rungBlocks);
  return `${lines.join("\n")}\n`;
}
395
+
61
396
  function extractClaim(text) {
62
397
  const blocks = text
63
398
  .split(/\n(?=\d+\.\s)/)
@@ -76,6 +411,264 @@ function labelValue(text, englishLabels, chineseLabels = []) {
76
411
  return extractValue(text, [...englishLabels, ...chineseLabels]);
77
412
  }
78
413
 
414
/**
 * Lists the artifacts that hydration can draw values from.
 *
 * @param {string} targetDir - Project root directory.
 * @returns {string[]} Project-relative paths, in this order: the report and
 *   main-tables files that exist on disk, then the data-decisions, state and
 *   evidence-index context files for which readFileIfExists returns a truthy value.
 */
function collectHydrationSources(targetDir) {
  const { reportPath, mainTablesPath } = getCollaboratorDeliverablePaths(targetDir);
  const sources = [];
  for (const deliverable of [reportPath, mainTablesPath]) {
    if (fs.existsSync(deliverable)) {
      const relative = path.relative(targetDir, deliverable);
      if (relative) {
        sources.push(relative);
      }
    }
  }
  for (const name of ["data-decisions.md", "state.md", "evidence-index.md"]) {
    if (readFileIfExists(contextFile(targetDir, name))) {
      sources.push(`.lab/context/${name}`);
    }
  }
  return sources;
}
424
+
425
/**
 * Backfills mission.md from existing artifacts and rewrites it when the
 * rendered text differs from what was read.
 *
 * Values already present in mission.md win; gaps are filled from the report,
 * state.md, the evaluation protocol, data-decisions.md, auto-outcome.md and
 * evidence-index.md. Does nothing unless a collaborator-facing deliverable exists.
 *
 * @param {string} targetDir - Project root directory.
 * @returns {boolean} True when mission.md was written, false otherwise.
 */
function hydrateMissionContext(targetDir) {
  if (!hasCollaboratorFacingDeliverables(targetDir)) {
    return false;
  }

  const lang = readWorkflowLanguage(targetDir);
  const readContext = (name) => readFileIfExists(contextFile(targetDir, name));
  const missionText = readContext("mission.md");
  const stateText = readContext("state.md");
  const evidenceText = readContext("evidence-index.md");
  const dataDecisions = readContext("data-decisions.md");
  const reportText = readFileIfExists(getCollaboratorDeliverablePaths(targetDir).reportPath);
  const evalProtocol = parseEvalProtocol(targetDir);
  const hydrationSources = collectHydrationSources(targetDir);
  const autoOutcomeText = readContext("auto-outcome.md");

  const fromMission = (...labels) => extractValue(missionText, labels);
  const fromReport = (key) => extractReportValue(reportText, key);

  const fields = {};
  fields.problem = mergePreferred(
    fromMission("One-sentence problem", "一句话问题"),
    fromReport("problem"),
    extractValue(stateText, ["Current objective", "当前目标", "Current objective"])
  );
  fields.whyItMatters = mergePreferred(fromMission("Why it matters", "为什么重要"), fromReport("whyItMatters"));
  fields.targetFailureCase = fromMission("Target failure case", "目标失败场景");
  // NOTE(review): the firstMetric(...) fallback comes after the full metrics
  // string it is derived from, so it looks unreachable — confirm the intended order.
  fields.primaryMetric = mergePreferred(
    fromMission("Primary metric", "主指标"),
    evalProtocol.primaryMetrics,
    fromReport("primaryMetrics"),
    firstMetric(evalProtocol.primaryMetrics)
  );
  fields.successThreshold = mergePreferred(
    fromMission("Success threshold", "成功阈值"),
    extractValue(autoOutcomeText, ["Terminal goal target", "终止目标目标值"])
  );
  fields.requiredBaselineComparison = fromMission("Required baseline comparison", "必须对比的 baseline");
  fields.minimumEvidenceRequirement = fromMission("Minimum evidence requirement", "最小证据要求");
  fields.datasetScope = mergePreferred(
    fromMission("Dataset or benchmark scope", "数据集或 benchmark 范围"),
    extractValue(dataDecisions, ["Approved dataset package", "Approved datasets", "已批准数据集包", "已批准数据集"]),
    fromReport("datasets")
  );
  fields.splitPolicy = fromMission("Split policy", "切分策略");
  fields.evaluationProtocol = mergePreferred(
    fromMission("Evaluation protocol", "评估协议"),
    ".lab/context/eval-protocol.md"
  );
  fields.hardConstraints = fromMission("Hard constraints", "硬约束");
  fields.approvedDirection = mergePreferred(
    fromMission("Approved direction", "已批准方向"),
    extractClaim(evidenceText)
  );
  fields.currentOwner = fromMission("Current owner or session", "当前 owner 或会话");
  fields.latestStage = mergePreferred(
    fromMission("Latest stage to update this mission", "最近一次允许更新 mission 的阶段"),
    extractValue(stateText, ["Active stage", "当前阶段", "Stage"])
  );
  fields.hydrationProvenance = hydrationSources.join("; ");

  // Readiness is judged on a draft rendered before the status field itself is set.
  const draftText = buildMissionContextText(lang, fields);
  const stillMissing = missingCollaboratorFields(draftText, MISSION_COLLABORATOR_FIELDS);
  fields.collaboratorReadyStatus = stillMissing.length === 0 ? "hydrated" : "artifact-anchored interim";

  const nextText = buildMissionContextText(lang, fields);
  if (nextText === missionText) {
    return false;
  }
  writeContextFile(targetDir, "mission.md", nextText);
  return true;
}
504
+
505
/**
 * Backfills eval-protocol.md from existing artifacts and rewrites it when the
 * rendered text differs from the parsed protocol's text.
 *
 * Values already present in the parsed protocol win; gaps are filled from
 * mission.md, the report, data-decisions.md and the deliverable paths. Does
 * nothing unless a collaborator-facing deliverable exists.
 *
 * @param {string} targetDir - Project root directory.
 * @returns {boolean} True when eval-protocol.md was written, false otherwise.
 */
function hydrateEvalProtocol(targetDir) {
  if (!hasCollaboratorFacingDeliverables(targetDir)) {
    return false;
  }

  const lang = readWorkflowLanguage(targetDir);
  const protocol = parseEvalProtocol(targetDir);
  const { reportPath, mainTablesPath } = getCollaboratorDeliverablePaths(targetDir);
  const missionText = readFileIfExists(contextFile(targetDir, "mission.md"));
  const reportText = readFileIfExists(reportPath);
  const dataDecisions = readFileIfExists(contextFile(targetDir, "data-decisions.md"));
  const hydrationSources = collectHydrationSources(targetDir);

  // Project-relative path of a deliverable, or "" when it is not on disk.
  const relativeIfPresent = (filePath) => (fs.existsSync(filePath) ? path.relative(targetDir, filePath) : "");
  const mainTablesRelative = relativeIfPresent(mainTablesPath);
  const requiredOutputs = [relativeIfPresent(reportPath), mainTablesRelative, ".lab/context/evidence-index.md"]
    .filter(Boolean)
    .join(", ");

  const fromReport = (key) => extractReportValue(reportText, key);
  const metricGuides = joinNonEmpty(
    [fromReport("metricGuidePrimary"), fromReport("metricGuideSecondary"), fromReport("metricGuideSupport")],
    " | "
  );

  const fields = {};
  fields.primaryEvaluationObjective = mergePreferred(
    protocol.primaryEvaluationObjective,
    extractValue(missionText, ["One-sentence problem", "一句话问题"]),
    fromReport("problem")
  );
  fields.primaryMetrics = mergePreferred(
    protocol.primaryMetrics,
    extractValue(missionText, ["Primary metric", "主指标"]),
    fromReport("primaryMetrics")
  );
  fields.secondaryMetrics = mergePreferred(protocol.secondaryMetrics, fromReport("secondaryMetrics"));
  fields.requiredTerminalEvidence = mergePreferred(
    protocol.requiredTerminalEvidence,
    fromReport("requiredTerminalEvidence"),
    requiredOutputs
  );
  fields.tablePlan = mergePreferred(
    protocol.tablePlan,
    fs.existsSync(mainTablesPath) ? `See ${path.relative(targetDir, mainTablesPath)}` : ""
  );
  fields.requiredClaimsPerTable = protocol.requiredClaimsPerTable;
  fields.metricGlossary = mergePreferred(protocol.metricGlossary, metricGuides);
  fields.backgroundSources = mergePreferred(
    protocol.backgroundSources,
    fromReport("backgroundSources"),
    extractValue(dataDecisions, ["Papers that used the approved datasets", "使用过已批准数据集的论文", "使用过该数据集的论文"])
  );
  fields.methodAndBaselineSourcePapers = mergePreferred(
    protocol.methodAndBaselineSourcePapers,
    fromReport("baselineSourcePapers")
  );
  fields.methodAndBaselineImplementationSource = mergePreferred(
    protocol.methodAndBaselineImplementationSource,
    fromReport("baselineImplementationSources")
  );
  fields.metricSourcePapers = mergePreferred(protocol.metricSourcePapers, fromReport("metricSourcePapers"));
  fields.metricImplementationSource = mergePreferred(
    protocol.metricImplementationSource,
    fromReport("metricImplementationSource")
  );
  // The comparison fields fall back to the same report entries as the baseline fields.
  fields.comparisonSourcePapers = mergePreferred(protocol.comparisonSourcePapers, fromReport("baselineSourcePapers"));
  fields.comparisonImplementationSource = mergePreferred(
    protocol.comparisonImplementationSource,
    fromReport("baselineImplementationSources")
  );
  fields.deviationFromOriginalImplementation = mergePreferred(
    protocol.deviationFromOriginalImplementation,
    fromReport("metricDeviation")
  );
  // Ladder and gate settings are carried over from the parsed protocol unchanged.
  fields.benchmarkLadder = protocol.benchmarkLadder;
  fields.experimentLadder = protocol.experimentLadder;
  fields.comparisonGate = protocol.comparisonGate;
  fields.promotionGate = protocol.promotionGate;
  fields.minimumSampleSizes = protocol.minimumSampleSizes;
  fields.requiredOutputArtifacts = mergePreferred(protocol.requiredOutputArtifacts, requiredOutputs);
  fields.hydrationProvenance = hydrationSources.join("; ");

  const nextText = buildEvalProtocolText(lang, fields, protocol.experimentRungs);
  if (nextText === protocol.text) {
    return false;
  }
  writeContextFile(targetDir, "eval-protocol.md", nextText);
  return true;
}
610
+
611
/**
 * Summarizes whether the canonical context supports a collaborator-ready report.
 *
 * @param {string} targetDir - Project root directory.
 * @returns {{mode: string, readiness: string, reason: string, issues: string[]}}
 *   "collaborator-ready" when neither the mission nor the evaluation check
 *   reports an issue; otherwise "artifact-anchored interim" with the issues
 *   joined into `reason`.
 */
function getCollaboratorReportStatus(targetDir) {
  const issues = [...collaboratorMissionIssues(targetDir), ...collaboratorEvalIssues(targetDir)];
  if (issues.length === 0) {
    return {
      mode: "collaborator-ready",
      readiness: "ready",
      reason: "canonical mission and evaluation context are complete enough for collaborator-facing reporting",
      issues: [],
    };
  }
  return {
    mode: "artifact-anchored interim",
    readiness: "hydrated but incomplete",
    reason: issues.join(" | "),
    issues,
  };
}
630
+
631
/**
 * Inserts or replaces a markdown section identified by its heading line.
 *
 * An existing section runs from its heading line up to the next line starting
 * with `## ` or the end of the text. When no such section exists, the new one
 * is appended after the trimmed text.
 *
 * @param {string} text - Document text.
 * @param {string} heading - Exact heading line, e.g. "## Report Status".
 * @param {string[]} bodyLines - Lines placed under the heading.
 * @returns {string} The updated document text. Calling again with the same
 *   arguments on the result returns it unchanged.
 */
function upsertSection(text, heading, bodyLines) {
  const sectionText = `${heading}\n\n${bodyLines.join("\n")}`.trimEnd();
  const escapedHeading = heading.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  // JavaScript has no `\Z` anchor (it matches a literal "Z"), and `$` means
  // end-of-line under the `m` flag, so end of input is spelled `(?![\s\S])`.
  const pattern = new RegExp(`^${escapedHeading}\\s*$[\\s\\S]*?(?=^##\\s|(?![\\s\\S]))`, "m");
  const match = pattern.exec(text);
  if (!match) {
    return `${text.trimEnd()}\n\n${sectionText}\n`;
  }
  // Splice by index instead of String.replace so `$&`, `$1`, ... in the body
  // are inserted literally.
  const before = text.slice(0, match.index);
  const after = text.slice(match.index + match[0].length);
  // The match swallows the blank lines before the next heading; restore the
  // separator so the following section still starts on its own line.
  return after ? `${before}${sectionText}\n\n${after}` : `${before}${sectionText}\n`;
}
639
+
640
/**
 * Writes the current report status into the "Report Status" section of the report file.
 *
 * @param {string} targetDir - Project root directory.
 * @param {{mode: string, readiness: string, reason: string}} status - Status to record.
 * @returns {boolean} True when the report file was rewritten; false when the
 *   file does not exist or the section is already up to date.
 */
function syncCollaboratorReportStatus(targetDir, status) {
  const { reportPath } = getCollaboratorDeliverablePaths(targetDir);
  if (!fs.existsSync(reportPath)) {
    return false;
  }
  const currentText = fs.readFileSync(reportPath, "utf8");
  // A second-level heading is used when the text begins with a "# " line,
  // otherwise a first-level one.
  const startsWithTitle = /^#\s/.test(currentText);
  const heading = startsWithTitle ? "## Report Status" : "# Report Status";
  const statusLines = [
    `- Report mode: ${status.mode}`,
    `- Canonical context readiness: ${status.readiness}`,
    `- Why the current mode is appropriate: ${status.reason}`,
  ];
  const updatedText = upsertSection(currentText, heading, statusLines);
  if (updatedText === currentText) {
    return false;
  }
  fs.writeFileSync(reportPath, updatedText.trimEnd() + "\n");
  return true;
}
658
+
659
/**
 * Runs the full hydration pass: mission, evaluation protocol, then report status.
 *
 * The report status is computed after both hydration steps have run, and is
 * then synced into the report file.
 *
 * @param {string} targetDir - Project root directory.
 * @returns {{missionChanged: boolean, evalChanged: boolean, reportChanged: boolean, reportStatus: object}}
 *   Which files were rewritten, plus the computed report status.
 */
function hydrateCanonicalContext(targetDir) {
  const outcome = {};
  outcome.missionChanged = hydrateMissionContext(targetDir);
  outcome.evalChanged = hydrateEvalProtocol(targetDir);
  const reportStatus = getCollaboratorReportStatus(targetDir);
  outcome.reportChanged = syncCollaboratorReportStatus(targetDir, reportStatus);
  outcome.reportStatus = reportStatus;
  return outcome;
}
671
+
79
672
  function renderSummary(lang, data) {
80
673
  if (lang === "zh") {
81
674
  return `# 研究摘要
@@ -95,6 +688,11 @@ function renderSummary(lang, data) {
95
688
  - Auto stop reason: ${data.autoStopReason || "待补充"}
96
689
  - Auto final artifact: ${data.autoFinalArtifact || "待补充"}
97
690
  - Auto final rung: ${data.autoFinalRung || "待补充"}
691
+ - Collaborator report mode: ${data.reportMode || "待补充"}
692
+ - Canonical context readiness: ${data.reportReadiness || "待补充"}
693
+ - Why this report mode is active: ${data.reportModeReason || "待补充"}
694
+ - Method name: ${data.methodName || "待补充"}
695
+ - Contribution bullets: ${data.contributionBullets || "待补充"}
98
696
  - Eval objective: ${data.evalObjective || "待补充"}
99
697
  - Primary metrics: ${data.evalPrimaryMetrics || "待补充"}
100
698
  - Secondary metrics: ${data.evalSecondaryMetrics || "待补充"}
@@ -153,6 +751,11 @@ function renderSummary(lang, data) {
153
751
  - Auto stop reason: ${data.autoStopReason || "TBD"}
154
752
  - Auto final artifact: ${data.autoFinalArtifact || "TBD"}
155
753
  - Auto final rung: ${data.autoFinalRung || "TBD"}
754
+ - Collaborator report mode: ${data.reportMode || "TBD"}
755
+ - Canonical context readiness: ${data.reportReadiness || "TBD"}
756
+ - Why this report mode is active: ${data.reportModeReason || "TBD"}
757
+ - Method name: ${data.methodName || "TBD"}
758
+ - Contribution bullets: ${data.contributionBullets || "TBD"}
156
759
  - Eval objective: ${data.evalObjective || "TBD"}
157
760
  - Primary metrics: ${data.evalPrimaryMetrics || "TBD"}
158
761
  - Secondary metrics: ${data.evalSecondaryMetrics || "TBD"}
@@ -266,6 +869,11 @@ ${data.problem || "待补充"}
266
869
  - Auto stop reason: ${data.autoStopReason || "待补充"}
267
870
  - Auto final artifact: ${data.autoFinalArtifact || "待补充"}
268
871
  - Auto final rung: ${data.autoFinalRung || "待补充"}
872
+ - Collaborator report mode: ${data.reportMode || "待补充"}
873
+ - Canonical context readiness: ${data.reportReadiness || "待补充"}
874
+ - Why this report mode is active: ${data.reportModeReason || "待补充"}
875
+ - Method name: ${data.methodName || "待补充"}
876
+ - Contribution bullets: ${data.contributionBullets || "待补充"}
269
877
  - Eval objective: ${data.evalObjective || "待补充"}
270
878
  - Primary metrics: ${data.evalPrimaryMetrics || "待补充"}
271
879
  - Secondary metrics: ${data.evalSecondaryMetrics || "待补充"}
@@ -335,6 +943,11 @@ ${data.problem || "TBD"}
335
943
  - Auto stop reason: ${data.autoStopReason || "TBD"}
336
944
  - Auto final artifact: ${data.autoFinalArtifact || "TBD"}
337
945
  - Auto final rung: ${data.autoFinalRung || "TBD"}
946
+ - Collaborator report mode: ${data.reportMode || "TBD"}
947
+ - Canonical context readiness: ${data.reportReadiness || "TBD"}
948
+ - Why this report mode is active: ${data.reportModeReason || "TBD"}
949
+ - Method name: ${data.methodName || "TBD"}
950
+ - Contribution bullets: ${data.contributionBullets || "TBD"}
338
951
  - Eval objective: ${data.evalObjective || "TBD"}
339
952
  - Primary metrics: ${data.evalPrimaryMetrics || "TBD"}
340
953
  - Secondary metrics: ${data.evalSecondaryMetrics || "TBD"}
@@ -381,11 +994,13 @@ ${data.problem || "TBD"}
381
994
  }
382
995
 
383
996
  function buildContextSnapshot(targetDir) {
997
+ const reportStatus = getCollaboratorReportStatus(targetDir);
384
998
  const mission = readFileIfExists(contextFile(targetDir, "mission.md"));
385
999
  const state = readFileIfExists(contextFile(targetDir, "state.md"));
386
1000
  const evidence = readFileIfExists(contextFile(targetDir, "evidence-index.md"));
387
1001
  const questions = readFileIfExists(contextFile(targetDir, "open-questions.md"));
388
1002
  const dataDecisions = readFileIfExists(contextFile(targetDir, "data-decisions.md"));
1003
+ const terminologyLock = readFileIfExists(contextFile(targetDir, "terminology-lock.md"));
389
1004
  const autoMode = readFileIfExists(contextFile(targetDir, "auto-mode.md"));
390
1005
  const autoStatus = readFileIfExists(contextFile(targetDir, "auto-status.md"));
391
1006
  const autoOutcome = readFileIfExists(contextFile(targetDir, "auto-outcome.md"));
@@ -592,6 +1207,11 @@ function buildContextSnapshot(targetDir) {
592
1207
  autoStopReason: extractValue(autoOutcome, ["Stop reason", "停止原因"]),
593
1208
  autoFinalArtifact: extractValue(autoOutcome, ["Final artifact", "最终工件"]),
594
1209
  autoFinalRung: extractValue(autoOutcome, ["Final rung", "最终 rung"]),
1210
+ reportMode: reportStatus.mode,
1211
+ reportReadiness: reportStatus.readiness,
1212
+ reportModeReason: reportStatus.reason,
1213
+ methodName: extractValue(terminologyLock, TERMINOLOGY_FIELDS.methodName),
1214
+ contributionBullets: extractValue(terminologyLock, TERMINOLOGY_FIELDS.contributionBullets),
595
1215
  evalObjective: evalProtocol.primaryEvaluationObjective,
596
1216
  evalPrimaryMetrics: evalProtocol.primaryMetrics,
597
1217
  evalSecondaryMetrics: evalProtocol.secondaryMetrics,
@@ -619,6 +1239,7 @@ function writeContextFile(targetDir, name, content) {
619
1239
  }
620
1240
 
621
1241
  function refreshContext({ targetDir }) {
1242
+ hydrateCanonicalContext(targetDir);
622
1243
  const lang = readWorkflowLanguage(targetDir);
623
1244
  const snapshot = buildContextSnapshot(targetDir);
624
1245
  writeContextFile(targetDir, "summary.md", renderSummary(lang, snapshot));
@@ -695,6 +1316,11 @@ function archiveContext({ targetDir, now = new Date() }) {
695
1316
 
696
1317
  module.exports = {
697
1318
  archiveContext,
1319
+ collaboratorEvalIssues,
1320
+ collaboratorMissionIssues,
1321
+ getCollaboratorReportStatus,
1322
+ hasCollaboratorFacingDeliverables,
1323
+ hydrateCanonicalContext,
698
1324
  pruneContext,
699
1325
  refreshContext,
700
1326
  };
package/lib/i18n.cjs CHANGED
@@ -289,7 +289,11 @@ const ZH_SKILL_FILES = {
289
289
 
290
290
  ## 必要输出
291
291
 
292
+ - 报告状态:collaborator-ready 或 artifact-anchored interim
292
293
  - 给用户看的总结
294
+ - 问题与背景的白话说明
295
+ - 数据集场景说明
296
+ - 贡献总结
293
297
  - 方法概述
294
298
  - 选定指标摘要
295
299
  - 指标白话释义
@@ -301,6 +305,7 @@ const ZH_SKILL_FILES = {
301
305
  - 失败尝试
302
306
  - 局限性
303
307
  - 下一步
308
+ - 单独列出的工件状态,而不是混进已验证结果
304
309
 
305
310
  ## 上下文读取
306
311
 
@@ -312,6 +317,8 @@ const ZH_SKILL_FILES = {
312
317
 
313
318
  ## 上下文写回
314
319
 
320
+ - \`.lab/context/mission.md\`
321
+ - \`.lab/context/eval-protocol.md\`
315
322
  - \`.lab/context/state.md\`
316
323
  - \`.lab/context/evidence-index.md\`
317
324
 
@@ -324,7 +331,15 @@ const ZH_SKILL_FILES = {
324
331
  - 必须把已批准的主指标、次级指标和必要终局证据明确写进 \`report.md\` 与受管的 \`main-tables.md\`。
325
332
  - 必须用白话解释选定的主指标和次级指标:每个指标在衡量什么、越高还是越低更好、它是主结果指标还是健康度/支持性指标。
326
333
  - 如果出现 coverage、completeness、confidence 或类似健康度指标,必须明确说明这类指标回答的是“实验是否跑稳、证据是否完整”,而不是主要科学效应本身。
334
+ - 要把最关键的背景来源、方法/基线来源和指标来源直接写进报告,不要把它们藏在 \`.lab/context/*\` 里。
335
+ - 如果 \`.lab/context/terminology-lock.md\` 里已经冻结了方法名和 contribution bullets,就必须把它们带进报告。
336
+ - 方法概述必须用协作者能读懂的话说明:我们的方法大致怎么做、相对 closest prior work 或 strongest baseline 改了什么、这些 prior 方法各自做了什么,以及它们为什么在当前 claim 下仍然不够。
337
+ - 只保留少量最关键的 prior work/baseline 锚点;每个锚点都要用一句话交代它做了什么和它的局限。
338
+ - 在起草报告前,先检查 \`.lab/context/mission.md\` 和 \`.lab/context/eval-protocol.md\` 是否仍是模板空壳。
339
+ - 如果 canonical context 还是空壳,要先根据 frozen result artifacts、data-decisions、evidence-index 和已批准上下文回填“最小可信版本”,再写报告。
340
+ - 如果回填后仍缺少协作者可读所需的关键字段,就必须把输出降级成 \`artifact-anchored interim report\`,不能冒充最终协作者报告。
327
341
  - 如果报告依赖了对原始指标或原始实现的偏差,必须明确写出这个偏差。
342
+ - workflow 工件状态、rerun id 或 LaTeX 骨架状态不能混进“已验证主结果”;这些内容必须单列到工件状态部分。
328
343
  - 如果 workflow language 是中文,\`report.md\` 和 \`<deliverables_root>/main-tables.md\` 也应使用中文,除非文件路径、代码标识符或字面指标名必须保持原样。
329
344
  - 解释优先保守,不要写成营销文案。
330
345
  - 要给 \`/lab:write\` 留下清晰 handoff,尤其是 section draft 可以直接引用的证据链接。
@@ -334,6 +349,7 @@ const ZH_SKILL_FILES = {
334
349
  - 开始前先简洁说明:campaign outcome、选定的主指标和次级指标、最强已支撑 claim、最大的报告风险。
335
350
  - 当该阶段由 \`/lab:auto\` 进入时,要主动给出用户可读的白话总结,不要等用户再追问“这些指标是什么意思”或“这些表怎么看”。
336
351
  - 把 \`report.md\` 当作给用户看的工件,而不是内部 dump。术语第一次出现时就解释;先讲结论,再讲术语。
352
+ - 把 contribution bullets 当作协作者可读的最终主张摘要,而不是内部 TODO;每条都必须和当前证据边界对齐。
337
353
  - 如果某个未决前提会改变报告解释,一次只问一个问题。
338
354
  - 如果存在多种报告 framing,先给 2-3 个方案、trade-offs 和推荐项,优先最忠于证据的 framing。
339
355
  - 如果某种 framing 会实质影响后续论文 claim,要保留 approval gate。
@@ -691,6 +707,12 @@ const ZH_SKILL_FILES = {
691
707
  [path.join(".lab", ".managed", "templates", "final-report.md")]:
692
708
  `# 最终报告
693
709
 
710
+ ## 报告状态
711
+
712
+ - 报告模式:collaborator-ready 或 artifact-anchored interim
713
+ - canonical context 完整度:
714
+ - 为什么当前只能用这个模式:
715
+
694
716
  ## 给用户看的总结
695
717
 
696
718
  - 一句话结论:
@@ -698,6 +720,33 @@ const ZH_SKILL_FILES = {
698
720
  - 还没有被证明的内容:
699
721
  - 当前最大报告风险:
700
722
 
723
+ ## 问题与背景
724
+
725
+ - 这项研究在解决什么问题:
726
+ - 为什么这个问题重要:
727
+ - 当前报告到底覆盖了什么 setting 或 workflow:
728
+
729
+ ## 数据集场景说明
730
+
731
+ - 数据集或 benchmark 1 代表什么真实场景:
732
+ - 数据集或 benchmark 2 代表什么真实场景:
733
+ - 数据集或 benchmark 3 代表什么真实场景:
734
+
735
+ ## 贡献总结
736
+
737
+ - Contribution bullets:
738
+ - 当前证据最强的贡献:
739
+ - 仍需要更强证据的贡献:
740
+
741
+ ## 方法概述
742
+
743
+ - 已批准的方法名:
744
+ - 方法白话总结:
745
+ - 相比 prior work 这套方法改变了什么:
746
+ - 最相关的 prior work 或 baseline 锚点:
747
+ - 这些 prior 方法各自做了什么:
748
+ - 为什么这些 prior 方法在这里仍然不够:
749
+
701
750
  ## 选定指标
702
751
 
703
752
  - 主指标:
@@ -740,6 +789,11 @@ const ZH_SKILL_FILES = {
740
789
  - 最终表现摘要:
741
790
  - 主表覆盖情况:
742
791
 
792
+ ## 工件状态
793
+
794
+ - 已就绪的交付物或工作流工件:
795
+ - 这些工件状态为什么不是科学结论:
796
+
743
797
  ## 主要结果
744
798
 
745
799
  - 主要发现 1:
@@ -953,6 +1007,7 @@ const ZH_SKILL_FILES = {
953
1007
  - 一句话问题:
954
1008
  - 为什么重要:
955
1009
  - 目标失败场景:
1010
+ - 回填来源:
956
1011
 
957
1012
  ## 成功标准
958
1013
 
@@ -973,6 +1028,7 @@ const ZH_SKILL_FILES = {
973
1028
  - 已批准方向:
974
1029
  - 当前 owner 或会话:
975
1030
  - 最近更新该 mission 的 stage:
1031
+ - 协作者可读状态:
976
1032
  `,
977
1033
  [path.join(".lab", "context", "state.md")]:
978
1034
  `# 工作流状态
@@ -1966,6 +2022,7 @@ ZH_CONTENT[path.join(".lab", "context", "eval-protocol.md")] = `# 评估协议
1966
2022
  ## 指标释义
1967
2023
 
1968
2024
  - 指标释义:
2025
+ - 回填来源:
1969
2026
  - 背景来源:
1970
2027
  - 方法与基线来源论文:
1971
2028
  - 方法与基线实现来源:
@@ -2021,10 +2078,12 @@ ZH_CONTENT[path.join(".codex", "skills", "lab", "stages", "auto.md")] = `# \`/la
2021
2078
 
2022
2079
  ## 上下文写回
2023
2080
 
2081
+ - \`.lab/context/mission.md\`
2024
2082
  - \`.lab/context/state.md\`
2025
2083
  - \`.lab/context/decisions.md\`
2026
2084
  - \`.lab/context/data-decisions.md\`
2027
2085
  - \`.lab/context/evidence-index.md\`
2086
+ - \`.lab/context/eval-protocol.md\`
2028
2087
  - \`.lab/context/summary.md\`
2029
2088
  - \`.lab/context/session-brief.md\`
2030
2089
  - \`.lab/context/auto-status.md\`
@@ -2053,6 +2112,8 @@ ZH_CONTENT[path.join(".codex", "skills", "lab", "stages", "auto.md")] = `# \`/la
2053
2112
  - \`review\` 更新规范审查上下文
2054
2113
  - \`report\` 写出 \`<deliverables_root>/report.md\`
2055
2114
  - \`write\` 写出 \`<deliverables_root>/paper/\` 下的 LaTeX 产物
2115
+ - 如果即将进入 \`report\`,而 \`.lab/context/mission.md\` 或 \`.lab/context/eval-protocol.md\` 仍是模板空壳,就先根据冻结工件和已批准上下文回填最小可信版本。
2116
+ - 如果回填后仍缺少协作者可读所需的关键字段,就必须强制生成 \`artifact-anchored interim report\`,不能冒充最终协作者报告。
2056
2117
  - promotion 成功后,必须写回 \`data-decisions.md\`、\`decisions.md\`、\`state.md\` 和 \`session-brief.md\`。
2057
2118
  - 如果某个指标或对比 claim 在评估协议里没有带来源的定义,就不能拿它做 stop 或 promotion 判断。
2058
2119
 
@@ -1,5 +1,11 @@
1
1
  # Final Report
2
2
 
3
+ ## Report Status
4
+
5
+ - Report mode: collaborator-ready or artifact-anchored interim
6
+ - Canonical context readiness:
7
+ - Why the current mode is appropriate:
8
+
3
9
  ## Reader Summary
4
10
 
5
11
  - One-sentence conclusion:
@@ -7,6 +13,33 @@
7
13
  - What is still unproven:
8
14
  - Biggest reporting risk:
9
15
 
16
+ ## Problem and Background
17
+
18
+ - Research problem in plain language:
19
+ - Why this problem matters:
20
+ - What setting or workflow this report is actually about:
21
+
22
+ ## Dataset Scene Notes
23
+
24
+ - Dataset or benchmark 1 and what real-world setting it represents:
25
+ - Dataset or benchmark 2 and what real-world setting it represents:
26
+ - Dataset or benchmark 3 and what real-world setting it represents:
27
+
28
+ ## Contribution Summary
29
+
30
+ - Contribution bullets:
31
+ - Strongest supported contribution:
32
+ - Contributions that still need stronger evidence:
33
+
34
+ ## Method Overview
35
+
36
+ - Approved method name:
37
+ - Plain-language method summary:
38
+ - What this method changes relative to prior work:
39
+ - Most relevant prior work or baseline anchors:
40
+ - What those prior methods do:
41
+ - Why those prior methods are still insufficient here:
42
+
10
43
  ## Selected Metrics
11
44
 
12
45
  - Primary metrics:
@@ -56,6 +89,11 @@
56
89
  - Final performance summary:
57
90
  - Table coverage:
58
91
 
92
+ ## Artifact Status
93
+
94
+ - Deliverables or workflow artifacts that are ready:
95
+ - Artifact status notes that are not scientific findings:
96
+
59
97
  ## Main Results
60
98
 
61
99
  Summarize validated iteration outcomes.
@@ -17,6 +17,7 @@ Use this file to define the paper-facing evaluation objective, table plan, gates
17
17
  ## Metric Glossary
18
18
 
19
19
  - Metric glossary:
20
+ - Hydration provenance:
20
21
  - Background sources:
21
22
  - Method and baseline source papers:
22
23
  - Method and baseline implementation source:
@@ -5,6 +5,7 @@
5
5
  - One-sentence problem:
6
6
  - Why it matters:
7
7
  - Target failure case:
8
+ - Hydration provenance:
8
9
 
9
10
  ## Success Criteria
10
11
 
@@ -25,3 +26,4 @@
25
26
  - Approved direction:
26
27
  - Current owner or session:
27
28
  - Last stage that updated this mission:
29
+ - Collaborator-ready status:
@@ -46,6 +46,7 @@ Use this skill when the user invokes `/lab:*` or asks for the structured researc
46
46
  - Keep an explicit approval gate before `/lab:spec`.
47
47
  - Write idea artifacts with the template in `.lab/.managed/templates/idea.md`.
48
48
  - Update `.lab/context/mission.md`, `.lab/context/decisions.md`, and `.lab/context/open-questions.md` after convergence.
49
+ - Do not leave `.lab/context/mission.md` as a template shell once the problem statement and approved direction are known.
49
50
  - Do not implement code in this stage.
50
51
 
51
52
  ### `/lab:data`
@@ -111,7 +112,8 @@ Use this skill when the user invokes `/lab:*` or asks for the structured researc
111
112
  - Normalize the result with `.lab/.managed/scripts/eval_report.py`.
112
113
  - Validate normalized output with `.lab/.managed/scripts/validate_results.py`.
113
114
  - Read `.lab/context/eval-protocol.md` before choosing the smallest run so the first experiment already targets the approved tables, metrics, and gates.
114
- - Update `.lab/context/state.md` and `.lab/context/evidence-index.md` after the run.
115
+ - Update `.lab/context/state.md`, `.lab/context/evidence-index.md`, and `.lab/context/eval-protocol.md` after the run.
116
+ - If the evaluation protocol is still skeletal, initialize the smallest trustworthy source-backed version before treating the run as the protocol anchor.
115
117
 
116
118
  ### `/lab:iterate`
117
119
 
@@ -131,7 +133,8 @@ Use this skill when the user invokes `/lab:*` or asks for the structured researc
131
133
  - Keep metric definitions, baseline behavior, and comparison implementations anchored to the source-backed evaluation protocol before changing thresholds, gates, or ladder transitions.
132
134
  - Switch to diagnostic mode if risk increases for two consecutive rounds.
133
135
  - Write round reports with `.lab/.managed/templates/iteration-report.md`.
134
- - Update `.lab/context/state.md`, `.lab/context/decisions.md`, `.lab/context/evidence-index.md`, and `.lab/context/open-questions.md` each round as needed.
136
+ - Update `.lab/context/state.md`, `.lab/context/decisions.md`, `.lab/context/evidence-index.md`, `.lab/context/open-questions.md`, and `.lab/context/eval-protocol.md` each round as needed.
137
+ - Keep `.lab/context/eval-protocol.md` synchronized with accepted ladder changes, benchmark scope, and source-backed implementation deviations.
135
138
  - Stop at threshold success or iteration cap, and record blockers plus next-best actions when the campaign ends without success.
136
139
 
137
140
  ### `/lab:review`
@@ -153,7 +156,9 @@ Use this skill when the user invokes `/lab:*` or asks for the structured researc
153
156
  - Aggregate them with `.lab/.managed/scripts/summarize_iterations.py`.
154
157
  - Write the final document with `.lab/.managed/templates/final-report.md` and the managed table summary with `.lab/.managed/templates/main-tables.md`.
155
158
  - Keep failed attempts and limitations visible.
156
- - Update `.lab/context/state.md` and `.lab/context/evidence-index.md` with report-level handoff notes.
159
+ - Update `.lab/context/mission.md`, `.lab/context/eval-protocol.md`, `.lab/context/state.md`, and `.lab/context/evidence-index.md` with report-level handoff notes.
160
+ - If canonical context is still skeletal, hydrate the smallest trustworthy version from frozen artifacts before finalizing the report.
161
+ - If collaborator-critical fields remain missing after hydration, downgrade to an `artifact-anchored interim report` instead of presenting a final collaborator-ready report.
157
162
 
158
163
  ### `/lab:write`
159
164
 
@@ -24,6 +24,8 @@
24
24
 
25
25
  ## Context Write Set
26
26
 
27
+ - `.lab/context/mission.md`
28
+ - `.lab/context/eval-protocol.md`
27
29
  - `.lab/context/state.md`
28
30
  - `.lab/context/decisions.md`
29
31
  - `.lab/context/data-decisions.md`
@@ -63,6 +65,8 @@
63
65
  - keep the session alive while the current rung is running
64
66
  - write the current rung, watch target, and next rung to `.lab/context/auto-status.md`
65
67
  - Reuse the existing `/lab:run`, `/lab:iterate`, `/lab:review`, `/lab:report`, and optional `/lab:write` contracts instead of inventing a parallel workflow.
68
+ - If the loop is about to reach `report` while `.lab/context/mission.md` or `.lab/context/eval-protocol.md` is still skeletal, hydrate the smallest trustworthy canonical version from frozen artifacts and approved context before drafting the report.
69
+ - If hydration still leaves collaborator-critical fields blank, force `report` to emit an `artifact-anchored interim report` instead of a collaborator-ready final report.
66
70
  - Enforce stage contracts, not just exit codes:
67
71
  - `run` and `iterate` must change persistent outputs under `results_root`
68
72
  - `review` must update canonical review context
@@ -3,6 +3,7 @@
3
3
  ## Required Output
4
4
 
5
5
  - one-sentence problem statement
6
+ - why the problem matters in plain language
6
7
  - failure case
7
8
  - idea classification
8
9
  - contribution category
@@ -18,6 +19,7 @@
18
19
  - critique before convergence
19
20
  - minimum viable experiment
20
21
  - explicit approval gate before `/lab:spec`
22
+ - canonical mission context updated with the approved problem, importance, failure case, and direction
21
23
 
22
24
  ## Evidence Discipline
23
25
 
@@ -60,3 +62,4 @@
60
62
  - State why the target problem matters before talking about the method.
61
63
  - Compare against existing methods explicitly, not by vague novelty language.
62
64
  - The three meaningful points should each fit in one direct sentence.
65
+ - Do not leave `.lab/context/mission.md` as an empty template after convergence; write the approved problem, why it matters, the current benchmark scope, and the approved direction back into canonical context.
@@ -29,6 +29,7 @@ Declare and keep fixed:
29
29
  - `.lab/context/decisions.md`
30
30
  - `.lab/context/evidence-index.md`
31
31
  - `.lab/context/open-questions.md`
32
+ - `.lab/context/eval-protocol.md`
32
33
 
33
34
  ## Per-Round Output
34
35
 
@@ -62,6 +63,7 @@ If the loop stops without success, record:
62
63
  - Do not accumulate long-lived results under `.lab/changes/<change-id>/runs`.
63
64
  - Do not change metric definitions, baseline semantics, or comparison implementations unless the approved evaluation protocol records both their sources and any deviations.
64
65
  - When you change ladders, sample sizes, or promotion gates, keep the resulting logic anchored to the source-backed evaluation protocol instead of ad-hoc chat reasoning.
66
+ - Keep `.lab/context/eval-protocol.md` synchronized with the active benchmark scope, ladder gates, source-backed metric definitions, and any accepted implementation deviations instead of leaving it as a stale template.
65
67
 
66
68
  ## Interaction Contract
67
69
 
@@ -2,7 +2,11 @@
2
2
 
3
3
  ## Required Output
4
4
 
5
+ - report status: collaborator-ready or artifact-anchored interim
5
6
  - reader summary for the user
7
+ - problem and background in plain language
8
+ - dataset scene notes in plain language
9
+ - contribution summary
6
10
  - method overview
7
11
  - selected metrics summary
8
12
  - plain-language metric guide
@@ -17,6 +21,7 @@
17
21
  - failed attempts
18
22
  - limitations
19
23
  - next steps
24
+ - artifact status kept separate from validated findings
20
25
 
21
26
  ## Context Read Set
22
27
 
@@ -30,6 +35,8 @@
30
35
 
31
36
  ## Context Write Set
32
37
 
38
+ - `.lab/context/mission.md`
39
+ - `.lab/context/eval-protocol.md`
33
40
  - `.lab/context/state.md`
34
41
  - `.lab/context/evidence-index.md`
35
42
 
@@ -43,8 +50,15 @@
43
50
  - Explain the selected primary and secondary metrics in plain language for the user: what each metric measures, whether higher or lower is better, and whether it is a main result metric or only a health/support metric.
44
51
  - If coverage, completeness, confidence, or similar health metrics appear, explicitly say that they describe experimental reliability rather than the main scientific effect.
45
52
  - Pull the core background references, method or baseline references, and metric references out of the approved evaluation protocol instead of hiding them in `.lab/context/*`.
53
+ - Pull the approved method name and contribution bullets out of `.lab/context/terminology-lock.md` when that framing context exists; do not silently drop them from the collaborator-facing report.
54
+ - Explain the method overview in collaborator language: what the method roughly does, what changed relative to the closest prior work or strongest baseline, what those prior methods do, and why they remain insufficient for the approved claim.
55
+ - When citing prior work or baselines in the method overview, include only the few anchor references a collaborator needs, and summarize each anchor's role and limitation in one short line.
46
56
  - Report only the few references a collaborator needs to orient themselves quickly; do not turn `report.md` into a full bibliography dump.
47
57
  - If the report depends on a deviation from an original metric or implementation, state that deviation explicitly instead of smoothing it over.
58
+ - Before drafting the report, inspect `.lab/context/mission.md` and `.lab/context/eval-protocol.md` for skeletal template fields.
59
+ - If either canonical context file is still skeletal, hydrate the smallest trustworthy version from frozen result artifacts, dataset decisions, the evidence index, and prior approved context, then write the hydrated version back before finalizing the report.
60
+ - If collaborator-critical fields still remain missing after hydration, downgrade the output to an `artifact-anchored interim report` instead of presenting it as a final collaborator-ready report.
61
+ - Do not mix workflow deliverable status, rerun ids, or manuscript skeleton status into validated scientific findings; keep those in a separate artifact-status section.
48
62
  - If `.lab/config/workflow.json` sets the workflow language to Chinese, write `report.md` and `<deliverables_root>/main-tables.md` in Chinese unless a file path, code identifier, or literal metric name must remain unchanged.
49
63
  - Prefer conservative interpretation over marketing language.
50
64
  - Leave a clear handoff path into `/lab:write` with evidence links that section drafts can cite.
@@ -54,6 +68,7 @@
54
68
  - Start with a concise summary of the campaign outcome, the selected primary and secondary metrics, the strongest supported claim, and the biggest reporting risk.
55
69
  - Proactively deliver a user-readable plain-language summary when the stage is reached from `/lab:auto`; do not wait for a separate follow-up request asking what the metrics or tables mean.
56
70
  - Treat `report.md` as a user-facing artifact rather than an internal dump. Prefer plain-language explanations before jargon, and explain each metric the first time it matters.
71
+ - Treat contribution bullets as collaborator-facing claim summaries, not as internal TODOs; tie each one to the current evidence boundary.
57
72
  - If a missing assumption would change report interpretation, ask one clarifying question at a time.
58
73
  - If there are multiple defensible report framings, present 2-3 approaches with trade-offs and recommend the most evidence-faithful framing before writing.
59
74
  - Keep an approval gate when the reporting frame would materially affect what the paper later claims.
@@ -6,6 +6,7 @@
6
6
  - run registry entry
7
7
  - normalized evaluation summary
8
8
  - validation result for the normalized summary
9
+ - canonical evaluation context initialized or refined when the active protocol is still skeletal
9
10
 
10
11
  ## Context Read Set
11
12
 
@@ -19,6 +20,7 @@
19
20
 
20
21
  - `.lab/context/state.md`
21
22
  - `.lab/context/evidence-index.md`
23
+ - `.lab/context/eval-protocol.md`
22
24
 
23
25
  ## Constraints
24
26
 
@@ -26,6 +28,7 @@
26
28
  - Fail fast on data, environment, or metric wiring problems.
27
29
  - Tie the run to the approved evaluation protocol, not just an ad-hoc chat goal.
28
30
  - Do not invent metric definitions, baseline behavior, or comparison implementations from memory; anchor them to the approved evaluation protocol and its recorded sources.
31
+ - If `.lab/context/eval-protocol.md` is still skeletal, write the smallest trustworthy version of the current evaluation objective, metric set, ladder, and source-backed implementation notes before treating the run as the new protocol anchor.
29
32
  - Record the exact launch command and output location.
30
33
  - Write durable run outputs, logs, and checkpoints under `results_root`.
31
34
  - Write figures or plots under `figures_root`.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "superlab",
3
- "version": "0.1.18",
3
+ "version": "0.1.20",
4
4
  "description": "Strict /lab research workflow installer for Codex and Claude",
5
5
  "keywords": [
6
6
  "codex",