superlab 0.1.18 → 0.1.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/bin/superlab.cjs CHANGED
@@ -12,6 +12,8 @@ const {
12
12
  } = require("../lib/install.cjs");
13
13
  const {
14
14
  archiveContext,
15
+ collaboratorEvalIssues,
16
+ collaboratorMissionIssues,
15
17
  pruneContext,
16
18
  refreshContext,
17
19
  } = require("../lib/context.cjs");
@@ -788,6 +790,8 @@ function printDoctor(options) {
788
790
  const templateIssues = validatePaperTemplateRoot(options.targetDir, config);
789
791
  const dataDecisionIssues = validateDataDecisions(options.targetDir);
790
792
  const evalProtocolIssues = validateEvalProtocol(options.targetDir);
793
+ const missionContextIssues = collaboratorMissionIssues(options.targetDir);
794
+ const collaboratorProtocolIssues = collaboratorEvalIssues(options.targetDir);
791
795
  const rootIssues = validateProjectRoots(options.targetDir, config);
792
796
  const autoStatus = getAutoStatus({ targetDir: options.targetDir });
793
797
  const autoIssues = autoStatus.issues;
@@ -806,6 +810,8 @@ function printDoctor(options) {
806
810
  templateIssues.length > 0 ||
807
811
  dataDecisionIssues.length > 0 ||
808
812
  evalProtocolIssues.length > 0 ||
813
+ missionContextIssues.length > 0 ||
814
+ collaboratorProtocolIssues.length > 0 ||
809
815
  rootIssues.length > 0 ||
810
816
  autoIssues.length > 0
811
817
  ) {
@@ -820,6 +826,8 @@ function printDoctor(options) {
820
826
  templateIssues,
821
827
  dataDecisionIssues,
822
828
  evalProtocolIssues,
829
+ missionContextIssues,
830
+ collaboratorProtocolIssues,
823
831
  rootIssues,
824
832
  autoIssues
825
833
  );
@@ -253,7 +253,7 @@ async function startAutoMode({ targetDir, now = new Date() }) {
253
253
  const maxFailures = parseInteger(mode.maxFailures, 0);
254
254
  const maxIterations = parseInteger(mode.maxIterations, 1);
255
255
  const requiredArtifact = resolveRequiredArtifact(targetDir, mode.requiredTerminalArtifact);
256
- const frozenCoreSnapshot = snapshotFrozenCore(targetDir, mode.frozenCore);
256
+ let frozenCoreSnapshot = snapshotFrozenCore(targetDir, mode.frozenCore);
257
257
  const { loopStages, finalStages } = splitAutoStages(mode.allowedStages);
258
258
  const executedStages = [];
259
259
  let failureCount = 0;
@@ -363,13 +363,14 @@ async function startAutoMode({ targetDir, now = new Date() }) {
363
363
  nextRung,
364
364
  decision: rungId ? `completed rung ${rungId}` : `completed stage ${stage}`,
365
365
  });
366
- refreshContext({ targetDir });
367
-
368
366
  const frozenCoreChanges = detectFrozenCoreChanges(frozenCoreSnapshot);
369
367
  if (frozenCoreChanges.length > 0) {
370
368
  failAutoMode(`frozen core changed: ${frozenCoreChanges.join(", ")}`);
371
369
  }
372
370
 
371
+ refreshContext({ targetDir });
372
+ frozenCoreSnapshot = snapshotFrozenCore(targetDir, mode.frozenCore);
373
+
373
374
  const stopCheck = await runCheckCommand({
374
375
  targetDir,
375
376
  label: `stop check after ${rungId || stage}`,
@@ -437,12 +438,13 @@ async function startAutoMode({ targetDir, now = new Date() }) {
437
438
  decision: `promotion policy matched after ${label}`,
438
439
  });
439
440
  promotionApplied = true;
440
- refreshContext({ targetDir });
441
- verifyPromotionWriteback(targetDir, promotionSnapshot);
442
441
  const frozenCoreChangesAfterPromotion = detectFrozenCoreChanges(frozenCoreSnapshot);
443
442
  if (frozenCoreChangesAfterPromotion.length > 0) {
444
443
  failAutoMode(`frozen core changed: ${frozenCoreChangesAfterPromotion.join(", ")}`);
445
444
  }
445
+ refreshContext({ targetDir });
446
+ verifyPromotionWriteback(targetDir, promotionSnapshot);
447
+ frozenCoreSnapshot = snapshotFrozenCore(targetDir, mode.frozenCore);
446
448
  };
447
449
 
448
450
  if (evalProtocol.experimentRungs.length > 0) {
package/lib/context.cjs CHANGED
@@ -2,6 +2,64 @@ const fs = require("node:fs");
2
2
  const path = require("node:path");
3
3
  const { parseEvalProtocol } = require("./eval_protocol.cjs");
4
4
 
5
// Normalized (trimmed, lower-cased) values that count as "not filled in yet".
const PLACEHOLDER_VALUES = new Set(["", "tbd", "none", "待补充", "无"]);

// Mission fields that must hold a real value before collaborator-facing reporting.
// Each tuple is [English label, Chinese label]; the English label doubles as the
// field name shown in diagnostics.
const MISSION_COLLABORATOR_FIELDS = [
  ["One-sentence problem", "一句话问题"],
  ["Why it matters", "为什么重要"],
  ["Primary metric", "主指标"],
  ["Success threshold", "成功阈值"],
  ["Dataset or benchmark scope", "数据集或 benchmark 范围"],
  ["Approved direction", "已批准方向"],
].map(([name, zhLabel]) => ({ name, labels: [name, zhLabel] }));
14
// Evaluation-protocol fields that must hold a real value before collaborator-facing
// reporting. Each tuple is [English label, Chinese label]; the English label doubles
// as the field name shown in diagnostics.
const EVAL_COLLABORATOR_FIELDS = [
  ["Primary evaluation objective", "主评估目标"],
  ["Primary metrics", "主指标"],
  ["Secondary metrics", "次级指标"],
  ["Table plan", "主表计划"],
  ["Metric glossary", "指标释义"],
  ["Background sources", "背景来源"],
  ["Method and baseline source papers", "方法与基线来源论文"],
  ["Method and baseline implementation source", "方法与基线实现来源"],
  ["Metric source papers", "指标来源论文"],
  ["Required output artifacts", "必要输出工件"],
].map(([name, zhLabel]) => ({ name, labels: [name, zhLabel] }));
32
// Maps an internal report key to the labels (English first, then Chinese variants)
// under which that value may appear in the collaborator-facing report.
// Each tuple is [key, ...labels].
const REPORT_FIELDS = Object.fromEntries(
  [
    ["problem", "Research problem in plain language", "研究问题白话解释", "研究问题"],
    ["whyItMatters", "Why this problem matters", "为什么这个问题重要"],
    ["setting", "What setting or workflow this report is actually about", "这份报告实际对应的场景或流程"],
    ["primaryMetrics", "Primary metrics", "主指标"],
    ["secondaryMetrics", "Secondary metrics", "次级指标"],
    ["requiredTerminalEvidence", "Required terminal evidence", "必要终局证据"],
    ["metricGuidePrimary", "Primary metric plain-language explanation", "主指标白话解释"],
    ["metricGuideSecondary", "Secondary metric plain-language explanation", "次级指标白话解释"],
    [
      "metricGuideSupport",
      "Health or support metrics and why they are not the main claim",
      "健康度或支持性指标以及它们为什么不是主 claim",
    ],
    [
      "backgroundSources",
      "Most important background papers or benchmark references",
      "最重要的背景论文或 benchmark 参考",
    ],
    ["backgroundAnchors", "Why these are the right background anchors", "为什么这些是合适的背景锚点"],
    ["methodBasis", "Our method source or implementation basis", "我们的方法来源或实现基础"],
    ["baselineSourcePapers", "Baseline and comparison source papers", "基线与对比方法来源论文"],
    [
      "baselineImplementationSources",
      "Baseline and comparison implementation sources",
      "基线与对比方法实现来源",
    ],
    ["metricSourcePapers", "Metric source papers", "指标来源论文"],
    ["metricImplementationSource", "Metric implementation source", "指标实现来源"],
    ["metricDeviation", "Deviation from original implementation", "与原始实现的偏差"],
    ["datasets", "Datasets", "数据集"],
    ["baselines", "Baselines", "基线"],
    ["metrics", "Metrics", "指标"],
    ["finalPerformanceSummary", "Final performance summary", "最终表现总结"],
    ["tableCoverage", "Table coverage", "表格覆盖范围"],
  ].map(([key, ...labels]) => [key, labels])
);
62
+
5
63
  function contextFile(targetDir, name) {
6
64
  return path.join(targetDir, ".lab", "context", name);
7
65
  }
@@ -58,6 +116,278 @@ function joinNonEmpty(parts, separator = "; ") {
58
116
  return parts.filter(Boolean).join(separator);
59
117
  }
60
118
 
119
/**
 * Tell whether a field value carries real content rather than a placeholder.
 *
 * @param {string} [value] - Raw field value; falsy values are treated as empty.
 * @returns {boolean} `false` when the trimmed, lower-cased value is one of
 *   PLACEHOLDER_VALUES, `true` otherwise.
 */
function isMeaningful(value) {
  const normalized = (value || "").trim().toLowerCase();
  if (PLACEHOLDER_VALUES.has(normalized)) {
    return false;
  }
  return true;
}
122
+
123
/**
 * Load `.lab/config/workflow.json` from the project, best-effort.
 *
 * A missing file, an unreadable file, or invalid JSON all yield an empty config
 * instead of an error. Valid JSON that is not a plain object (`null`, an array,
 * a string, a number) is also normalized to `{}` so callers can read properties
 * off the result without guarding against `null`.
 *
 * @param {string} targetDir - Project root directory.
 * @returns {object} The parsed config object, or `{}` when none is usable.
 */
function readWorkflowConfig(targetDir) {
  const configPath = path.join(targetDir, ".lab", "config", "workflow.json");
  if (!fs.existsSync(configPath)) {
    return {};
  }
  let parsed;
  try {
    parsed = JSON.parse(fs.readFileSync(configPath, "utf8"));
  } catch {
    // Deliberately best-effort: a broken config must not block context refresh.
    return {};
  }
  const isPlainObject = parsed !== null && typeof parsed === "object" && !Array.isArray(parsed);
  return isPlainObject ? parsed : {};
}
134
+
135
/**
 * Resolve a configured path against the project root.
 *
 * @param {string} targetDir - Project root directory.
 * @param {*} configuredPath - Path taken from configuration.
 * @returns {string} The resolved absolute path, or `""` when `configuredPath`
 *   is empty or not a string.
 */
function resolveProjectPath(targetDir, configuredPath) {
  const usable = typeof configuredPath === "string" && configuredPath !== "";
  // NOTE(review): path.resolve lets an absolute or `..`-laden value point outside
  // targetDir — confirm that is acceptable for configured roots.
  return usable ? path.resolve(targetDir, configuredPath) : "";
}
141
+
142
/**
 * Compute where the collaborator-facing deliverables live.
 *
 * The root comes from `deliverables_root` in the workflow config and falls back
 * to `docs/research`. The fallback also applies when the configured value is not
 * a non-empty string; otherwise the root would resolve to `""` and the two file
 * paths would silently become relative to the process working directory.
 *
 * @param {string} targetDir - Project root directory.
 * @returns {{deliverablesRoot: string, reportPath: string, mainTablesPath: string}}
 *   Absolute root plus the `report.md` and `main-tables.md` paths inside it.
 */
function getCollaboratorDeliverablePaths(targetDir) {
  const config = readWorkflowConfig(targetDir);
  // Guard against a null/primitive config so this block is safe on its own.
  const configuredRoot = config ? config.deliverables_root : undefined;
  const hasUsableRoot = typeof configuredRoot === "string" && configuredRoot !== "";
  const deliverablesRoot = resolveProjectPath(targetDir, hasUsableRoot ? configuredRoot : "docs/research");
  return {
    deliverablesRoot,
    reportPath: path.join(deliverablesRoot, "report.md"),
    mainTablesPath: path.join(deliverablesRoot, "main-tables.md"),
  };
}
151
+
152
/**
 * Report whether the project already has a collaborator-facing deliverable.
 *
 * @param {string} targetDir - Project root directory.
 * @returns {boolean} `true` when `report.md` or `main-tables.md` exists under
 *   the deliverables root.
 */
function hasCollaboratorFacingDeliverables(targetDir) {
  const deliverables = getCollaboratorDeliverablePaths(targetDir);
  return [deliverables.reportPath, deliverables.mainTablesPath].some((candidate) => fs.existsSync(candidate));
}
156
+
157
/**
 * List the required fields whose value in `text` is absent or a placeholder.
 *
 * @param {string} text - Document text handed to `extractValue`.
 * @param {Array<{name: string, labels: string[]}>} fields - Required fields.
 * @returns {string[]} Names of the fields that are not meaningfully filled in,
 *   in the order they appear in `fields`.
 */
function missingCollaboratorFields(text, fields) {
  const missingNames = [];
  for (const field of fields) {
    const value = extractValue(text, field.labels);
    if (!isMeaningful(value)) {
      missingNames.push(field.name);
    }
  }
  return missingNames;
}
160
+
161
/**
 * Check whether `mission.md` is filled in well enough for collaborator-facing
 * reporting.
 *
 * Nothing is reported when the project has no collaborator-facing deliverables
 * or when the mission file yields no text.
 *
 * @param {string} targetDir - Project root directory.
 * @returns {string[]} Empty when there is nothing to report, otherwise a single
 *   message naming the missing fields.
 */
function collaboratorMissionIssues(targetDir) {
  const missionText = hasCollaboratorFacingDeliverables(targetDir)
    ? readFileIfExists(contextFile(targetDir, "mission.md"))
    : "";
  if (!missionText) {
    return [];
  }
  const missingNames = missingCollaboratorFields(missionText, MISSION_COLLABORATOR_FIELDS);
  if (missingNames.length === 0) {
    return [];
  }
  return [`mission context is still skeletal for collaborator-facing reporting: ${missingNames.join(", ")}`];
}
174
+
175
/**
 * Check whether `eval-protocol.md` is filled in well enough for
 * collaborator-facing reporting.
 *
 * Nothing is reported when the project has no collaborator-facing deliverables
 * or when the protocol file yields no text.
 *
 * @param {string} targetDir - Project root directory.
 * @returns {string[]} Empty when there is nothing to report, otherwise a single
 *   message naming the missing fields.
 */
function collaboratorEvalIssues(targetDir) {
  const protocolText = hasCollaboratorFacingDeliverables(targetDir)
    ? readFileIfExists(contextFile(targetDir, "eval-protocol.md"))
    : "";
  if (!protocolText) {
    return [];
  }
  const missingNames = missingCollaboratorFields(protocolText, EVAL_COLLABORATOR_FIELDS);
  if (missingNames.length === 0) {
    return [];
  }
  return [`evaluation protocol is still skeletal for collaborator-facing reporting: ${missingNames.join(", ")}`];
}
188
+
189
/**
 * Look up a report value by its internal key.
 *
 * @param {string} reportText - Text of the collaborator-facing report.
 * @param {string} key - A key of REPORT_FIELDS; unknown keys search with no labels.
 * @returns {*} Whatever `extractValue` returns for the key's label list.
 */
function extractReportValue(reportText, key) {
  const labels = REPORT_FIELDS[key];
  return extractValue(reportText, labels ? labels : []);
}
192
+
193
/**
 * Pick the first meaningful value, preferring the existing one.
 *
 * @param {string} existingValue - Value already present in the canonical file.
 * @param {...string} candidates - Fallback values, in priority order.
 * @returns {string} `existingValue` when it is meaningful, otherwise the first
 *   meaningful candidate, otherwise `""`.
 */
function mergePreferred(existingValue, ...candidates) {
  const ordered = [existingValue, ...candidates];
  const winner = ordered.find((value) => isMeaningful(value));
  // isMeaningful never accepts undefined, so undefined here means "nothing matched".
  return winner === undefined ? "" : winner;
}
204
+
205
/**
 * Return the first metric named in a separator-delimited list.
 *
 * Splits on ASCII `;` / `,` and on the full-width `；`, `，` and `、` used in
 * Chinese-language context files. Surrounding whitespace is trimmed and empty
 * segments are skipped. Non-string input is coerced with `String()` instead of
 * throwing, so an array of metrics yields its first entry.
 *
 * @param {*} metrics - Metric list, e.g. `"accuracy; f1"`.
 * @returns {string} The first non-empty metric, or `""` when there is none.
 */
function firstMetric(metrics) {
  const segments = String(metrics || "").split(/[;,；，、]/);
  for (const segment of segments) {
    const trimmed = segment.trim();
    if (trimmed) {
      return trimmed;
    }
  }
  return "";
}
211
+
212
/**
 * Render the canonical mission document from a flat field map.
 *
 * The document has a title and four sections of `- label: value` bullets. Any
 * falsy field is rendered as the language's placeholder ("待补充" for `zh`,
 * "TBD" otherwise). The text ends with a single newline.
 *
 * @param {string} lang - `"zh"` selects the Chinese layout; anything else English.
 * @param {object} fields - Field values keyed by name (problem, whyItMatters, ...).
 * @returns {string} The full markdown text of the mission file.
 */
function buildMissionContextText(lang, fields) {
  const isZh = lang === "zh";
  const placeholder = isZh ? "待补充" : "TBD";

  // Each section: [English heading, Chinese heading, [[field key, English label, Chinese label], ...]].
  const sections = [
    [
      "## Core Problem",
      "## 核心问题",
      [
        ["problem", "One-sentence problem", "一句话问题"],
        ["whyItMatters", "Why it matters", "为什么重要"],
        ["targetFailureCase", "Target failure case", "目标失败场景"],
      ],
    ],
    [
      "## Success Criteria",
      "## 成功标准",
      [
        ["primaryMetric", "Primary metric", "主指标"],
        ["successThreshold", "Success threshold", "成功阈值"],
        ["requiredBaselineComparison", "Required baseline comparison", "必须对比的 baseline"],
        ["minimumEvidenceRequirement", "Minimum evidence requirement", "最小证据要求"],
      ],
    ],
    [
      "## Frozen Boundaries",
      "## 冻结边界",
      [
        ["datasetScope", "Dataset or benchmark scope", "数据集或 benchmark 范围"],
        ["splitPolicy", "Split policy", "切分策略"],
        ["evaluationProtocol", "Evaluation protocol", "评估协议"],
        ["hardConstraints", "Hard constraints", "硬约束"],
      ],
    ],
    [
      "## Current Status",
      "## 当前状态",
      [
        ["approvedDirection", "Approved direction", "已批准方向"],
        ["currentOwner", "Current owner or session", "当前 owner 或会话"],
        ["latestStage", "Latest stage to update this mission", "最近一次允许更新 mission 的阶段"],
        ["hydrationProvenance", "Hydration provenance", "回填来源"],
        ["collaboratorReadyStatus", "Collaborator-ready status", "协作者可读状态"],
      ],
    ],
  ];

  const renderedSections = sections.map(([enHeading, zhHeading, rows]) => {
    const bullets = rows.map(
      ([key, enLabel, zhLabel]) => `- ${isZh ? zhLabel : enLabel}: ${fields[key] || placeholder}`
    );
    return `${isZh ? zhHeading : enHeading}\n\n${bullets.join("\n")}`;
  });

  const title = isZh ? "# 研究主线" : "# Research Mission";
  return `${title}\n\n${renderedSections.join("\n\n")}\n`;
}
277
+
278
/**
 * Render the canonical evaluation-protocol document.
 *
 * The document has a title, an intro paragraph, four sections of
 * `- label: value` bullets, then one `### Rung: <id>` block per rung. Falsy
 * values are rendered as the language's placeholder ("待补充" for `zh`, "TBD"
 * otherwise). With no rungs the text still ends with the blank separator lines
 * that would have preceded the rung blocks.
 *
 * @param {string} lang - `"zh"` selects the Chinese layout; anything else English.
 * @param {object} fields - Protocol field values keyed by name.
 * @param {Array<object>} [rungs] - Rung records (id, stage, goal, command, watch,
 *   gate, onPass, onFail, onStop); a falsy value means no rungs.
 * @returns {string} The full markdown text of the evaluation-protocol file.
 */
function buildEvalProtocolText(lang, fields, rungs) {
  const isZh = lang === "zh";
  const placeholder = isZh ? "待补充" : "TBD";
  const localized = (en, zh) => (isZh ? zh : en);
  const bullet = (label, value) => `- ${label}: ${value || placeholder}`;

  // [rung key, English label, Chinese label]
  const rungRows = [
    ["stage", "Stage", "阶段"],
    ["goal", "Goal", "目标"],
    ["command", "Command", "命令"],
    ["watch", "Watch", "监视目标"],
    ["gate", "Gate", "gate 命令"],
    ["onPass", "On pass", "通过后"],
    ["onFail", "On fail", "失败后"],
    ["onStop", "On stop", "停止后"],
  ];
  const rungBlocks = (rungs || [])
    .map((rung) => {
      const lines = rungRows.map(([key, en, zh]) => bullet(localized(en, zh), rung[key]));
      return `### Rung: ${rung.id}\n\n${lines.join("\n")}`;
    })
    .join("\n\n");

  // Each section: [English heading, Chinese heading, [[field key, English label, Chinese label], ...]].
  // The "Gate Ladder" heading is the same text in both languages.
  const sections = [
    [
      "## Primary Evaluation Objective",
      "## 主评估目标",
      [
        ["primaryEvaluationObjective", "Primary evaluation objective", "主评估目标"],
        ["primaryMetrics", "Primary metrics", "主指标"],
        ["secondaryMetrics", "Secondary metrics", "次级指标"],
        ["requiredTerminalEvidence", "Required terminal evidence", "必要终局证据"],
      ],
    ],
    [
      "## Table Plan",
      "## 主表计划",
      [
        ["tablePlan", "Table plan", "主表计划"],
        ["requiredClaimsPerTable", "Required claims per table", "每张表必须支撑的 claims"],
      ],
    ],
    [
      "## Metric Glossary",
      "## 指标释义",
      [
        ["metricGlossary", "Metric glossary", "指标释义"],
        ["hydrationProvenance", "Hydration provenance", "回填来源"],
        ["backgroundSources", "Background sources", "背景来源"],
        ["methodAndBaselineSourcePapers", "Method and baseline source papers", "方法与基线来源论文"],
        [
          "methodAndBaselineImplementationSource",
          "Method and baseline implementation source",
          "方法与基线实现来源",
        ],
        ["metricSourcePapers", "Metric source papers", "指标来源论文"],
        ["metricImplementationSource", "Metric implementation source", "指标实现来源"],
        ["comparisonSourcePapers", "Comparison source papers", "对比方法来源论文"],
        ["comparisonImplementationSource", "Comparison implementation source", "对比方法实现来源"],
        ["deviationFromOriginalImplementation", "Deviation from original implementation", "与原始实现的偏差"],
      ],
    ],
    [
      "## Gate Ladder",
      "## Gate Ladder",
      [
        ["experimentLadder", "Experiment ladder", "实验阶梯"],
        ["benchmarkLadder", "Benchmark ladder", "benchmark 阶梯"],
        ["comparisonGate", "Comparison gate", "对比方法 gate"],
        ["promotionGate", "Promotion gate", "升格 gate"],
        ["minimumSampleSizes", "Minimum sample sizes", "最小样本量"],
        ["requiredOutputArtifacts", "Required output artifacts", "必要输出工件"],
      ],
    ],
  ];
  const renderedSections = sections
    .map(([enHeading, zhHeading, rows]) => {
      const lines = rows.map(([key, en, zh]) => bullet(localized(en, zh), fields[key]));
      return `${localized(enHeading, zhHeading)}\n\n${lines.join("\n")}`;
    })
    .join("\n\n");

  const title = localized("# Evaluation Protocol", "# 评估协议");
  const intro = localized(
    "Use this file to define the paper-facing evaluation target, table plan, gates, and benchmark ladder shared by `/lab:run`, `/lab:iterate`, `/lab:auto`, and `/lab:report`.",
    "用这份文件定义 `/lab:run`、`/lab:iterate`、`/lab:auto` 和 `/lab:report` 共用的论文导向评估目标、主表计划、gate 与 benchmark ladder。"
  );

  return `${title}\n\n${intro}\n\n${renderedSections}\n\n${rungBlocks}\n`;
}
390
+
61
391
  function extractClaim(text) {
62
392
  const blocks = text
63
393
  .split(/\n(?=\d+\.\s)/)
@@ -76,6 +406,264 @@ function labelValue(text, englishLabels, chineseLabels = []) {
76
406
  return extractValue(text, [...englishLabels, ...chineseLabels]);
77
407
  }
78
408
 
409
/**
 * List the artifacts that can feed canonical-context hydration.
 *
 * Deliverables are included when they exist on disk, as paths relative to the
 * project root; context files are included when `readFileIfExists` returns a
 * truthy value for them, under their fixed `.lab/context/...` path.
 *
 * @param {string} targetDir - Project root directory.
 * @returns {string[]} Source paths in order: report, main tables,
 *   data-decisions, state, evidence-index.
 */
function collectHydrationSources(targetDir) {
  const { reportPath, mainTablesPath } = getCollaboratorDeliverablePaths(targetDir);
  const sources = [];

  for (const deliverable of [reportPath, mainTablesPath]) {
    const relativePath = fs.existsSync(deliverable) ? path.relative(targetDir, deliverable) : "";
    if (relativePath) {
      sources.push(relativePath);
    }
  }

  for (const name of ["data-decisions.md", "state.md", "evidence-index.md"]) {
    if (readFileIfExists(contextFile(targetDir, name))) {
      sources.push(`.lab/context/${name}`);
    }
  }

  return sources;
}
419
+
420
/**
 * Back-fill `mission.md` from the report and the other context files.
 *
 * Runs only when collaborator-facing deliverables exist. Values already present
 * in the mission win; placeholder or missing values are filled from the first
 * meaningful fallback source. The file is regenerated from the mission template
 * and written only when the text differs from what was read.
 *
 * @param {string} targetDir - Project root directory.
 * @returns {boolean} `true` when `mission.md` was rewritten, `false` otherwise.
 */
function hydrateMissionContext(targetDir) {
  if (!hasCollaboratorFacingDeliverables(targetDir)) {
    return false;
  }

  const lang = readWorkflowLanguage(targetDir);
  const readContext = (name) => readFileIfExists(contextFile(targetDir, name));
  const missionText = readContext("mission.md");
  const stateText = readContext("state.md");
  const evidenceText = readContext("evidence-index.md");
  const dataDecisions = readContext("data-decisions.md");
  const reportText = readFileIfExists(getCollaboratorDeliverablePaths(targetDir).reportPath);
  const evalProtocol = parseEvalProtocol(targetDir);
  const hydrationSources = collectHydrationSources(targetDir);
  const autoOutcomeText = readContext("auto-outcome.md");

  const fromMission = (...labels) => extractValue(missionText, labels);
  const fromReport = (key) => extractReportValue(reportText, key);

  const fields = {
    // Core problem
    problem: mergePreferred(
      fromMission("One-sentence problem", "一句话问题"),
      fromReport("problem"),
      // NOTE(review): "Current objective" is listed twice in the original label list.
      extractValue(stateText, ["Current objective", "当前目标", "Current objective"])
    ),
    whyItMatters: mergePreferred(fromMission("Why it matters", "为什么重要"), fromReport("whyItMatters")),
    targetFailureCase: fromMission("Target failure case", "目标失败场景"),

    // Success criteria
    primaryMetric: mergePreferred(
      fromMission("Primary metric", "主指标"),
      evalProtocol.primaryMetrics,
      fromReport("primaryMetrics"),
      // NOTE(review): this candidate is only reached when the full metric list above
      // was itself a placeholder, so it looks like it can never win — confirm intent.
      firstMetric(evalProtocol.primaryMetrics)
    ),
    successThreshold: mergePreferred(
      fromMission("Success threshold", "成功阈值"),
      extractValue(autoOutcomeText, ["Terminal goal target", "终止目标目标值"])
    ),
    requiredBaselineComparison: fromMission("Required baseline comparison", "必须对比的 baseline"),
    minimumEvidenceRequirement: fromMission("Minimum evidence requirement", "最小证据要求"),

    // Frozen boundaries
    datasetScope: mergePreferred(
      fromMission("Dataset or benchmark scope", "数据集或 benchmark 范围"),
      extractValue(dataDecisions, ["Approved dataset package", "Approved datasets", "已批准数据集包", "已批准数据集"]),
      fromReport("datasets")
    ),
    splitPolicy: fromMission("Split policy", "切分策略"),
    evaluationProtocol: mergePreferred(fromMission("Evaluation protocol", "评估协议"), ".lab/context/eval-protocol.md"),
    hardConstraints: fromMission("Hard constraints", "硬约束"),

    // Current status
    approvedDirection: mergePreferred(fromMission("Approved direction", "已批准方向"), extractClaim(evidenceText)),
    currentOwner: fromMission("Current owner or session", "当前 owner 或会话"),
    latestStage: mergePreferred(
      fromMission("Latest stage to update this mission", "最近一次允许更新 mission 的阶段"),
      extractValue(stateText, ["Active stage", "当前阶段", "Stage"])
    ),

    // Joining an empty list yields "", which the template renders as a placeholder.
    hydrationProvenance: hydrationSources.join("; "),
  };

  // The readiness label depends on a draft rendered without it, so render twice.
  const draftText = buildMissionContextText(lang, fields);
  const stillMissing = missingCollaboratorFields(draftText, MISSION_COLLABORATOR_FIELDS);
  fields.collaboratorReadyStatus = stillMissing.length === 0 ? "hydrated" : "artifact-anchored interim";

  const nextText = buildMissionContextText(lang, fields);
  if (nextText === missionText) {
    return false;
  }
  writeContextFile(targetDir, "mission.md", nextText);
  return true;
}
499
+
500
/**
 * Back-fill `eval-protocol.md` from the mission, the report and data decisions.
 *
 * Runs only when collaborator-facing deliverables exist. Values already present
 * in the parsed protocol win; placeholder or missing values are filled from the
 * first meaningful fallback. The file is regenerated from the protocol template
 * (including the parsed experiment rungs) and written only when the text differs
 * from `protocol.text`.
 *
 * @param {string} targetDir - Project root directory.
 * @returns {boolean} `true` when `eval-protocol.md` was rewritten, `false` otherwise.
 */
function hydrateEvalProtocol(targetDir) {
  if (!hasCollaboratorFacingDeliverables(targetDir)) {
    return false;
  }

  const lang = readWorkflowLanguage(targetDir);
  const protocol = parseEvalProtocol(targetDir);
  const { reportPath, mainTablesPath } = getCollaboratorDeliverablePaths(targetDir);
  const missionText = readFileIfExists(contextFile(targetDir, "mission.md"));
  const reportText = readFileIfExists(reportPath);
  const dataDecisions = readFileIfExists(contextFile(targetDir, "data-decisions.md"));
  const hydrationSources = collectHydrationSources(targetDir);

  // Project-relative paths of the deliverables that exist, "" otherwise.
  const reportRelative = fs.existsSync(reportPath) ? path.relative(targetDir, reportPath) : "";
  const mainTablesExists = fs.existsSync(mainTablesPath);
  const mainTablesRelative = mainTablesExists ? path.relative(targetDir, mainTablesPath) : "";
  const requiredOutputs = [reportRelative, mainTablesRelative, ".lab/context/evidence-index.md"]
    .filter(Boolean)
    .join(", ");

  const fromReport = (key) => extractReportValue(reportText, key);
  const metricGuide = joinNonEmpty(
    [fromReport("metricGuidePrimary"), fromReport("metricGuideSecondary"), fromReport("metricGuideSupport")],
    " | "
  );

  const fields = {
    // Primary evaluation objective
    primaryEvaluationObjective: mergePreferred(
      protocol.primaryEvaluationObjective,
      extractValue(missionText, ["One-sentence problem", "一句话问题"]),
      fromReport("problem")
    ),
    primaryMetrics: mergePreferred(
      protocol.primaryMetrics,
      extractValue(missionText, ["Primary metric", "主指标"]),
      fromReport("primaryMetrics")
    ),
    secondaryMetrics: mergePreferred(protocol.secondaryMetrics, fromReport("secondaryMetrics")),
    requiredTerminalEvidence: mergePreferred(
      protocol.requiredTerminalEvidence,
      fromReport("requiredTerminalEvidence"),
      requiredOutputs
    ),

    // Table plan
    tablePlan: mergePreferred(protocol.tablePlan, mainTablesExists ? `See ${mainTablesRelative}` : ""),
    requiredClaimsPerTable: protocol.requiredClaimsPerTable,

    // Metric glossary and provenance
    metricGlossary: mergePreferred(protocol.metricGlossary, metricGuide),
    hydrationProvenance: hydrationSources.join("; "),
    backgroundSources: mergePreferred(
      protocol.backgroundSources,
      fromReport("backgroundSources"),
      extractValue(dataDecisions, ["Papers that used the approved datasets", "使用过已批准数据集的论文", "使用过该数据集的论文"])
    ),
    methodAndBaselineSourcePapers: mergePreferred(
      protocol.methodAndBaselineSourcePapers,
      fromReport("baselineSourcePapers")
    ),
    methodAndBaselineImplementationSource: mergePreferred(
      protocol.methodAndBaselineImplementationSource,
      fromReport("baselineImplementationSources")
    ),
    metricSourcePapers: mergePreferred(protocol.metricSourcePapers, fromReport("metricSourcePapers")),
    metricImplementationSource: mergePreferred(
      protocol.metricImplementationSource,
      fromReport("metricImplementationSource")
    ),
    comparisonSourcePapers: mergePreferred(protocol.comparisonSourcePapers, fromReport("baselineSourcePapers")),
    comparisonImplementationSource: mergePreferred(
      protocol.comparisonImplementationSource,
      fromReport("baselineImplementationSources")
    ),
    deviationFromOriginalImplementation: mergePreferred(
      protocol.deviationFromOriginalImplementation,
      fromReport("metricDeviation")
    ),

    // Gate ladder: carried over from the parsed protocol as-is.
    experimentLadder: protocol.experimentLadder,
    benchmarkLadder: protocol.benchmarkLadder,
    comparisonGate: protocol.comparisonGate,
    promotionGate: protocol.promotionGate,
    minimumSampleSizes: protocol.minimumSampleSizes,
    requiredOutputArtifacts: mergePreferred(protocol.requiredOutputArtifacts, requiredOutputs),
  };

  const nextText = buildEvalProtocolText(lang, fields, protocol.experimentRungs);
  if (nextText === protocol.text) {
    return false;
  }
  writeContextFile(targetDir, "eval-protocol.md", nextText);
  return true;
}
605
+
606
/**
 * Summarize whether the canonical context supports collaborator-facing reporting.
 *
 * @param {string} targetDir - Project root directory.
 * @returns {{mode: string, readiness: string, reason: string, issues: string[]}}
 *   "collaborator-ready" when neither the mission nor the evaluation check
 *   reports an issue, otherwise "artifact-anchored interim" with the issues
 *   joined into `reason`.
 */
function getCollaboratorReportStatus(targetDir) {
  const issues = [...collaboratorMissionIssues(targetDir), ...collaboratorEvalIssues(targetDir)];

  if (issues.length === 0) {
    // NOTE(review): both checks return no issues when deliverables or context files
    // are absent, so this branch also covers "nothing to check" — confirm intended.
    return {
      mode: "collaborator-ready",
      readiness: "ready",
      reason: "canonical mission and evaluation context are complete enough for collaborator-facing reporting",
      issues: [],
    };
  }

  return {
    mode: "artifact-anchored interim",
    readiness: "hydrated but incomplete",
    reason: issues.join(" | "),
    issues,
  };
}
625
+
626
/**
 * Insert or replace a markdown section identified by its heading line.
 *
 * An existing section runs from its heading up to the next `## ` heading or the
 * end of the text. When found it is replaced in place; otherwise the section is
 * appended after the trimmed text.
 *
 * Fixes over the previous implementation:
 * - End of input is matched with `(?![\s\S])`. JavaScript has no `\Z` assertion
 *   (without the `u` flag it matches a literal "Z"), so a trailing section was
 *   never found and got appended again on every call.
 * - A blank line is kept between a replaced section and the heading that follows
 *   it, instead of gluing that heading onto the last bullet.
 * - The section text is spliced in verbatim, so `$&` / `$1` in the body are not
 *   treated as replacement patterns.
 *
 * @param {string} text - Existing document text.
 * @param {string} heading - Full heading line, e.g. `"## Report Status"`.
 * @param {string[]} bodyLines - Lines placed under the heading.
 * @returns {string} The updated document text.
 */
function upsertSection(text, heading, bodyLines) {
  const sectionText = `${heading}\n\n${bodyLines.join("\n")}`.trimEnd();
  const escapedHeading = heading.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const pattern = new RegExp(`^${escapedHeading}\\s*$[\\s\\S]*?(?=^##\\s|(?![\\s\\S]))`, "m");

  const match = pattern.exec(text);
  if (!match) {
    return `${text.trimEnd()}\n\n${sectionText}\n`;
  }

  const before = text.slice(0, match.index);
  const after = text.slice(match.index + match[0].length);
  // The match swallows the blank lines before the next heading; put the separator back.
  return after ? `${before}${sectionText}\n\n${after}` : `${before}${sectionText}\n`;
}
634
+
635
/**
 * Write the current report status into the collaborator-facing report.
 *
 * The status section is an H2 when the report text starts with an H1 line, and
 * an H1 otherwise. The file is rewritten only when the section text changes.
 *
 * @param {string} targetDir - Project root directory.
 * @param {{mode: string, readiness: string, reason: string}} status - Status to record.
 * @returns {boolean} `true` when the report file was rewritten, `false` when it
 *   does not exist or is already up to date.
 */
function syncCollaboratorReportStatus(targetDir, status) {
  const { reportPath } = getCollaboratorDeliverablePaths(targetDir);
  if (!fs.existsSync(reportPath)) {
    return false;
  }

  const currentText = fs.readFileSync(reportPath, "utf8");
  // No "m" flag: only the very first line of the report is inspected.
  const startsWithTitle = /^#\s/.test(currentText);
  const heading = startsWithTitle ? "## Report Status" : "# Report Status";
  const statusLines = [
    `- Report mode: ${status.mode}`,
    `- Canonical context readiness: ${status.readiness}`,
    `- Why the current mode is appropriate: ${status.reason}`,
  ];

  const updatedText = upsertSection(currentText, heading, statusLines);
  if (updatedText === currentText) {
    return false;
  }
  fs.writeFileSync(reportPath, updatedText.trimEnd() + "\n");
  return true;
}
653
+
654
/**
 * Run the full hydration pass: mission, evaluation protocol, then report status.
 *
 * Order matters: the status is computed after both context files have been
 * hydrated, and only then synced into the report.
 *
 * @param {string} targetDir - Project root directory.
 * @returns {{missionChanged: boolean, evalChanged: boolean, reportChanged: boolean, reportStatus: object}}
 *   Which files were rewritten, plus the status that was synced.
 */
function hydrateCanonicalContext(targetDir) {
  const outcome = {};
  outcome.missionChanged = hydrateMissionContext(targetDir);
  outcome.evalChanged = hydrateEvalProtocol(targetDir);

  const reportStatus = getCollaboratorReportStatus(targetDir);
  outcome.reportChanged = syncCollaboratorReportStatus(targetDir, reportStatus);
  outcome.reportStatus = reportStatus;
  return outcome;
}
666
+
79
667
  function renderSummary(lang, data) {
80
668
  if (lang === "zh") {
81
669
  return `# 研究摘要
@@ -95,6 +683,9 @@ function renderSummary(lang, data) {
95
683
  - Auto stop reason: ${data.autoStopReason || "待补充"}
96
684
  - Auto final artifact: ${data.autoFinalArtifact || "待补充"}
97
685
  - Auto final rung: ${data.autoFinalRung || "待补充"}
686
+ - Collaborator report mode: ${data.reportMode || "待补充"}
687
+ - Canonical context readiness: ${data.reportReadiness || "待补充"}
688
+ - Why this report mode is active: ${data.reportModeReason || "待补充"}
98
689
  - Eval objective: ${data.evalObjective || "待补充"}
99
690
  - Primary metrics: ${data.evalPrimaryMetrics || "待补充"}
100
691
  - Secondary metrics: ${data.evalSecondaryMetrics || "待补充"}
@@ -153,6 +744,9 @@ function renderSummary(lang, data) {
153
744
  - Auto stop reason: ${data.autoStopReason || "TBD"}
154
745
  - Auto final artifact: ${data.autoFinalArtifact || "TBD"}
155
746
  - Auto final rung: ${data.autoFinalRung || "TBD"}
747
+ - Collaborator report mode: ${data.reportMode || "TBD"}
748
+ - Canonical context readiness: ${data.reportReadiness || "TBD"}
749
+ - Why this report mode is active: ${data.reportModeReason || "TBD"}
156
750
  - Eval objective: ${data.evalObjective || "TBD"}
157
751
  - Primary metrics: ${data.evalPrimaryMetrics || "TBD"}
158
752
  - Secondary metrics: ${data.evalSecondaryMetrics || "TBD"}
@@ -266,6 +860,9 @@ ${data.problem || "待补充"}
266
860
  - Auto stop reason: ${data.autoStopReason || "待补充"}
267
861
  - Auto final artifact: ${data.autoFinalArtifact || "待补充"}
268
862
  - Auto final rung: ${data.autoFinalRung || "待补充"}
863
+ - Collaborator report mode: ${data.reportMode || "待补充"}
864
+ - Canonical context readiness: ${data.reportReadiness || "待补充"}
865
+ - Why this report mode is active: ${data.reportModeReason || "待补充"}
269
866
  - Eval objective: ${data.evalObjective || "待补充"}
270
867
  - Primary metrics: ${data.evalPrimaryMetrics || "待补充"}
271
868
  - Secondary metrics: ${data.evalSecondaryMetrics || "待补充"}
@@ -335,6 +932,9 @@ ${data.problem || "TBD"}
335
932
  - Auto stop reason: ${data.autoStopReason || "TBD"}
336
933
  - Auto final artifact: ${data.autoFinalArtifact || "TBD"}
337
934
  - Auto final rung: ${data.autoFinalRung || "TBD"}
935
+ - Collaborator report mode: ${data.reportMode || "TBD"}
936
+ - Canonical context readiness: ${data.reportReadiness || "TBD"}
937
+ - Why this report mode is active: ${data.reportModeReason || "TBD"}
338
938
  - Eval objective: ${data.evalObjective || "TBD"}
339
939
  - Primary metrics: ${data.evalPrimaryMetrics || "TBD"}
340
940
  - Secondary metrics: ${data.evalSecondaryMetrics || "TBD"}
@@ -381,6 +981,7 @@ ${data.problem || "TBD"}
381
981
  }
382
982
 
383
983
  function buildContextSnapshot(targetDir) {
984
+ const reportStatus = getCollaboratorReportStatus(targetDir);
384
985
  const mission = readFileIfExists(contextFile(targetDir, "mission.md"));
385
986
  const state = readFileIfExists(contextFile(targetDir, "state.md"));
386
987
  const evidence = readFileIfExists(contextFile(targetDir, "evidence-index.md"));
@@ -592,6 +1193,9 @@ function buildContextSnapshot(targetDir) {
592
1193
  autoStopReason: extractValue(autoOutcome, ["Stop reason", "停止原因"]),
593
1194
  autoFinalArtifact: extractValue(autoOutcome, ["Final artifact", "最终工件"]),
594
1195
  autoFinalRung: extractValue(autoOutcome, ["Final rung", "最终 rung"]),
1196
+ reportMode: reportStatus.mode,
1197
+ reportReadiness: reportStatus.readiness,
1198
+ reportModeReason: reportStatus.reason,
595
1199
  evalObjective: evalProtocol.primaryEvaluationObjective,
596
1200
  evalPrimaryMetrics: evalProtocol.primaryMetrics,
597
1201
  evalSecondaryMetrics: evalProtocol.secondaryMetrics,
@@ -619,6 +1223,7 @@ function writeContextFile(targetDir, name, content) {
619
1223
  }
620
1224
 
621
1225
  function refreshContext({ targetDir }) {
1226
+ hydrateCanonicalContext(targetDir);
622
1227
  const lang = readWorkflowLanguage(targetDir);
623
1228
  const snapshot = buildContextSnapshot(targetDir);
624
1229
  writeContextFile(targetDir, "summary.md", renderSummary(lang, snapshot));
@@ -695,6 +1300,11 @@ function archiveContext({ targetDir, now = new Date() }) {
695
1300
 
696
1301
  module.exports = {
697
1302
  archiveContext,
1303
+ collaboratorEvalIssues,
1304
+ collaboratorMissionIssues,
1305
+ getCollaboratorReportStatus,
1306
+ hasCollaboratorFacingDeliverables,
1307
+ hydrateCanonicalContext,
698
1308
  pruneContext,
699
1309
  refreshContext,
700
1310
  };
package/lib/i18n.cjs CHANGED
@@ -289,7 +289,10 @@ const ZH_SKILL_FILES = {
289
289
 
290
290
  ## 必要输出
291
291
 
292
+ - 报告状态:collaborator-ready 或 artifact-anchored interim
292
293
  - 给用户看的总结
294
+ - 问题与背景的白话说明
295
+ - 数据集场景说明
293
296
  - 方法概述
294
297
  - 选定指标摘要
295
298
  - 指标白话释义
@@ -301,6 +304,7 @@ const ZH_SKILL_FILES = {
301
304
  - 失败尝试
302
305
  - 局限性
303
306
  - 下一步
307
+ - 单独列出的工件状态,而不是混进已验证结果
304
308
 
305
309
  ## 上下文读取
306
310
 
@@ -312,6 +316,8 @@ const ZH_SKILL_FILES = {
312
316
 
313
317
  ## 上下文写回
314
318
 
319
+ - \`.lab/context/mission.md\`
320
+ - \`.lab/context/eval-protocol.md\`
315
321
  - \`.lab/context/state.md\`
316
322
  - \`.lab/context/evidence-index.md\`
317
323
 
@@ -324,7 +330,11 @@ const ZH_SKILL_FILES = {
324
330
  - 必须把已批准的主指标、次级指标和必要终局证据明确写进 \`report.md\` 与受管的 \`main-tables.md\`。
325
331
  - 必须用白话解释选定的主指标和次级指标:每个指标在衡量什么、越高还是越低更好、它是主结果指标还是健康度/支持性指标。
326
332
  - 如果出现 coverage、completeness、confidence 或类似健康度指标,必须明确说明这类指标回答的是“实验是否跑稳、证据是否完整”,而不是主要科学效应本身。
333
+ - 在起草报告前,先检查 \`.lab/context/mission.md\` 和 \`.lab/context/eval-protocol.md\` 是否仍是模板空壳。
334
+ - 如果 canonical context 还是空壳,要先根据 frozen result artifacts、data-decisions、evidence-index 和已批准上下文回填“最小可信版本”,再写报告。
335
+ - 如果回填后仍缺少协作者可读所需的关键字段,就必须把输出降级成 \`artifact-anchored interim report\`,不能冒充最终协作者报告。
327
336
  - 如果报告依赖了对原始指标或原始实现的偏差,必须明确写出这个偏差。
337
+ - workflow 工件状态、rerun id 或 LaTeX 骨架状态不能混进“已验证主结果”;这些内容必须单列到工件状态部分。
328
338
  - 如果 workflow language 是中文,\`report.md\` 和 \`<deliverables_root>/main-tables.md\` 也应使用中文,除非文件路径、代码标识符或字面指标名必须保持原样。
329
339
  - 解释优先保守,不要写成营销文案。
330
340
  - 要给 \`/lab:write\` 留下清晰 handoff,尤其是 section draft 可以直接引用的证据链接。
@@ -691,6 +701,12 @@ const ZH_SKILL_FILES = {
691
701
  [path.join(".lab", ".managed", "templates", "final-report.md")]:
692
702
  `# 最终报告
693
703
 
704
+ ## 报告状态
705
+
706
+ - 报告模式:collaborator-ready 或 artifact-anchored interim
707
+ - canonical context 完整度:
708
+ - 为什么当前只能用这个模式:
709
+
694
710
  ## 给用户看的总结
695
711
 
696
712
  - 一句话结论:
@@ -698,6 +714,18 @@ const ZH_SKILL_FILES = {
698
714
  - 还没有被证明的内容:
699
715
  - 当前最大报告风险:
700
716
 
717
+ ## 问题与背景
718
+
719
+ - 这项研究在解决什么问题:
720
+ - 为什么这个问题重要:
721
+ - 当前报告到底覆盖了什么 setting 或 workflow:
722
+
723
+ ## 数据集场景说明
724
+
725
+ - 数据集或 benchmark 1 代表什么真实场景:
726
+ - 数据集或 benchmark 2 代表什么真实场景:
727
+ - 数据集或 benchmark 3 代表什么真实场景:
728
+
701
729
  ## 选定指标
702
730
 
703
731
  - 主指标:
@@ -740,6 +768,11 @@ const ZH_SKILL_FILES = {
740
768
  - 最终表现摘要:
741
769
  - 主表覆盖情况:
742
770
 
771
+ ## 工件状态
772
+
773
+ - 已就绪的交付物或工作流工件:
774
+ - 这些工件状态为什么不是科学结论:
775
+
743
776
  ## 主要结果
744
777
 
745
778
  - 主要发现 1:
@@ -953,6 +986,7 @@ const ZH_SKILL_FILES = {
953
986
  - 一句话问题:
954
987
  - 为什么重要:
955
988
  - 目标失败场景:
989
+ - 回填来源:
956
990
 
957
991
  ## 成功标准
958
992
 
@@ -973,6 +1007,7 @@ const ZH_SKILL_FILES = {
973
1007
  - 已批准方向:
974
1008
  - 当前 owner 或会话:
975
1009
  - 最近更新该 mission 的 stage:
1010
+ - 协作者可读状态:
976
1011
  `,
977
1012
  [path.join(".lab", "context", "state.md")]:
978
1013
  `# 工作流状态
@@ -1966,6 +2001,7 @@ ZH_CONTENT[path.join(".lab", "context", "eval-protocol.md")] = `# 评估协议
1966
2001
  ## 指标释义
1967
2002
 
1968
2003
  - 指标释义:
2004
+ - 回填来源:
1969
2005
  - 背景来源:
1970
2006
  - 方法与基线来源论文:
1971
2007
  - 方法与基线实现来源:
@@ -2021,10 +2057,12 @@ ZH_CONTENT[path.join(".codex", "skills", "lab", "stages", "auto.md")] = `# \`/la
2021
2057
 
2022
2058
  ## 上下文写回
2023
2059
 
2060
+ - \`.lab/context/mission.md\`
2024
2061
  - \`.lab/context/state.md\`
2025
2062
  - \`.lab/context/decisions.md\`
2026
2063
  - \`.lab/context/data-decisions.md\`
2027
2064
  - \`.lab/context/evidence-index.md\`
2065
+ - \`.lab/context/eval-protocol.md\`
2028
2066
  - \`.lab/context/summary.md\`
2029
2067
  - \`.lab/context/session-brief.md\`
2030
2068
  - \`.lab/context/auto-status.md\`
@@ -2053,6 +2091,8 @@ ZH_CONTENT[path.join(".codex", "skills", "lab", "stages", "auto.md")] = `# \`/la
2053
2091
  - \`review\` 更新规范审查上下文
2054
2092
  - \`report\` 写出 \`<deliverables_root>/report.md\`
2055
2093
  - \`write\` 写出 \`<deliverables_root>/paper/\` 下的 LaTeX 产物
2094
+ - 如果即将进入 \`report\`,而 \`.lab/context/mission.md\` 或 \`.lab/context/eval-protocol.md\` 仍是模板空壳,就先根据冻结工件和已批准上下文回填最小可信版本。
2095
+ - 如果回填后仍缺少协作者可读所需的关键字段,就必须强制生成 \`artifact-anchored interim report\`,不能冒充最终协作者报告。
2056
2096
  - promotion 成功后,必须写回 \`data-decisions.md\`、\`decisions.md\`、\`state.md\` 和 \`session-brief.md\`。
2057
2097
  - 如果某个指标或对比 claim 在评估协议里没有带来源的定义,就不能拿它做 stop 或 promotion 判断。
2058
2098
 
@@ -1,5 +1,11 @@
1
1
  # Final Report
2
2
 
3
+ ## Report Status
4
+
5
+ - Report mode: collaborator-ready or artifact-anchored interim
6
+ - Canonical context readiness:
7
+ - Why the current mode is appropriate:
8
+
3
9
  ## Reader Summary
4
10
 
5
11
  - One-sentence conclusion:
@@ -7,6 +13,18 @@
7
13
  - What is still unproven:
8
14
  - Biggest reporting risk:
9
15
 
16
+ ## Problem and Background
17
+
18
+ - Research problem in plain language:
19
+ - Why this problem matters:
20
+ - What setting or workflow this report is actually about:
21
+
22
+ ## Dataset Scene Notes
23
+
24
+ - Dataset or benchmark 1 and what real-world setting it represents:
25
+ - Dataset or benchmark 2 and what real-world setting it represents:
26
+ - Dataset or benchmark 3 and what real-world setting it represents:
27
+
10
28
  ## Selected Metrics
11
29
 
12
30
  - Primary metrics:
@@ -56,6 +74,11 @@
56
74
  - Final performance summary:
57
75
  - Table coverage:
58
76
 
77
+ ## Artifact Status
78
+
79
+ - Deliverables or workflow artifacts that are ready:
80
+ - Artifact status notes that are not scientific findings:
81
+
59
82
  ## Main Results
60
83
 
61
84
  Summarize validated iteration outcomes.
@@ -17,6 +17,7 @@ Use this file to define the paper-facing evaluation objective, table plan, gates
17
17
  ## Metric Glossary
18
18
 
19
19
  - Metric glossary:
20
+ - Hydration provenance:
20
21
  - Background sources:
21
22
  - Method and baseline source papers:
22
23
  - Method and baseline implementation source:
@@ -5,6 +5,7 @@
5
5
  - One-sentence problem:
6
6
  - Why it matters:
7
7
  - Target failure case:
8
+ - Hydration provenance:
8
9
 
9
10
  ## Success Criteria
10
11
 
@@ -25,3 +26,4 @@
25
26
  - Approved direction:
26
27
  - Current owner or session:
27
28
  - Last stage that updated this mission:
29
+ - Collaborator-ready status:
@@ -46,6 +46,7 @@ Use this skill when the user invokes `/lab:*` or asks for the structured researc
46
46
  - Keep an explicit approval gate before `/lab:spec`.
47
47
  - Write idea artifacts with the template in `.lab/.managed/templates/idea.md`.
48
48
  - Update `.lab/context/mission.md`, `.lab/context/decisions.md`, and `.lab/context/open-questions.md` after convergence.
49
+ - Do not leave `.lab/context/mission.md` as a template shell once the problem statement and approved direction are known.
49
50
  - Do not implement code in this stage.
50
51
 
51
52
  ### `/lab:data`
@@ -111,7 +112,8 @@ Use this skill when the user invokes `/lab:*` or asks for the structured researc
111
112
  - Normalize the result with `.lab/.managed/scripts/eval_report.py`.
112
113
  - Validate normalized output with `.lab/.managed/scripts/validate_results.py`.
113
114
  - Read `.lab/context/eval-protocol.md` before choosing the smallest run so the first experiment already targets the approved tables, metrics, and gates.
114
- - Update `.lab/context/state.md` and `.lab/context/evidence-index.md` after the run.
115
+ - Update `.lab/context/state.md`, `.lab/context/evidence-index.md`, and `.lab/context/eval-protocol.md` after the run.
116
+ - If the evaluation protocol is still skeletal, initialize the smallest trustworthy source-backed version before treating the run as the protocol anchor.
115
117
 
116
118
  ### `/lab:iterate`
117
119
 
@@ -131,7 +133,8 @@ Use this skill when the user invokes `/lab:*` or asks for the structured researc
131
133
  - Keep metric definitions, baseline behavior, and comparison implementations anchored to the source-backed evaluation protocol before changing thresholds, gates, or ladder transitions.
132
134
  - Switch to diagnostic mode if risk increases for two consecutive rounds.
133
135
  - Write round reports with `.lab/.managed/templates/iteration-report.md`.
134
- - Update `.lab/context/state.md`, `.lab/context/decisions.md`, `.lab/context/evidence-index.md`, and `.lab/context/open-questions.md` each round as needed.
136
+ - Update `.lab/context/state.md`, `.lab/context/decisions.md`, `.lab/context/evidence-index.md`, `.lab/context/open-questions.md`, and `.lab/context/eval-protocol.md` each round as needed.
137
+ - Keep `.lab/context/eval-protocol.md` synchronized with accepted ladder changes, benchmark scope, and source-backed implementation deviations.
135
138
  - Stop at threshold success or iteration cap, and record blockers plus next-best actions when the campaign ends without success.
136
139
 
137
140
  ### `/lab:review`
@@ -153,7 +156,9 @@ Use this skill when the user invokes `/lab:*` or asks for the structured researc
153
156
  - Aggregate them with `.lab/.managed/scripts/summarize_iterations.py`.
154
157
  - Write the final document with `.lab/.managed/templates/final-report.md` and the managed table summary with `.lab/.managed/templates/main-tables.md`.
155
158
  - Keep failed attempts and limitations visible.
156
- - Update `.lab/context/state.md` and `.lab/context/evidence-index.md` with report-level handoff notes.
159
+ - Update `.lab/context/mission.md`, `.lab/context/eval-protocol.md`, `.lab/context/state.md`, and `.lab/context/evidence-index.md` with report-level handoff notes.
160
+ - If canonical context is still skeletal, hydrate the smallest trustworthy version from frozen artifacts before finalizing the report.
161
+ - If collaborator-critical fields remain missing after hydration, downgrade to an `artifact-anchored interim report` instead of presenting a final collaborator-ready report.
157
162
 
158
163
  ### `/lab:write`
159
164
 
@@ -24,6 +24,8 @@
24
24
 
25
25
  ## Context Write Set
26
26
 
27
+ - `.lab/context/mission.md`
28
+ - `.lab/context/eval-protocol.md`
27
29
  - `.lab/context/state.md`
28
30
  - `.lab/context/decisions.md`
29
31
  - `.lab/context/data-decisions.md`
@@ -63,6 +65,8 @@
63
65
  - keep the session alive while the current rung is running
64
66
  - write the current rung, watch target, and next rung to `.lab/context/auto-status.md`
65
67
  - Reuse the existing `/lab:run`, `/lab:iterate`, `/lab:review`, `/lab:report`, and optional `/lab:write` contracts instead of inventing a parallel workflow.
68
+ - If the loop is about to reach `report` while `.lab/context/mission.md` or `.lab/context/eval-protocol.md` is still skeletal, hydrate the smallest trustworthy canonical version from frozen artifacts and approved context before drafting the report.
69
+ - If hydration still leaves collaborator-critical fields blank, force `report` to emit an `artifact-anchored interim report` instead of a collaborator-ready final report.
66
70
  - Enforce stage contracts, not just exit codes:
67
71
  - `run` and `iterate` must change persistent outputs under `results_root`
68
72
  - `review` must update canonical review context
@@ -3,6 +3,7 @@
3
3
  ## Required Output
4
4
 
5
5
  - one-sentence problem statement
6
+ - why the problem matters in plain language
6
7
  - failure case
7
8
  - idea classification
8
9
  - contribution category
@@ -18,6 +19,7 @@
18
19
  - critique before convergence
19
20
  - minimum viable experiment
20
21
  - explicit approval gate before `/lab:spec`
22
+ - canonical mission context updated with the approved problem, importance, failure case, and direction
21
23
 
22
24
  ## Evidence Discipline
23
25
 
@@ -60,3 +62,4 @@
60
62
  - State why the target problem matters before talking about the method.
61
63
  - Compare against existing methods explicitly, not by vague novelty language.
62
64
  - The three meaningful points should each fit in one direct sentence.
65
+ - Do not leave `.lab/context/mission.md` as an empty template after convergence; write the approved problem, why it matters, the current benchmark scope, and the approved direction back into canonical context.
@@ -29,6 +29,7 @@ Declare and keep fixed:
29
29
  - `.lab/context/decisions.md`
30
30
  - `.lab/context/evidence-index.md`
31
31
  - `.lab/context/open-questions.md`
32
+ - `.lab/context/eval-protocol.md`
32
33
 
33
34
  ## Per-Round Output
34
35
 
@@ -62,6 +63,7 @@ If the loop stops without success, record:
62
63
  - Do not accumulate long-lived results under `.lab/changes/<change-id>/runs`.
63
64
  - Do not change metric definitions, baseline semantics, or comparison implementations unless the approved evaluation protocol records both their sources and any deviations.
64
65
  - When you change ladders, sample sizes, or promotion gates, keep the resulting logic anchored to the source-backed evaluation protocol instead of ad-hoc chat reasoning.
66
+ - Keep `.lab/context/eval-protocol.md` synchronized with the active benchmark scope, ladder gates, source-backed metric definitions, and any accepted implementation deviations instead of leaving it as a stale template.
65
67
 
66
68
  ## Interaction Contract
67
69
 
@@ -2,7 +2,10 @@
2
2
 
3
3
  ## Required Output
4
4
 
5
+ - report status: collaborator-ready or artifact-anchored interim
5
6
  - reader summary for the user
7
+ - problem and background in plain language
8
+ - dataset scene notes in plain language
6
9
  - method overview
7
10
  - selected metrics summary
8
11
  - plain-language metric guide
@@ -17,6 +20,7 @@
17
20
  - failed attempts
18
21
  - limitations
19
22
  - next steps
23
+ - artifact status kept separate from validated findings
20
24
 
21
25
  ## Context Read Set
22
26
 
@@ -30,6 +34,8 @@
30
34
 
31
35
  ## Context Write Set
32
36
 
37
+ - `.lab/context/mission.md`
38
+ - `.lab/context/eval-protocol.md`
33
39
  - `.lab/context/state.md`
34
40
  - `.lab/context/evidence-index.md`
35
41
 
@@ -45,6 +51,10 @@
45
51
  - Pull the core background references, method or baseline references, and metric references out of the approved evaluation protocol instead of hiding them in `.lab/context/*`.
46
52
  - Report only the few references a collaborator needs to orient themselves quickly; do not turn `report.md` into a full bibliography dump.
47
53
  - If the report depends on a deviation from an original metric or implementation, state that deviation explicitly instead of smoothing it over.
54
+ - Before drafting the report, inspect `.lab/context/mission.md` and `.lab/context/eval-protocol.md` for skeletal template fields.
55
+ - If either canonical context file is still skeletal, hydrate the smallest trustworthy version from frozen result artifacts, dataset decisions, evidence-index, and prior approved context, and write that back before finalizing the report.
56
+ - If collaborator-critical fields still remain missing after hydration, downgrade the output to an `artifact-anchored interim report` instead of presenting it as a final collaborator-ready report.
57
+ - Do not mix workflow deliverable status, rerun ids, or manuscript skeleton status into validated scientific findings; keep those in a separate artifact-status section.
48
58
  - If `.lab/config/workflow.json` sets the workflow language to Chinese, write `report.md` and `<deliverables_root>/main-tables.md` in Chinese unless a file path, code identifier, or literal metric name must remain unchanged.
49
59
  - Prefer conservative interpretation over marketing language.
50
60
  - Leave a clear handoff path into `/lab:write` with evidence links that section drafts can cite.
@@ -6,6 +6,7 @@
6
6
  - run registry entry
7
7
  - normalized evaluation summary
8
8
  - validation result for the normalized summary
9
+ - `.lab/context/eval-protocol.md` initialized or refined when the evaluation protocol is still skeletal
9
10
 
10
11
  ## Context Read Set
11
12
 
@@ -19,6 +20,7 @@
19
20
 
20
21
  - `.lab/context/state.md`
21
22
  - `.lab/context/evidence-index.md`
23
+ - `.lab/context/eval-protocol.md`
22
24
 
23
25
  ## Constraints
24
26
 
@@ -26,6 +28,7 @@
26
28
  - Fail fast on data, environment, or metric wiring problems.
27
29
  - Tie the run to the approved evaluation protocol, not just an ad-hoc chat goal.
28
30
  - Do not invent metric definitions, baseline behavior, or comparison implementations from memory; anchor them to the approved evaluation protocol and its recorded sources.
31
+ - If `.lab/context/eval-protocol.md` is still skeletal, write the smallest trustworthy version of the current evaluation objective, metric set, ladder, and source-backed implementation notes before treating the run as the new protocol anchor.
29
32
  - Record the exact launch command and output location.
30
33
  - Write durable run outputs, logs, and checkpoints under `results_root`.
31
34
  - Write figures or plots under `figures_root`.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "superlab",
3
- "version": "0.1.18",
3
+ "version": "0.1.19",
4
4
  "description": "Strict /lab research workflow installer for Codex and Claude",
5
5
  "keywords": [
6
6
  "codex",