npm - superlab - Versions diffs - 0.1.70 → 0.1.72 - Mend

superlab 0.1.70 → 0.1.72

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (56) hide show

package/package-assets/shared/lab/.managed/scripts/validate_stage_report.py ADDED Viewed

@@ -0,0 +1,547 @@
+#!/usr/bin/env python3
+import argparse
+import re
+import sys
+from pathlib import Path
# Level-2 headings every stage report must contain. Each canonical section
# name maps to the heading regexes (English first, then Chinese) that satisfy
# it; the patterns are searched with re.MULTILINE.
REQUIRED_SECTIONS = {
    "Rule Preflight": [
        r"^##\s+Rule Preflight\s*$",
        r"^##\s+规则预检\s*$",
    ],
    "Stage Identity": [
        r"^##\s+Stage Identity\s*$",
        r"^##\s+阶段身份\s*$",
    ],
    "Requested Outcome Mapping": [
        r"^##\s+Requested Outcome Mapping\s*$",
        r"^##\s+请求结果映射\s*$",
    ],
    "Core Explanation Table": [
        r"^##\s+Core Explanation Table\s*$",
        r"^##\s+核心说明表\s*$",
    ],
    "Evidence And Artifacts": [
        r"^##\s+Evidence And Artifacts\s*$",
        r"^##\s+证据与工件\s*$",
    ],
    "Next Action": [
        r"^##\s+Next Action\s*$",
        r"^##\s+下一步动作\s*$",
    ],
}

# Heading regexes for the Repair Control section, which is kept separate from
# REQUIRED_SECTIONS.
REPAIR_CONTROL_SECTION = [
    r"^##\s+Repair Control\s*$",
    r"^##\s+修复控制\s*$",
]
# Rows the Core Explanation Table must answer. Each row id maps to the
# lower-case phrases (Chinese and English) that identify its question cell by
# substring match.
REQUIRED_CORE_ROWS = dict(
    stage=("这是什么阶段", "what stage is this", "stage"),
    background=("背景是什么", "background"),
    why_now=("为什么现在要做", "why now", "why this stage ran"),
    what_done=("这轮具体做了什么", "what this stage did", "what did this stage do"),
    how_done=("怎么做的", "how it was done", "how was it done"),
    worked=("结果好的地方是什么", "what worked"),
    did_not_work=("结果坏的地方是什么", "what did not work", "negative result"),
    verifies=("这验证了什么", "what this verifies", "what was verified"),
    unverified=("还没有验证什么", "what remains unverified", "not yet verified"),
    improve_why=("是否需要改进", "need improvement", "what needs improvement"),
    how_improve=("下一步怎么改", "how to improve"),
    evidence=("关键证据在哪里", "key evidence", "evidence"),
    decision=("现在应该继续", "continue, stop", "decision"),
)
# Values that count as "nothing was written". They are compared against the
# normalized text after surrounding punctuation has been stripped.
PLACEHOLDER_VALUES = {
    "", "-", "--", "—",
    "todo", "tbd", "n/a", "na", "none",
    "待补", "待定", "无",
}

# Answers that are filled in but too generic to carry information.
SHALLOW_VALUES = {
    "done", "ok", "pass", "passed",
    "符合预期", "已完成", "继续优化", "继续推进", "没有问题",
}

# Regexes for first-person / assistant-style phrasing that must not appear
# anywhere in a stage report (searched case-insensitively).
INTERNAL_META_PATTERNS = [
    r"用户说", r"我来解释", r"我会", r"我已经", r"你要求",
    r"\bagent\b", r"\bsubagent\b", r"\bprompt\b",
    r"提示词", r"按.*技能",
    r"service-style", r"AI-assistant",
]
# Substrings showing that a value states a reason rather than only an action.
WHY_MARKERS = (
    "because", "so that", "therefore", "reason", "why",
    "因为", "所以", "因此", "原因", "以便", "用于", "避免",
)

# Substrings signalling that the stage result still needs work.
IMPROVEMENT_NEEDED_MARKERS = (
    "need improvement", "needs improvement", "needs revision",
    "must improve", "should improve", "needs repair", "needs rerun",
    "需要改进", "需要修复", "需要重跑", "需要继续", "仍需",
)

# Substrings that explicitly decline improvement; these take precedence over
# IMPROVEMENT_NEEDED_MARKERS, several of which they contain.
NO_IMPROVEMENT_MARKERS = (
    "no improvement needed", "does not need improvement", "not need improvement",
    "无需改进", "不需要改进", "不需改进",
)

# Literal "decision: stop" spellings, covering ASCII and full-width colons
# and both languages.
STOP_DECISION_MARKERS = (
    "decision: stop", "decision：stop", "decision: 停止", "decision：停止",
    "决策: stop", "决策：stop", "决策: 停止", "决策：停止",
)

# Substrings naming a hard boundary that justifies stopping even though an
# improvement is still needed.
TERMINAL_BOUNDARY_MARKERS = (
    "budget exhausted", "budget boundary", "exceeded budget",
    "fatal", "safety",
    "invalid artifact", "invalid metric",
    "frozen core",
    "outside the envelope", "outside approved envelope",
    "user requested stop",
    "approval required", "requires approval", "escalation boundary",
    "impossible", "not allowed", "terminal boundary",
    "integrity", "ethics",
    "预算耗尽", "预算边界", "超过预算",
    "致命", "安全",
    "无效工件", "无效指标",
    "冻结核心",
    "超出边界", "超出已批准",
    "用户要求停止",
    "需要批准", "升级边界",
    "不可能", "不允许", "终止边界",
    "诚信", "伦理",
)
# Field labels required inside the Repair Control section, as
# (English label, Chinese label) pairs; either spelling satisfies the field.
REPAIR_CONTROL_FIELDS = (
    ("Repair budget:", "修复预算："),
    ("Repair attempts used:", "已用修复次数："),
    ("Current failure class:", "当前失败类型："),
    ("Repair hypothesis:", "修复假设："),
    ("Evidence-changing knobs changed:", "改变证据解释的旋钮："),
    ("Ordinary engineering fixes allowed:", "允许的普通工程修复："),
    ("Frozen core unchanged:", "冻结核心不变："),
    ("Forbidden repairs avoided:", "已避免的禁用修复："),
    ("Confirmation check:", "确认验证："),
)

# Regexes describing repairs that alter what the evidence means (metric,
# target, data, labels, split, claim, scope); searched case-insensitively.
FORBIDDEN_REPAIR_PATTERNS = (
    # English phrasings
    r"\b(changed|modified|relaxed|lowered|loosened|rewrote)\s+(the\s+)?(primary\s+)?metric\b",
    r"\b(changed|modified|relaxed|lowered|loosened)\s+(the\s+)?(target|threshold|target\s+range)\b",
    r"\b(drop|dropped|delete|deleted|remove|removed|exclude|excluded)\s+(hard|failed|bad)\s+(cases|examples|samples)\b",
    r"\b(changed|modified|rewrote)\s+(labels?|ground\s+truth)\b",
    r"\b(changed|modified|swapped|replaced)\s+(the\s+)?(final\s+)?test\s+split\b",
    r"\b(changed|modified|expanded)\s+(the\s+)?(paper-facing\s+)?claim\b",
    r"\b(changed|modified|switched)\s+(the\s+)?(threat\s+model|reviewer\s+profile|dataset\s+scope)\b",
    # Chinese phrasings
    r"修改(主)?指标",
    r"放宽(目标|阈值|目标区间)",
    r"(删除|移除|剔除)(困难|失败|坏)(样本|案例)",
    r"修改(标签|真值|测试集|主张|威胁模型|数据集范围)",
)

# Substrings indicating the report claims a successful repair or a promotion.
REPAIR_SUCCESS_MARKERS = (
    "repair passed", "repair succeeded", "passed after repair",
    "promotion is planned", "promote", "promotion",
    "修复通过", "修复成功", "准备推广", "推广",
)

# Substrings meaning the confirmation check was waived.
NO_CONFIRMATION_MARKERS = (
    "not needed", "not required", "none", "n/a",
    "无需", "不需要", "无",
)

# Substrings meaning a confirmation check was described.
CONFIRMATION_MARKERS = (
    "confirmation", "confirm", "holdout", "control",
    "new seed", "seed", "batch", "rerun",
    "复验", "确认", "留出", "对照", "新 seed", "批次", "重跑",
)
def parse_args():
    """Parse the validator's command-line options.

    Returns:
        The argparse namespace with ``stage_report`` (required) and ``stage``
        (empty string when the option is omitted).
    """
    cli = argparse.ArgumentParser(
        description="Validate a plain-language lab stage report.",
    )
    cli.add_argument(
        "--stage-report",
        required=True,
        help="Path to the stage report markdown file.",
    )
    cli.add_argument(
        "--stage",
        default="",
        help="Expected lab stage name, such as run, auto, or write.",
    )
    return cli.parse_args()
def normalize(text: str) -> str:
    """Return *text* trimmed, lower-cased, with whitespace runs collapsed to one space."""
    lowered = text.strip().lower()
    return re.sub(r"\s+", " ", lowered)
def extract_section(text: str, patterns: list[str]) -> str:
    """Return the body of the first section whose heading matches a pattern.

    Patterns are tried in order. The body runs from the end of the matched
    heading to the next level-2 heading (``## ``) or to the end of *text*, and
    is returned stripped.

    Args:
        text: Full markdown document.
        patterns: Heading regexes, searched with ``re.MULTILINE``.

    Returns:
        The stripped section body, or ``""`` when no pattern matches.
    """
    searches = (re.search(candidate, text, flags=re.MULTILINE) for candidate in patterns)
    heading = next((found for found in searches if found), None)
    if heading is None:
        return ""
    body_start = heading.end()
    # "###" subsections do not match this and stay inside the body.
    following = re.search(r"^##\s+", text[body_start:], flags=re.MULTILINE)
    body_end = len(text) if following is None else body_start + following.start()
    return text[body_start:body_end].strip()
def find_missing_sections(text: str) -> list[str]:
    """List the required section names whose heading does not appear in *text*.

    Names are returned in the declaration order of ``REQUIRED_SECTIONS``.
    """

    def heading_present(patterns: list[str]) -> bool:
        for candidate in patterns:
            if re.search(candidate, text, flags=re.MULTILINE):
                return True
        return False

    return [
        section_name
        for section_name, patterns in REQUIRED_SECTIONS.items()
        if not heading_present(patterns)
    ]
def parse_core_table_rows(text: str) -> dict[str, str]:
    """Parse the Core Explanation Table into ``{question: answer}``.

    Only lines that start with ``|`` and contain at least three pipes are
    considered. The first cell, normalized, becomes the key and the second
    cell the value; later cells are ignored. Header rows (first cell
    ``question``), rows with an empty first cell, and delimiter rows are
    skipped. When a question repeats, the last occurrence wins.

    Args:
        text: Full stage-report markdown.

    Returns:
        Mapping from normalized question text to the stripped answer cell.
    """
    section = extract_section(text, REQUIRED_SECTIONS["Core Explanation Table"])
    rows: dict[str, str] = {}
    for raw_line in section.splitlines():
        line = raw_line.strip()
        if not line.startswith("|") or line.count("|") < 3:
            continue
        cells = [cell.strip() for cell in line.strip("|").split("|")]
        if len(cells) < 2:
            continue
        # A markdown delimiter row may use any number of dashes and optional
        # alignment colons (e.g. "|:------|:-----:|"), not only "---"; without
        # this check such a row is stored as a question/answer pair.
        if all(re.fullmatch(r":?-+:?", cell) for cell in cells):
            continue
        question = normalize(cells[0])
        if question in {"question", "---", ""}:
            continue
        rows[question] = cells[1]
    return rows
def find_row_value(rows: dict[str, str], markers: tuple[str, ...]) -> str | None:
    """Return the answer of the row that best matches *markers*.

    Markers are tried in the order given, and the first marker contained in
    any question decides the result. Marker tuples list specific phrases
    before generic fallbacks such as ``"stage"`` or ``"evidence"``, so a
    generic fallback can no longer pick an unrelated earlier row (for example
    "what this stage did") when the specific row is also present.

    Args:
        rows: Mapping from normalized question text to answer.
        markers: Candidate phrases, most specific first.

    Returns:
        The matching answer, or ``None`` when no question contains any marker.
    """
    for marker in markers:
        needle = normalize(marker)
        for question, answer in rows.items():
            if needle in question:
                return answer
    return None
def is_blank_or_placeholder(value: str | None) -> bool:
    """Tell whether *value* is missing, empty, or a known placeholder.

    The value is normalized and stripped of surrounding spaces and ASCII or
    full-width punctuation before being looked up in ``PLACEHOLDER_VALUES``.
    """
    return value is None or normalize(value).strip(" .:;，。；：") in PLACEHOLDER_VALUES
def has_marker_with_value(body: str, markers: tuple[str, ...]) -> bool:
    """Check that the first line carrying a marker has a real value after it.

    Only the first line containing any marker is examined. The text after the
    marker must not be blank or a placeholder.

    Returns:
        ``True`` when that line has a non-placeholder value; ``False`` when it
        does not or when no line contains a marker.
    """
    for raw in body.splitlines():
        candidate = raw.strip()
        hit = next((label for label in markers if label in candidate), None)
        if hit is None:
            continue
        _, _, remainder = candidate.partition(hit)
        return not is_blank_or_placeholder(remainder.strip())
    return False
def marker_value(body: str, markers: tuple[str, ...]) -> str:
    """Return the text after the first marker found in *body*.

    Lines are scanned top to bottom. On the first line containing any marker,
    the stripped text after that marker is returned.

    Returns:
        The stripped value, or ``""`` when no line contains a marker.
    """
    for raw in body.splitlines():
        candidate = raw.strip()
        hit = next((label for label in markers if label in candidate), None)
        if hit is not None:
            return candidate.partition(hit)[2].strip()
    return ""
def is_shallow(value: str | None) -> bool:
    """Tell whether *value* is missing, boilerplate, or shorter than 8 characters.

    The comparison uses the normalized value with surrounding spaces and
    punctuation stripped.
    """
    if value is None:
        return True
    trimmed = normalize(value).strip(" .:;，。；：")
    if trimmed in SHALLOW_VALUES:
        return True
    return len(trimmed) < 8
def has_why(value: str) -> bool:
    """Tell whether *value* contains any reason marker from ``WHY_MARKERS``."""
    haystack = normalize(value)
    for marker in WHY_MARKERS:
        if marker in haystack:
            return True
    return False
def improvement_is_needed(value: str | None) -> bool:
    """Tell whether *value* says that improvement is still required.

    An explicit "no improvement" phrase wins over any "needs improvement"
    phrase, because several of the former contain the latter as substrings.
    """
    haystack = normalize(value or "")
    declined = any(marker in haystack for marker in NO_IMPROVEMENT_MARKERS)
    return not declined and any(marker in haystack for marker in IMPROVEMENT_NEEDED_MARKERS)
def next_action_is_stop(body: str) -> bool:
    """Tell whether the Next Action body declares a stop decision.

    A literal ``STOP_DECISION_MARKERS`` phrase in the normalized body counts,
    and so does a bullet line of the form ``- decision: stop`` (either colon
    style, either language, any spacing).
    """
    collapsed = normalize(body)
    for marker in STOP_DECISION_MARKERS:
        if marker in collapsed:
            return True
    bullet = re.compile(
        r"^\s*-\s*(decision|决策)\s*[:：]\s*(stop|停止)\b",
        flags=re.IGNORECASE | re.MULTILINE,
    )
    return bullet.search(body) is not None
def has_terminal_boundary(value: str) -> bool:
    """Tell whether *value* names any boundary from ``TERMINAL_BOUNDARY_MARKERS``."""
    haystack = normalize(value)
    for marker in TERMINAL_BOUNDARY_MARKERS:
        if marker in haystack:
            return True
    return False
+def parse_repair_attempts(value: str) -> int | None:
+    match = re.search(r"\d+", value or "")
+    if not match:
+        return None
+    return int(match.group(0))
def validate_core_table(text: str) -> list[str]:
    """Validate the Core Explanation Table of a stage report.

    Three checks run in order: every required row has a non-placeholder
    answer; the judgement rows are not shallow; the two improvement rows
    state a reason.

    Returns:
        Issue messages, empty when the table passes.
    """
    rows = parse_core_table_rows(text)

    def answer_for(row_name: str) -> str | None:
        return find_row_value(rows, REQUIRED_CORE_ROWS[row_name])

    issues = []

    unanswered = [name for name in REQUIRED_CORE_ROWS if is_blank_or_placeholder(answer_for(name))]
    if unanswered:
        joined = ", ".join(unanswered)
        issues.append(f"Core Explanation Table is missing non-empty answers for: {joined}")

    depth_checked = ("did_not_work", "verifies", "improve_why", "how_improve", "decision")
    for name in depth_checked:
        if is_shallow(answer_for(name)):
            issues.append(f"Core Explanation Table row '{name}' is too shallow")

    for name in ("improve_why", "how_improve"):
        answer = answer_for(name)
        if not answer:
            continue
        if not has_why(answer):
            issues.append(f"Core Explanation Table row '{name}' must include a reason, not only an action")

    return issues
def validate_evidence_section(text: str) -> list[str]:
    """Validate the Evidence And Artifacts section.

    Each required field must appear under its English or Chinese label and
    carry a non-placeholder value on the same line.

    Returns:
        Issue messages; a single "section is empty" message when the section
        has no body.
    """
    body = extract_section(text, REQUIRED_SECTIONS["Evidence And Artifacts"])
    if not body:
        return ["Evidence And Artifacts section is empty"]

    required_fields = (
        ("Primary artifact:", "主工件："),
        ("Supporting artifacts:", "支撑工件："),
        ("Validation commands:", "验证命令："),
        ("Known gaps:", "已知缺口："),
    )
    issues = []
    for labels in required_fields:
        english_label = labels[0]
        if all(label not in body for label in labels):
            issues.append(f"Evidence And Artifacts is missing '{english_label}'")
        elif not has_marker_with_value(body, labels):
            issues.append(f"Evidence And Artifacts field '{english_label}' must have a non-empty value")
    return issues
def validate_requested_outcome_mapping(text: str) -> list[str]:
    """Validate the Requested Outcome Mapping section.

    Each required field must appear under its English or Chinese label and
    carry a non-placeholder value on the same line.

    Returns:
        Issue messages; a single "section is empty" message when the section
        has no body.
    """
    body = extract_section(text, REQUIRED_SECTIONS["Requested Outcome Mapping"])
    if not body:
        return ["Requested Outcome Mapping section is empty"]

    required_fields = (
        ("Original request:", "原始请求："),
        ("Requested deliverables:", "请求交付物："),
        ("Completion mapping:", "完成映射："),
        ("Response shape:", "回答形态："),
    )
    issues = []
    for labels in required_fields:
        english_label = labels[0]
        if all(label not in body for label in labels):
            issues.append(f"Requested Outcome Mapping is missing '{english_label}'")
        elif not has_marker_with_value(body, labels):
            issues.append(f"Requested Outcome Mapping field '{english_label}' must have a non-empty value")
    return issues
def validate_repair_control(text: str, expected_stage: str) -> list[str]:
    """Validate the Repair Control section.

    The section is mandatory only when *expected_stage* is ``auto``
    (case-insensitive); otherwise it is checked only if present. A present
    section must contain every required field with a value, must not match a
    forbidden-repair pattern, and must describe a confirmation check when
    the report uses at least one repair attempt and claims success or
    promotion anywhere in its text.

    Returns:
        Issue messages, empty when the section passes or is legitimately
        absent.
    """
    body = extract_section(text, REPAIR_CONTROL_SECTION)
    is_auto_stage = expected_stage.lower() == "auto"
    if not body:
        return ["Repair Control section is required for auto stage reports"] if is_auto_stage else []

    issues = []
    for labels in REPAIR_CONTROL_FIELDS:
        english_label = labels[0]
        if all(label not in body for label in labels):
            issues.append(f"Repair Control is missing '{english_label}'")
        elif not has_marker_with_value(body, labels):
            issues.append(f"Repair Control field '{english_label}' must have a non-empty value")

    issues.extend(
        f"Repair Control contains forbidden repair: {pattern}"
        for pattern in FORBIDDEN_REPAIR_PATTERNS
        if re.search(pattern, body, flags=re.IGNORECASE)
    )

    attempts = parse_repair_attempts(marker_value(body, ("Repair attempts used:", "已用修复次数：")))
    confirmation = normalize(marker_value(body, ("Confirmation check:", "确认验证：")))
    # Success is looked for in the whole report, not only this section.
    report_text = normalize(text)
    claims_success = any(marker in report_text for marker in REPAIR_SUCCESS_MARKERS)

    if attempts is not None and attempts > 0 and claims_success:
        waived = any(marker in confirmation for marker in NO_CONFIRMATION_MARKERS)
        described = any(marker in confirmation for marker in CONFIRMATION_MARKERS)
        if waived or not described:
            issues.append(
                "Repair Control requires a confirmation check after a successful repair before promotion or final success"
            )
    return issues
def validate_rule_preflight(text: str) -> list[str]:
    """Validate the Rule Preflight section.

    Every preflight field label (English only) must be present and followed
    by a non-placeholder value on the same line.

    Returns:
        Issue messages, empty when all fields are filled in.
    """
    body = extract_section(text, REQUIRED_SECTIONS["Rule Preflight"])
    required_labels = (
        "Rule source file:",
        "Rule source revision:",
        "Project version:",
        "Resolved stage:",
        "Resolved mode:",
        "Resolved target:",
        "Preflight stamp:",
    )
    issues = []
    for label in required_labels:
        if label not in body:
            issues.append(f"Rule Preflight is missing '{label}'")
        elif not has_marker_with_value(body, (label,)):
            issues.append(f"Rule Preflight field '{label}' must have a non-empty value")
    return issues
def validate_next_action(text: str) -> list[str]:
    """Validate the Next Action section.

    Checks run in order and the first failure is the only one reported: the
    section must not be shallow, must name one of the allowed decisions, must
    give a reason, and must not stop while the table says improvement is
    needed unless a terminal boundary is named.

    Returns:
        A one-element issue list, or an empty list when the section passes.
    """
    body = extract_section(text, REQUIRED_SECTIONS["Next Action"])
    if is_shallow(body):
        return ["Next Action section must state a concrete decision and next step"]

    decision_words = (
        "continue", "stop", "revise", "rerun", "escalate", "handoff",
        "继续", "停止", "修订", "重跑", "升级", "交接",
    )
    collapsed = normalize(body)
    if all(word not in collapsed for word in decision_words):
        return ["Next Action must choose continue, stop, revise, rerun, escalate, or handoff"]

    if not has_why(body):
        return ["Next Action must include why the next step is appropriate"]

    table_rows = parse_core_table_rows(text)
    improve_answer = find_row_value(table_rows, REQUIRED_CORE_ROWS["improve_why"]) or ""
    stops_without_boundary = next_action_is_stop(body) and not has_terminal_boundary(body)
    if improvement_is_needed(improve_answer) and stops_without_boundary:
        return [
            "Next Action cannot stop after a recoverable improvement need without an explicit terminal boundary; choose continue, revise, rerun, or escalate, or state the budget/frozen-core/safety boundary"
        ]
    return []
def validate_stage_identity(text: str, expected_stage: str) -> list[str]:
    """Check that the Stage Identity section names the expected stage.

    The stage must appear as a standalone token, case-insensitively. A plain
    substring test would accept ``run`` inside "rerun" or "truncated" and
    ``auto`` inside "automatic", so the check could never fail for short
    stage names.

    Args:
        text: Full stage-report markdown.
        expected_stage: Stage name to look for; an empty value disables the
            check.

    Returns:
        A one-element issue list when the stage is not mentioned, else ``[]``.
    """
    if not expected_stage:
        return []
    body = extract_section(text, REQUIRED_SECTIONS["Stage Identity"])
    # ASCII-only boundaries rather than \b: CJK characters count as word
    # characters, so \b would reject "run阶段" where the stage name touches
    # Chinese text.
    stage_token = re.compile(
        rf"(?<![A-Za-z0-9_]){re.escape(expected_stage)}(?![A-Za-z0-9_])",
        flags=re.IGNORECASE,
    )
    if stage_token.search(body) is None:
        return [f"Stage Identity must mention expected stage '{expected_stage}'"]
    return []
def validate_internal_meta(text: str) -> list[str]:
    """Report every internal/assistant-style phrase pattern found in *text*.

    Returns:
        One issue per matching pattern in ``INTERNAL_META_PATTERNS``, in
        declaration order; matching is case-insensitive.
    """
    return [
        f"stage report contains internal or service-style meta language: {pattern}"
        for pattern in INTERNAL_META_PATTERNS
        if re.search(pattern, text, flags=re.IGNORECASE)
    ]
def validate(path: Path, expected_stage: str = "") -> list[str]:
    """Validate the stage report at *path* and return every issue found.

    Per-section validators run only when all required headings are present.
    The internal-meta-language scan always runs on readable text.

    Args:
        path: Location of the stage report markdown file.
        expected_stage: Stage name the report should identify itself as; an
            empty string skips the stage-specific checks.

    Returns:
        Issue messages, empty when the report is valid. An unreadable input
        (missing path, not a regular file, not UTF-8) yields a single issue
        instead of raising.
    """
    if not path.exists():
        return [f"stage report does not exist: {path}"]
    # A directory passes exists() but makes read_text() raise; report it as a
    # validation issue instead.
    if not path.is_file():
        return [f"stage report is not a file: {path}"]
    try:
        text = path.read_text(encoding="utf-8")
    except UnicodeDecodeError:
        return [f"stage report is not valid UTF-8: {path}"]

    issues = []
    missing_sections = find_missing_sections(text)
    if missing_sections:
        issues.append(f"stage report is missing required sections: {', '.join(missing_sections)}")
    else:
        issues.extend(validate_rule_preflight(text))
        issues.extend(validate_stage_identity(text, expected_stage))
        issues.extend(validate_requested_outcome_mapping(text))
        issues.extend(validate_repair_control(text, expected_stage))
        issues.extend(validate_core_table(text))
        issues.extend(validate_evidence_section(text))
        issues.extend(validate_next_action(text))
    issues.extend(validate_internal_meta(text))
    return issues
def main():
    """Run the validator from the command line.

    Returns:
        ``0`` after printing a success line to stdout, or ``1`` after
        printing each issue on its own line to stderr.
    """
    options = parse_args()
    problems = validate(Path(options.stage_report), options.stage)
    if not problems:
        print("stage report is valid")
        return 0
    for problem in problems:
        print(problem, file=sys.stderr)
    return 1


if __name__ == "__main__":
    raise SystemExit(main())

package/package-assets/shared/lab/.managed/templates/final-report.md CHANGED Viewed

@@ -9,6 +9,9 @@
 ## Reader Summary
 - One-sentence conclusion:
+- Core insight:
+- Evidence that supports the insight:
+- Decision or action implication:
 - What is validated:
 - What is still unproven:
 - Biggest reporting risk:
@@ -35,6 +38,8 @@
 - Approved method name:
 - Plain-language method summary:
+- Mechanism tested or explained:
+- Why the design follows from the insight:
 - What this method changes relative to prior work:
 - Most relevant prior work or baseline anchors:
 - What those prior methods do:
@@ -115,10 +120,16 @@
 Summarize validated iteration outcomes.
+- Diagnostic interpretation:
+- What this teaches beyond the raw numbers:
 ## Ablations
 Describe meaningful ablations and what they showed.
+- Mechanism tested:
+- What the ablation teaches beyond the delta:
 ## Failures
 Preserve failed runs and rejected ideas.

package/package-assets/shared/lab/.managed/templates/idea.md CHANGED Viewed

@@ -159,6 +159,24 @@ Suggested levels:
 - Expected advantage:
 - Evidence needed to prove the advantage:
+## Contribution vs Insight
+- Contribution:
+- Insight:
+- Core insight anchor sentence:
+- Why the insight matters beyond the artifact:
+- Action or community value:
+## Insight Evidence Chain
+- Observation:
+- Why existing explanations fail:
+- Core insight:
+- Mechanism:
+- Validation tests:
+- Generalization or action implication:
+- Prediction:
 ## Rough Approach
 - Plain-language description of how this would work:

package/package-assets/shared/lab/.managed/templates/main-tables.md CHANGED Viewed

@@ -36,27 +36,33 @@
 - Table 2 is for:
 - Table 3 is for:
 - Table 4 is for:
+- Diagnostic takeaway:
+- What the tables do not prove:
 ## Table 1
 - Purpose:
 - Metrics used:
 - Strongest supported claim:
+- Mechanism or insight tested:
 ## Table 2
 - Purpose:
 - Metrics used:
 - Strongest supported claim:
+- Mechanism or insight tested:
 ## Table 3
 - Purpose:
 - Metrics used:
 - Strongest supported claim:
+- Mechanism or insight tested:
 ## Table 4
 - Purpose:
 - Metrics used:
 - Strongest supported claim:
+- Mechanism or insight tested:

package/package-assets/shared/lab/.managed/templates/paper-plan.md CHANGED Viewed

@@ -5,6 +5,7 @@
 - Venue or audience:
 - Paper status:
 - Core story in one sentence:
+- Core insight anchor:
 - Approved framing artifact:
 - Terminology lock:
@@ -24,6 +25,14 @@
 - Limitation sources:
 - Claims that still need more evidence:
+## Insight Integration Map
+- Introduction contrast:
+- Method design consequence:
+- Experiments diagnostic evidence:
+- Conclusion principle or action implication:
+- Alternative explanation to address:
 ## Asset Coverage Targets
 - Core asset floor: