kc-beta 0.7.3 → 0.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +57 -4
- package/bin/kc-beta.js +20 -6
- package/package.json +3 -2
- package/src/agent/engine.js +493 -132
- package/src/agent/pipelines/_advance-hints.js +92 -0
- package/src/agent/pipelines/_milestone-derive.js +387 -17
- package/src/agent/pipelines/initializer.js +4 -1
- package/src/agent/pipelines/skill-authoring.js +30 -1
- package/src/agent/skill-loader.js +433 -111
- package/src/agent/tools/agent-tool.js +2 -2
- package/src/agent/tools/consult-skill.js +127 -0
- package/src/agent/tools/copy-to-workspace.js +4 -3
- package/src/agent/tools/dashboard-render.js +48 -1
- package/src/agent/tools/document-parse.js +31 -2
- package/src/agent/tools/phase-advance.js +17 -13
- package/src/agent/tools/release.js +378 -8
- package/src/agent/tools/sandbox-exec.js +65 -8
- package/src/agent/tools/worker-llm-call.js +95 -15
- package/src/agent/tools/workspace-file.js +7 -7
- package/src/agent/workspace.js +25 -4
- package/src/cli/components.js +4 -1
- package/src/cli/index.js +97 -1
- package/src/config.js +20 -3
- package/src/marathon/driver.js +217 -0
- package/src/marathon/prompts.js +93 -0
- package/template/.env.template +16 -0
- package/template/AGENT.md +182 -7
- package/template/skills/en/{meta-meta/auto-model-selection → auto-model-selection}/SKILL.md +1 -0
- package/template/skills/en/{meta-meta/bootstrap-workspace → bootstrap-workspace}/SKILL.md +15 -0
- package/template/skills/{zh/meta → en}/compliance-judgment/SKILL.md +1 -0
- package/template/skills/en/{meta/confidence-system → confidence-system}/SKILL.md +1 -0
- package/template/skills/en/{meta/corner-case-management → corner-case-management}/SKILL.md +1 -0
- package/template/skills/en/{meta/cross-document-verification → cross-document-verification}/SKILL.md +1 -0
- package/template/skills/en/{meta-meta/dashboard-reporting → dashboard-reporting}/SKILL.md +1 -0
- package/template/skills/en/{meta/data-sensibility → data-sensibility}/SKILL.md +1 -0
- package/template/skills/{zh/meta → en}/document-chunking/SKILL.md +1 -0
- package/template/skills/en/{meta/document-parsing → document-parsing}/SKILL.md +1 -0
- package/template/skills/{zh/meta → en}/entity-extraction/SKILL.md +1 -0
- package/template/skills/en/{meta-meta/evolution-loop → evolution-loop}/SKILL.md +1 -0
- package/template/skills/en/{meta-meta/pdf-review-dashboard → pdf-review-dashboard}/SKILL.md +1 -0
- package/template/skills/en/{meta-meta/quality-control → quality-control}/SKILL.md +10 -0
- package/template/skills/en/{meta-meta/rule-extraction → rule-extraction}/SKILL.md +1 -0
- package/template/skills/en/{meta-meta/rule-graph → rule-graph}/SKILL.md +1 -0
- package/template/skills/en/{meta-meta/skill-authoring → skill-authoring}/SKILL.md +40 -0
- package/template/skills/en/skill-creator/SKILL.md +2 -1
- package/template/skills/en/{meta-meta/skill-to-workflow → skill-to-workflow}/SKILL.md +58 -4
- package/template/skills/en/{meta-meta/task-decomposition → task-decomposition}/SKILL.md +1 -0
- package/template/skills/en/{meta/tree-processing → tree-processing}/SKILL.md +1 -0
- package/template/skills/en/{meta-meta/version-control → version-control}/SKILL.md +1 -0
- package/template/skills/en/{meta-meta/work-decomposition → work-decomposition}/SKILL.md +51 -6
- package/template/skills/phase_skills.yaml +112 -0
- package/template/skills/zh/{meta-meta/auto-model-selection → auto-model-selection}/SKILL.md +1 -0
- package/template/skills/zh/{meta-meta/bootstrap-workspace → bootstrap-workspace}/SKILL.md +15 -0
- package/template/skills/zh/compliance-judgment/SKILL.md +83 -0
- package/template/skills/zh/{meta/confidence-system → confidence-system}/SKILL.md +1 -0
- package/template/skills/zh/{meta/corner-case-management → corner-case-management}/SKILL.md +1 -0
- package/template/skills/zh/{meta/cross-document-verification → cross-document-verification}/SKILL.md +1 -0
- package/template/skills/zh/{meta-meta/dashboard-reporting → dashboard-reporting}/SKILL.md +1 -0
- package/template/skills/zh/{meta/data-sensibility → data-sensibility}/SKILL.md +1 -0
- package/template/skills/zh/document-chunking/SKILL.md +40 -0
- package/template/skills/zh/document-parsing/SKILL.md +102 -0
- package/template/skills/zh/entity-extraction/SKILL.md +121 -0
- package/template/skills/zh/{meta-meta/evolution-loop → evolution-loop}/SKILL.md +1 -0
- package/template/skills/zh/{meta-meta/pdf-review-dashboard → pdf-review-dashboard}/SKILL.md +1 -0
- package/template/skills/zh/{meta-meta/quality-control → quality-control}/SKILL.md +10 -0
- package/template/skills/zh/{meta-meta/rule-extraction → rule-extraction}/SKILL.md +1 -0
- package/template/skills/zh/{meta-meta/rule-graph → rule-graph}/SKILL.md +1 -0
- package/template/skills/zh/{meta-meta/skill-authoring → skill-authoring}/SKILL.md +40 -0
- package/template/skills/zh/skill-creator/SKILL.md +205 -200
- package/template/skills/zh/skill-to-workflow/SKILL.md +243 -0
- package/template/skills/zh/{meta-meta/task-decomposition → task-decomposition}/SKILL.md +1 -0
- package/template/skills/zh/tree-processing/SKILL.md +126 -0
- package/template/skills/zh/{meta-meta/version-control → version-control}/SKILL.md +1 -0
- package/template/skills/zh/{meta-meta/work-decomposition → work-decomposition}/SKILL.md +49 -4
- package/template/workflows/common/llm_client.py +168 -0
- package/template/workflows/common/utils.py +132 -0
- package/template/CLAUDE.md +0 -150
- package/template/skills/en/meta/compliance-judgment/SKILL.md +0 -82
- package/template/skills/en/meta/document-chunking/SKILL.md +0 -32
- package/template/skills/en/meta/entity-extraction/SKILL.md +0 -120
- package/template/skills/zh/meta/document-parsing/SKILL.md +0 -101
- package/template/skills/zh/meta/tree-processing/SKILL.md +0 -121
- package/template/skills/zh/meta-meta/skill-to-workflow/SKILL.md +0 -188
- /package/template/skills/en/{meta/compliance-judgment → compliance-judgment}/references/output-format.md +0 -0
- /package/template/skills/en/{meta/cross-document-verification → cross-document-verification}/references/contradiction-taxonomy.md +0 -0
- /package/template/skills/en/{meta-meta/dashboard-reporting → dashboard-reporting}/scripts/generate_dashboard.py +0 -0
- /package/template/skills/en/{meta/document-parsing → document-parsing}/references/parser-catalog.md +0 -0
- /package/template/skills/en/{meta-meta/evolution-loop → evolution-loop}/references/convergence-guide.md +0 -0
- /package/template/skills/en/{meta-meta/pdf-review-dashboard → pdf-review-dashboard}/scripts/generate_review.js +0 -0
- /package/template/skills/en/{meta-meta/quality-control → quality-control}/references/qa-layers.md +0 -0
- /package/template/skills/en/{meta-meta/quality-control → quality-control}/references/sampling-strategies.md +0 -0
- /package/template/skills/en/{meta-meta/rule-extraction → rule-extraction}/references/chunking-strategies.md +0 -0
- /package/template/skills/en/{meta-meta/skill-authoring → skill-authoring}/references/skill-format-spec.md +0 -0
- /package/template/skills/en/{meta-meta/skill-to-workflow → skill-to-workflow}/references/worker-llm-catalog.md +0 -0
- /package/template/skills/en/{meta-meta/task-decomposition → task-decomposition}/references/decision-matrix.md +0 -0
- /package/template/skills/en/{meta-meta/version-control → version-control}/references/trace-id-spec.md +0 -0
- /package/template/skills/zh/{meta/compliance-judgment → compliance-judgment}/references/output-format.md +0 -0
- /package/template/skills/zh/{meta/cross-document-verification → cross-document-verification}/references/contradiction-taxonomy.md +0 -0
- /package/template/skills/zh/{meta-meta/dashboard-reporting → dashboard-reporting}/scripts/generate_dashboard.py +0 -0
- /package/template/skills/zh/{meta/document-parsing → document-parsing}/references/parser-catalog.md +0 -0
- /package/template/skills/zh/{meta-meta/evolution-loop → evolution-loop}/references/convergence-guide.md +0 -0
- /package/template/skills/zh/{meta-meta/pdf-review-dashboard → pdf-review-dashboard}/scripts/generate_review.js +0 -0
- /package/template/skills/zh/{meta-meta/quality-control → quality-control}/references/qa-layers.md +0 -0
- /package/template/skills/zh/{meta-meta/quality-control → quality-control}/references/sampling-strategies.md +0 -0
- /package/template/skills/zh/{meta-meta/rule-extraction → rule-extraction}/references/chunking-strategies.md +0 -0
- /package/template/skills/zh/{meta-meta/skill-authoring → skill-authoring}/references/skill-format-spec.md +0 -0
- /package/template/skills/zh/{meta-meta/skill-to-workflow → skill-to-workflow}/references/worker-llm-catalog.md +0 -0
- /package/template/skills/zh/{meta-meta/task-decomposition → task-decomposition}/references/decision-matrix.md +0 -0
- /package/template/skills/zh/{meta-meta/version-control → version-control}/references/trace-id-spec.md +0 -0
package/src/marathon/prompts.js
ADDED
@@ -0,0 +1,93 @@
+// v0.8 P4 — templated continuation prompts for the marathon driver.
+//
+// Driver state → prompt template mapping. Templates are deterministic
+// (no LLM in the driver), bilingual (en + zh per workspace LANGUAGE).
+// Goal: surface to KC's main conductor the smallest useful nudge to
+// keep the pipeline moving without leaking marathon-implementation
+// details into the agent's context.
+//
+// Each template is a function (engineState, goal) → string. State
+// fields used:
+//   currentPhase, milestones, idleSec, lastEventType, goal
+//
+// Add new templates here as the driver state machine grows.
+
+const TEMPLATES_EN = {
+  initial: (s) => `Goal: ${s.goal}\n\n` +
+    `You are running in marathon mode (no manual user check-ins). Advance the pipeline phase by phase. ` +
+    `Engine derives milestones from filesystem facts; produce real artifacts, then call phase_advance. ` +
+    `If you get stuck on a specific phase, surface the blocker in your next response and the driver will ` +
+    `inject a diagnostic prompt next turn.`,
+
+  continue_phase: (s) => `Continue with ${s.currentPhase} work. ` +
+    `Engine status: ${formatMilestones(s.milestones)}.`,
+
+  advance_phase: (s) => `${s.currentPhase} milestones look complete (${formatMilestones(s.milestones)}). ` +
+    `Verify the gate conditions then call \`phase_advance\` to the next phase.`,
+
+  unstick: (s) => `No phase progress in the last ${Math.round(s.idleSec / 60)} minutes. ` +
+    `Either (1) surface the blocker explicitly so the developer user can intervene, or (2) ` +
+    `consult the relevant meta-skill for the current phase and try a different approach. ` +
+    `If you've genuinely finished and the engine gate is wrong, force phase_advance with reason.`,
+
+  finalize: (s) => `You've reached finalization. Wrap the deliverable bundle: ` +
+    `verify rule_skills/coverage_report.md is substantive, output/releases/<slug>/ is current ` +
+    `(re-run release tool if workflows changed after the last snapshot), and final_dashboard.html ` +
+    `reflects the latest QC data. When done, just say so — the marathon will exit.`,
+
+  stop: () => `Marathon stop condition reached. Save state and summarize what was accomplished.`,
+};
+
+const TEMPLATES_ZH = {
+  initial: (s) => `目标:${s.goal}\n\n` +
+    `你正运行在 marathon 模式(无人工 check-in)。按阶段推进整条流水线。` +
+    `引擎从文件系统事实派生里程碑;先把真实交付物产出来,再调用 phase_advance。` +
+    `如果在某个阶段卡住了,直接在下一回合的回复里把阻塞点说清楚,驱动器会在下一回合注入诊断提示。`,
+
+  continue_phase: (s) => `继续 ${s.currentPhase} 阶段的工作。` +
+    `引擎状态:${formatMilestones(s.milestones)}。`,
+
+  advance_phase: (s) => `${s.currentPhase} 阶段的里程碑看起来已经完成(${formatMilestones(s.milestones)})。` +
+    `核对一遍门控条件,然后调用 \`phase_advance\` 进入下一阶段。`,
+
+  unstick: (s) => `已经 ${Math.round(s.idleSec / 60)} 分钟没有阶段推进了。` +
+    `两条路:(1) 明确说出阻塞在哪里、让开发者用户介入;(2) 查阅当前阶段相关的 meta-skill 换个思路再试。` +
+    `如果你确实已经做完、但引擎门控判断错误,用 reason 强制 phase_advance。`,
+
+  finalize: (s) => `已经进入 finalization。收尾打包:` +
+    `确认 rule_skills/coverage_report.md 内容充实、output/releases/<slug>/ 是最新的(` +
+    `如果 workflows 在最近一次快照之后还有改动,重新跑 release 工具),final_dashboard.html 反映最新 QC 数据。` +
+    `做完之后直接说一声,marathon 会退出。`,
+
+  stop: () => `Marathon 停止条件已触发。保存状态、总结已完成的工作。`,
+};
+
+function formatMilestones(m) {
+  if (!m || typeof m !== "object") return "(unknown)";
+  const parts = [];
+  for (const [k, v] of Object.entries(m)) {
+    if (typeof v === "number") parts.push(`${k}:${v}`);
+    else if (typeof v === "boolean") parts.push(`${k}:${v ? "yes" : "no"}`);
+    else if (Array.isArray(v)) parts.push(`${k}:${v.length}`);
+  }
+  return parts.slice(0, 6).join(", ") || "(empty)";
+}
+
+/**
+ * Render a continuation prompt for the given driver state.
+ *
+ * @param {string} templateKey — one of: initial, continue_phase, advance_phase, unstick, finalize, stop
+ * @param {object} state — {goal, currentPhase, milestones, idleSec, lastEventType}
+ * @param {string} [language] — "en" or "zh" (defaults to "en")
+ * @returns {string}
+ */
+export function renderPrompt(templateKey, state, language = "en") {
+  const tmpls = language === "zh" ? TEMPLATES_ZH : TEMPLATES_EN;
+  const tmpl = tmpls[templateKey];
+  if (!tmpl) {
+    throw new Error(`Unknown marathon prompt template: ${templateKey}`);
+  }
+  return tmpl(state);
+}
+
+export const PROMPT_TEMPLATES = Object.freeze(Object.keys(TEMPLATES_EN));
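The templates themselves are deterministic, so the interesting logic lives in the driver's choice of template key. A minimal self-contained sketch of how that selection might look (the function name `pickTemplateKey`, the state fields `stopped`, `turn`, and `milestonesComplete`, and the 600-second idle threshold are illustrative assumptions, not the package's actual driver code in `driver.js`):

```javascript
// Hypothetical sketch: map engine state to one of the template keys
// exported above. All thresholds and field names are assumptions.
function pickTemplateKey(state) {
  if (state.stopped) return "stop";                       // stop condition reached
  if (state.currentPhase === "finalization") return "finalize";
  if (state.idleSec >= 600) return "unstick";             // ~10 min with no progress
  if (state.milestonesComplete) return "advance_phase";
  if (state.turn === 0) return "initial";                 // first marathon turn
  return "continue_phase";
}

console.log(pickTemplateKey({ turn: 0, idleSec: 0 }));    // "initial"
console.log(pickTemplateKey({ turn: 5, idleSec: 900 }));  // "unstick"
console.log(pickTemplateKey({ stopped: true }));          // "stop"
```

The key would then be passed straight to `renderPrompt(key, state, language)`.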
package/template/.env.template
CHANGED
@@ -29,3 +29,19 @@ MONITOR_FREQUENCY=mid
 # === Evolution Control ===
 # Maximum evolution iterations per rule before escalating to developer user
 MAX_ITERATIONS=20
+
+# === sandbox_exec Timeout (v0.8 P1-F) ===
+# Default timeout when the agent doesn't pass timeout_ms (ms).
+# Claude Code parity = 120000 (2 min). Raise if your default workloads
+# routinely exceed 2 min (e.g., document-parsing benchmarks).
+# KC_EXEC_DEFAULT_TIMEOUT_MS=120000
+#
+# Hard ceiling — agent's timeout_ms is clamped to this (ms). Raise for
+# v0.9 jyppx integration where parser_building can take 10-30 min per
+# corpus. Don't go above 1800000 (30 min) without a specific reason.
+# KC_EXEC_MAX_TIMEOUT_MS=600000
+#
+# Legacy alias (seconds) for KC_EXEC_DEFAULT_TIMEOUT_MS. Deprecated as of
+# v0.8 — prefer the ms-based name. The seconds value is multiplied by
+# 1000 if KC_EXEC_DEFAULT_TIMEOUT_MS isn't set.
+# KC_EXEC_TIMEOUT=120
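The three variables interact: a default, a hard ceiling, and a deprecated seconds-based alias. A small sketch of the resolution order the comments describe (`resolveTimeoutMs` is a hypothetical helper; the package's actual config code in `src/config.js` may differ):

```javascript
// Hypothetical sketch of the documented precedence:
// KC_EXEC_DEFAULT_TIMEOUT_MS wins; else legacy KC_EXEC_TIMEOUT
// (seconds) * 1000; else 120000. The agent-requested timeout_ms
// is then clamped to KC_EXEC_MAX_TIMEOUT_MS.
function resolveTimeoutMs(requestedMs, env) {
  const def = env.KC_EXEC_DEFAULT_TIMEOUT_MS
    ? Number(env.KC_EXEC_DEFAULT_TIMEOUT_MS)
    : env.KC_EXEC_TIMEOUT
      ? Number(env.KC_EXEC_TIMEOUT) * 1000
      : 120000;
  const max = Number(env.KC_EXEC_MAX_TIMEOUT_MS || 600000);
  return Math.min(requestedMs ?? def, max);
}

console.log(resolveTimeoutMs(undefined, {}));                         // 120000
console.log(resolveTimeoutMs(undefined, { KC_EXEC_TIMEOUT: "300" })); // 300000
console.log(resolveTimeoutMs(900000, {}));                            // 600000 (clamped)
```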
package/template/AGENT.md
CHANGED
@@ -1,20 +1,195 @@
-# AGENT.md — Project Context
+# AGENT.md — KC Project Context
 
-This file is
-
+This file is injected into the agent's system prompt every turn. The
+top sections describe KC's design philosophy + your mission (static
+across sessions); the bottom sections are per-project memory you
+update as you learn about this specific business scenario.
 
-
+> **Skill priority**: meta-meta skills are architectural — they
+> override meta (how-to) skills when guidance conflicts. The
+> architect's frame bounds the technique. If you find yourself
+> rationalizing past a meta-meta principle to follow a meta procedure,
+> stop — the frame should bound the technique, not the other way
+> around. Each skill declares its tier in YAML frontmatter (`tier:
+> meta-meta` or `tier: meta`).
+
+---
+
+# KC Reborn — Document Verification Workspace
+
+## What This Workspace Is
+
+You are a coding agent tasked with building a document verification app for the developer user's specific business scenario. The meta skills in `skills/` encode the methodology of experienced verification system architects and business analysts. You bring the intelligence and judgment to apply this methodology to the specific case at hand.
+
+Your goal: build a verification system that starts with you doing the work, then gradually distills your capability into cheap, fast workflows powered by worker LLMs. You are the ground truth. The workflows you create are the deliverables.
+
+## Roles
+
+- **Developer user**: The human you serve. They are a domain expert (e.g., tech lead at a bank's loan department). They provide the rules, the documents, and the business context. Discuss decisions with them.
+- **You (the coding agent)**: You are both the Builder (creating skills and workflows) and the Observer (judging quality). You do the verification first, prove it works, then teach smaller models to replicate your results.
+- **Worker LLMs**: The performers. Models configured in `.env` (TIER1 through TIER4) that will execute the workflows you build. Your job is to find the smallest model that works for each task.
+
+## Workspace Layout
+
+```
+Rules/    — Regulation documents, compliance notes from the developer user
+Samples/  — Sample documents for testing (your training set)
+Input/    — Production document batches awaiting verification
+Output/   — Verification results
+skills/   — Methodology skills (current phase's available set)
+.env      — Configuration: API keys, model tiers, thresholds, language
+```
+
+Note: KC's session workspace under `~/.kc_agent/workspaces/<sessionId>/`
+uses lowercase counterparts (`rules/`, `samples/`, `input/`, `output/`,
+`logs/`, `workflows/`, `rule_skills/`) — these are runtime-internal and
+separate from this project's user-facing folders above. The asymmetry
+is intentional: title-case for human-facing project dirs, lowercase for
+KC's working state.
+
+## Your Mission
+
+Follow this lifecycle. Each step references the skill(s) to consult.
+Always-loaded skills are already in your system prompt (above); other
+skills are listed under "Available Methodology Skills" and require
+`consult_skill(name)` to load the body.
+
+1. **Bootstrap** → `bootstrap-workspace` (always loaded). Understand the business scenario, read Rules/, scan Samples/, configure .env with the developer user.
+2. **Extract Rules** → `rule-extraction` (always loaded). Decompose regulation documents into atomic, testable verification rules.
+3. **Decompose Tasks** → `work-decomposition` (always loaded in skill_authoring). Decide ordering, grouping, and TaskBoard structure.
+4. **Map Rule Relationships** → `consult_skill("rule-graph")`. Identify shared entities, dependencies, and conflicts between rules. Each rule stays independently executable.
+5. **Write Rule Skills** → `skill-authoring` (always loaded in skill_authoring). Write each rule into a skill folder. Before writing extraction logic for a new document type, `consult_skill("data-sensibility")` to observe the data first.
+6. **Test Skills** → Apply each skill to Samples/. `evolution-loop` is always loaded in skill_testing — use it to diagnose failures and iterate. Continue until accuracy meets SKILL_ACCURACY threshold in .env.
+7. **Distill to Workflows** → `skill-to-workflow` (always loaded in distillation). Convert proven skills into Python code + worker LLM prompts. Test workflows against your own results as ground truth. Iterate until WORKFLOW_ACCURACY is met.
+8. **Production QC** → `quality-control` (always loaded in production_qc). Run workflows on Input/. Sample and review results based on confidence scores. For multi-document cases, `consult_skill("cross-document-verification")`. Use `evolution-loop` when quality drops.
+9. **Stabilize** → Gradually reduce monitoring as workflows prove reliable. Only intervene when rules change or quality drops.
+10. **Report** → `consult_skill("dashboard-reporting")`. Generate HTML dashboards so the developer user can see results, progress, and issues. Ensure dashboards include feedback collection mechanisms for users.
+
+Throughout: `consult_skill("version-control")` to track changes. `consult_skill("corner-case-management")` to handle edge cases without polluting workflows.
+
+## Core Principles
+
+- **Minimum viable model**: Always use the smallest, cheapest, fastest model that meets the accuracy threshold. Start simple, escalate only when necessary.
+- **JIT structure**: Do not design schemas or formats prematurely. Define them when needed, keep them consistent once defined.
+- **OTF evolution**: The system you build today may look completely different tomorrow. Embrace change.
+- **Skills before workflows**: Prove each rule works as a skill (you executing it) before distilling into code + worker LLM prompts.
+- **Log everything**: Every test iteration, every evolution decision, every version change. Both JSON (machine-readable) and plain text (human-readable).
+
+## How to Use Skills
+
+Skills are loaded in two ways:
+
+1. **Always loaded** — bodies are inline in this system prompt above the project orientation. These are the architecturally-required skills for the current phase. Treat them as authoritative.
+2. **Available — call consult_skill(name)** — listed by name + description in the system prompt under "Available Methodology Skills." Call `consult_skill("<name>")` to load the body into your conversation history when the description tease isn't enough.
+
+The skill body is the methodology. Skills convey philosophy and decision frameworks. Adapt them to the specific business case. Do not follow them rigidly.
+
+## Communication with Developer User
+
+- **Proactively discuss**: rule granularity, accuracy thresholds, model selection, edge cases.
+- **Report progress**: after each testing round, share results and next steps.
+- **Escalate**: when you cannot resolve an issue after iterating, surface it with evidence.
+- **Ask**: the developer user is a domain expert. When in doubt about a rule's intent, ask.
+
+---
+
+# KC Reborn — 文档核查工作区
+
+> **技能优先级**: meta-meta 技能是架构层面 —— 当指导冲突时,
+> meta-meta 凌驾于 meta (技法层面) 之上。架构师的框架约束技法。
+> 如果你发现自己在为了遵循一条 meta 程序而绕开一条 meta-meta
+> 原则,停下 —— 框架应当约束技法,而不是反过来。每个技能在
+> YAML frontmatter 中声明自己的层级 (`tier: meta-meta` 或
+> `tier: meta`)。
+
+## 这是什么
+
+你是一个编程智能体,负责为开发者用户的具体业务场景构建文档核查应用。`skills/` 中的元技能编码了资深核查系统架构师和业务分析师的方法论。你负责运用智慧和判断力,将这些方法论应用到具体场景中。
+
+你的目标:构建一个核查系统,先由你亲自执行核查工作,然后逐步将你的能力蒸馏为由 Worker LLM(执行模型)驱动的低成本、高速度的工作流。你是基准真值。你创建的工作流是最终交付物。
+
+## 角色定义
+
+- **开发者用户**:你服务的人。他们是领域专家(如银行信贷部门的技术负责人)。他们提供规则、文档和业务背景。与他们讨论决策。
+- **你(编程智能体)**:你既是构建者(创建技能和工作流),也是观察者(评判质量)。你先执行核查,证明方法可行,再教小模型复现你的结果。
+- **Worker LLM**:执行者。在 `.env` 中配置的模型(TIER1到TIER4),将执行你构建的工作流。你的任务是为每项工作找到能胜任的最小模型。
+
+## 工作区结构
+
+```
+Rules/    — 法规文件、开发者用户的合规注释
+Samples/  — 用于测试的样本文件(你的训练集)
+Input/    — 等待核查的生产批次文件
+Output/   — 核查结果
+skills/   — 当前阶段可用的方法论技能
+.env      — 配置:API密钥、模型层级、阈值、语言
+```
+
+注:KC 在 `~/.kc_agent/workspaces/<sessionId>/` 下的会话工作区使用
+小写对应目录(`rules/`、`samples/`、`input/`、`output/`、`logs/`、
+`workflows/`、`rule_skills/`)—— 这些是运行时内部目录,与本项目上面
+那些用户可见的目录是分开的。这种大小写不对称是有意的:项目里给人看
+的目录用首字母大写;KC 自己的工作状态用小写。
+
+## 你的使命
+
+遵循以下生命周期。常驻加载的技能已经在你的系统提示词中;其他技能在"可用方法论技能"清单里列出,调 `consult_skill(name)` 才能加载正文。
+
+1. **初始化** → `bootstrap-workspace`(常驻)。理解业务场景,阅读 Rules/,浏览 Samples/,与开发者用户配置 .env。
+2. **提取规则** → `rule-extraction`(常驻)。将法规文件分解为原子级、可测试的核查规则。
+3. **任务分解** → `work-decomposition`(skill_authoring 常驻)。决定顺序、分组以及 TaskBoard 结构。
+4. **构建规则图谱** → `consult_skill("rule-graph")`。识别规则间的共享实体、依赖关系和潜在冲突。每条规则保持独立可执行。
+5. **编写规则技能** → `skill-authoring`(skill_authoring 常驻)。将每条规则写入技能文件夹。编写新文档类型的提取逻辑前,先 `consult_skill("data-sensibility")` 观察数据。
+6. **测试技能** → 在 Samples/ 上应用每个技能。`evolution-loop` 在 skill_testing 常驻 —— 用它诊断失败并迭代。直到准确率达到 .env 中的 SKILL_ACCURACY 阈值。
+7. **蒸馏为工作流** → `skill-to-workflow`(distillation 常驻)。将验证过的技能转化为 Python 代码 + Worker LLM 提示词。用你自己的结果作为基准测试工作流。迭代直到达到 WORKFLOW_ACCURACY。
+8. **生产质控** → `quality-control`(production_qc 常驻)。在 Input/ 上运行工作流。根据置信度分数抽样审查结果。涉及多文档案件时,`consult_skill("cross-document-verification")`。质量下降时使用 `evolution-loop`。
+9. **稳定运行** → 随着工作流稳定,逐步降低监控频率。仅在规则变更或质量下降时介入。
+10. **报告** → `consult_skill("dashboard-reporting")`。生成 HTML 仪表板,让开发者用户直观地看到结果、进度和问题。确保仪表盘内置用户反馈收集机制。
+
+全程:用 `consult_skill("version-control")` 跟踪所有变更,用 `consult_skill("corner-case-management")` 处理边缘案例,不要污染主工作流。
+
+## 核心原则
+
+- **最小可用模型**:始终使用能达到准确率阈值的最小、最便宜、最快的模型。从简单开始,必要时才升级。
+- **即时结构(JIT)**:不要过早设计数据结构或格式。需要时定义,定义后保持一致。
+- **即时演进(OTF)**:你今天构建的系统明天可能面目全非。拥抱变化。
+- **先技能后工作流**:先证明每条规则作为技能(你执行)可行,再蒸馏为代码 + Worker LLM 提示词。
+- **记录一切**:每次测试迭代、每个演进决策、每次版本变更。同时保存 JSON(机器可读)和纯文本(人类可读)。
+
+## 如何使用技能
+
+技能通过两种方式加载:
+
+1. **常驻加载** —— 技能正文直接出现在本系统提示词里、项目说明的上方。这些是当前阶段架构上必需的技能,把它们的内容当作权威指导。
+2. **可用 —— 调 consult_skill(name)** —— 在系统提示词的"可用方法论技能"清单里按名字 + 描述列出。当描述简介不够用时,调 `consult_skill("<名字>")` 把技能正文加载到你的对话历史里。
+
+技能正文是方法论本身。技能传达的是理念和决策框架。请根据具体业务场景灵活运用,不要机械照搬。
+
+## 与开发者用户的沟通
+
+- **主动讨论**:规则粒度、准确率阈值、模型选择、边缘案例。
+- **汇报进度**:每轮测试后,分享结果和下一步计划。
+- **升级问题**:迭代后仍无法解决的问题,附带证据提交给开发者用户。
+- **多问**:开发者用户是领域专家。对规则意图有疑问时,问他们。
+
+---
+
+## Per-project memory (you maintain this section)
+
+The sections below are your scratchpad for this specific project. Update them as you learn about the business scenario, decisions, and edge cases. They persist across your sessions on this project.
+
+### Project
 
 <!-- What domain? What regulations? What documents? Fill this in during bootstrap. -->
 
-
+### Decisions
 
 <!-- Key decisions made with the developer user. Rule granularity, accuracy targets, model choices, scope boundaries. -->
 
-
+### Domain Notes
 
 <!-- Terminology, document formats, naming conventions, edge cases specific to this domain. -->
 
-
+### User Preferences
 
 <!-- How the developer user prefers to communicate. Reporting format, language, level of detail. -->
@@ -1,5 +1,6 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: bootstrap-workspace
|
|
3
|
+
tier: meta-meta
|
|
3
4
|
description: Initialize and configure a document verification workspace. Use when a developer user first opens this workspace, when .env needs configuration, or when the business scenario needs to be understood. Guides the coding agent through reading regulation documents, understanding the developer user's business context, configuring model tiers and thresholds, and establishing the working relationship. Covers initial conversation with developer user to scope the verification task, set expectations, and agree on checkpoints.
|
|
4
5
|
---
|
|
5
6
|
|
|
@@ -72,6 +73,20 @@ Once a project is past bootstrap and into production, fresh documents often arri
|
|
|
72
73
|
|
|
73
74
|
Discuss the cadence with the developer user during bootstrap — knowing the production input rhythm shapes how skills and workflows should be written (batch vs streaming, idempotency requirements, etc.).
|
|
74
75
|
|
|
76
|
+
## Per-project memory: keep AGENT.md alive
|
|
77
|
+
|
|
78
|
+
`AGENT.md` at the workspace root has per-project memory sections (`Project`, `Decisions`, `Domain Notes`, `User Preferences`). These are intentionally placeholder comments at bootstrap — they're for YOU to fill in as the work surfaces things worth remembering across phases or future sessions.
|
|
79
|
+
|
|
80
|
+
What belongs there:
|
|
81
|
+
- **Project**: corpus identity (regulation name + scope), language, primary vs auxiliary rules, sample doc set composition.
|
|
82
|
+
- **Decisions**: design choices that aren't obvious from code — "non-标 35% limit is bank-level not per-product, so single-doc reports get WARNING not FAIL", "季报 not applicable for R02-06/R02-08 per regulation §39", etc.
|
|
83
|
+
- **Domain Notes**: regulatory or business-domain nuance worth surfacing — "PT/RT/LZ are three distinct product types with different disclosure templates", terminology disambiguation.
|
|
84
|
+
- **User Preferences**: how the developer user wants you to operate on THIS project — verbosity, naming conventions, when to ask vs proceed.
|
|
85
|
+
|
|
86
|
+
Update AGENT.md at natural checkpoints: after the developer user gives you a substantive clarification, after you finish a phase, after you discover a design constraint that affects subsequent phases. Don't wait for a `/remember` instruction — the memory is yours to maintain.
|
|
87
|
+
|
|
88
|
+
A future session resumes by reading AGENT.md first. The richer it is, the less re-explanation the developer user has to do.
|
|
89
|
+
|
|
75
90
|
## When to Re-Bootstrap
|
|
76
91
|
|
|
77
92
|
Return to this skill when:
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: compliance-judgment
|
|
3
|
+
tier: meta
|
|
3
4
|
description: Determine whether extracted entities comply with verification rules. Use after entity extraction to make the pass/fail judgment for each rule on each document. Covers translating natural language rules into executable logic, choosing between Python calculation and LLM semantic judgment, and producing actionable comments on failures. Also use when designing the judgment step of a workflow or when a rule's judgment logic needs debugging.
|
|
4
5
|
---
|
|
5
6
|
|
|
@@ -1,5 +1,6 @@
 ---
 name: confidence-system
+tier: meta
 description: Design and calibrate confidence scoring for extraction and verification results. Use when building any workflow that needs to quantify trust in its output, when setting up quality control sampling thresholds, or when calibrating existing confidence scores against actual accuracy. Confidence is the bridge between workflows and quality control — high confidence means less review, low confidence means more review. Also use when the quality control skill reports that confidence scores do not correlate with actual correctness.
 ---
@@ -1,5 +1,6 @@
 ---
 name: corner-case-management
+tier: meta
 description: Identify, catalog, and handle corner cases that do not fit the mainstream verification workflow. Use when the evolution loop classifies a failure as a corner case (affecting less than ~10% of documents), when adding a new edge case to the registry, or when deciding whether a corner case should be promoted to a systemic fix. Also use when designing the corner case detection mechanism for a workflow.
 ---
package/template/skills/en/{meta/cross-document-verification → cross-document-verification}/SKILL.md
RENAMED
@@ -1,5 +1,6 @@
 ---
 name: cross-document-verification
+tier: meta
 description: Perform case-level analysis across multiple documents for the same transaction. Use when documents do not exist in isolation — main contracts have appendices, loan applications come bundled with income certificates, bank statements, credit reports, and property appraisals. Use to build comparison matrices, detect contradictions (hard mismatches and soft implausibilities), classify severity, and flag fraud signals. Also use when a user or end-user reports a cross-document inconsistency — these reports are ground truth and take priority over agent judgment.
 ---
@@ -1,5 +1,6 @@
 ---
 name: dashboard-reporting
+tier: meta-meta
 description: Generate HTML dashboards for developer users to visualize verification results, system progress, and quality metrics. Use when a testing round completes, when production batches finish processing, when the developer user wants to see the system's status, or at any point where visual reporting would help communicate progress. Dashboards should be self-contained HTML files that can be opened by double-clicking. Also use when the developer user asks about results, accuracy, or system health.
 ---
@@ -1,5 +1,6 @@
 ---
 name: data-sensibility
+tier: meta
 description: Build intuition about document data before writing extraction logic. Use before designing any extraction schema or regex pattern, when onboarding a new document type, or when extraction accuracy is unexpectedly low and you suspect a data assumption is wrong. Covers systematic observation of raw documents, spot-checking extracted results, distribution analysis, and recognizing suspicious patterns. If you are about to write code that touches document data and you have not read at least five documents end-to-end, stop and use this skill first.
 ---
@@ -1,5 +1,6 @@
 ---
 name: document-parsing
+tier: meta
 description: Parse source documents into machine-readable text with maximum fidelity. Use when processing any document in Samples/ or Input/ for the first time, when parsed text quality is poor, or when tables and charts need special handling. Covers multi-level parser selection from simple text extraction to OCR and vision models. Also use when a verification rule fails due to parsing issues (garbled text, missing tables, mangled layouts) and the parser needs to be upgraded for that document type.
 ---
@@ -1,5 +1,6 @@
 ---
 name: entity-extraction
+tier: meta
 description: Extract specific entities, values, and text segments from documents as required by verification rules. Use after tree processing has located the relevant section, when a rule needs a specific number, date, name, amount, clause, or any domain-specific entity extracted. Covers extraction method selection (regex vs LLM), schema design, postprocessing, and confidence annotation. Also use when designing the extraction step of a workflow for worker LLMs.
 ---
@@ -1,5 +1,6 @@
 ---
 name: evolution-loop
+tier: meta-meta
 description: Drive continuous improvement of skills and workflows through the diagnose-classify-fix-retest cycle. Use after any testing round reveals failures, when production quality control flags issues, or when accuracy drops below thresholds. Covers failure analysis, distinguishing systemic issues from corner cases, deciding whether to rewrite or patch, and knowing when to stop iterating. The evolution loop is the heartbeat of the system. Also use when transitioning between lifecycle phases (skill testing, workflow testing, production monitoring).
 ---
@@ -1,5 +1,6 @@
 ---
 name: quality-control
+tier: meta-meta
 description: Design and execute quality control for production verification workflows. Use when workflows are deployed on Input/ documents and results need to be monitored, when designing the QC sampling strategy for a rule, or when evaluating whether monitoring can be reduced. Covers LLM-as-Judge evaluation, adaptive sampling strategies, confidence-based triage, and the transition from active monitoring to stable oversight. Also use when production quality drops and you need to diagnose whether to trigger the evolution loop.
 ---
@@ -120,6 +121,15 @@ There are two distinct dashboards in this system:
 
 When a release is built, point end users at the bundled dashboard, not the workspace one. Workspace dashboard stays your developer surface.
 
+## Re-release after substantive changes
+
+A release bundle is a snapshot of `workflows/` and `rule_skills/` at the moment the `release` tool ran. If you modify any `workflows/<rule>/workflow_v*.py`, `rule_skills/<id>/SKILL.md`, or `check.py` AFTER the release was built, the shipped artifact no longer reflects your actual work. The engine's milestone derivation will surface `releaseIsStale: true` along with the list of divergent files.
+
+When this fires:
+- **Substantive change** (new hybrid path, fixed verdict logic, added rule): re-run the `release` tool to produce a fresh bundle.
+- **Cosmetic edit only** (typo, comment, formatting): write `.accept_stale_release` into the release directory to acknowledge it — `touch output/releases/<slug>/.accept_stale_release`.
+- **DON'T** declare finalization done while a stale release ships. Downstream consumers (other agents, deployed verification systems) read the bundled `parser_v*.py` / `workflows/`, not the workspace.
+
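The staleness decision can also be probed locally before re-running the tool. A minimal sketch, assuming illustrative paths (`output/releases/demo/`) and plain byte comparison — the engine's own `releaseIsStale` derivation remains authoritative:

```python
# Sketch only: compare workspace sources against the shipped bundle.
# Any divergent file means either re-run `release` (substantive change)
# or touch `.accept_stale_release` (cosmetic). Paths are illustrative.
import filecmp
from pathlib import Path

def stale_files(workspace="workflows", bundle="output/releases/demo/workflows"):
    ws, bd = Path(workspace), Path(bundle)
    stale = []
    for src in sorted(ws.rglob("*.py")):
        shipped = bd / src.relative_to(ws)
        # Missing from the bundle, or content differs byte-for-byte
        if not shipped.exists() or not filecmp.cmp(src, shipped, shallow=False):
            stale.append(str(src.relative_to(ws)))
    return stale
```

An empty list means the bundle still matches the workspace; anything else maps onto the substantive-vs-cosmetic decision above.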
 ## Developer User Involvement
 
 The developer user should see QC results through the dashboard (see `dashboard-reporting`). Key metrics to surface:
@@ -1,5 +1,6 @@
 ---
 name: rule-extraction
+tier: meta
 description: Extract and organize business verification rules from regulation documents into discrete, testable units. Use when processing documents in Rules/ to identify individual verification rules, when decomposing a regulation into atomic checks, or when the developer user adds new regulation files. Covers reading regulation text, identifying rule boundaries, determining granularity, handling cross-references, and producing a rule catalog. Also use when rules are provided in structured formats like xlsx or csv.
 ---
@@ -1,5 +1,6 @@
 ---
 name: rule-graph
+tier: meta-meta
 description: Build and maintain a graph of relationships between verification rules — shared entities, logical dependencies, and conflicts. Use when analyzing the impact of a regulation change, when optimizing extraction to avoid duplicate work, when checking rule catalog completeness, or when rolling up document-level results into a summary. Critical constraint — the graph is an overlay for analysis, NOT a prerequisite for execution. Every rule must remain independently runnable.
 ---
@@ -1,5 +1,6 @@
 ---
 name: skill-authoring
+tier: meta
 description: Write each verification rule into a Claude Code skill folder following the official skill format. Use when converting extracted rules into skill folders, when iterating on existing rule skills after testing, or when the developer user wants to capture domain knowledge as a skill. Each skill folder must be self-contained with business logic in SKILL.md, code in scripts/, regulation context in references/, and sample data in assets/. Also use the bundled skill-creator for the full eval/iterate workflow.
 ---
@@ -27,6 +28,8 @@ rule-skills/
 
 Not every rule needs all of these. A simple threshold check might only need SKILL.md and a script. A complex semantic rule might need detailed references and many samples. Start minimal, add as needed during testing.
 
+**Filename case matters.** Use uppercase `SKILL.md` (matching the meta-skill convention you see in `template/skills/`). On Linux filesystems this is case-sensitive; engine path-matching, audit scripts, and downstream tooling all assume uppercase. Do not write `skill.md`, `Skill.md`, or any other case variant.
+
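The case rule above can be audited mechanically. A minimal sketch — the `rule_skills` root and folder layout are assumptions for illustration, not an engine API:

```python
# Sketch: flag skill files whose name is a case variant of SKILL.md.
# Case-sensitive tooling (Linux paths, engine matching) silently skips these.
from pathlib import Path

def wrong_case_skill_files(root="rule_skills"):
    bad = []
    for p in sorted(Path(root).rglob("*.md")):
        # Same name ignoring case, but not the exact uppercase convention
        if p.name.lower() == "skill.md" and p.name != "SKILL.md":
            bad.append(str(p))
    return bad
```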
 ## Granularity: 1 rule = 1 skill directory (default)
 
 Default to **one rule per skill directory**. Group rules into the same file ONLY when they meet BOTH:
@@ -46,6 +49,25 @@ E2E #4 demonstrated the cost: an agent wrote `unified_qc.py` to bypass 110 indiv
 
 If individual skills aren't running cleanly, the right response is to identify which ones break and fix them, not consolidate. The whole pipeline (extraction → skill_testing → distillation → production_qc) assumes one rule = one verifiable artifact.
 
+### Anti-pattern: stub SKILL.md OR stub check.py
+
+Each rule_skill folder MUST have BOTH a substantive `SKILL.md` AND a substantive `check.py` (or a `check.py` that imports and calls a workflow that does the real work). Either side being a stub breaks the contract.
+
+**Variant 1 (v0.7.5 loan (贷款) audit § 9.1)**: a stub `SKILL.md` (19 templated lines with `检查逻辑: N/A`, i.e. "check logic: N/A") paired with a real `check.py` (44-131 LOC of regex methodology). SKILL.md is supposed to be the human-readable methodology document. A reader scanning the rule folder for "what does this verify and why" gets nothing. The agent put all the methodology into `check.py` comments, which works for the engine but loses the deliverable framing.
+
+**Variant 2 (v0.7.5 asset-management (资管) audit § 3.4)**: a substantive `SKILL.md` (real methodology, PASS/FAIL criteria, regulation cross-refs) paired with a stub `check.py` (a 29-line scaffold returning `{"verdict": "NOT_APPLICABLE", "evidence": "Check requires worker LLM execution"}`). The real check logic lives in `workflows/<rule_id>/workflow.py` — but `check.py` doesn't import or call it. A user running `python rule_skills/R01-01/check.py document.txt` gets `NOT_APPLICABLE` on every input, which is misleading.
+
+**Variant 3 (legacy v0.7.0)**: a stub `check.py` returning `{"pass": null, "method": "stub"}` paired with an otherwise-real SKILL.md. The methodology is described but never executable.
+
+**The contract**:
+- ✓ DO: SKILL.md describes WHAT to check + WHY + WHEN to flag it. Substantive — typically 50-300 lines, not 19.
+- ✓ DO: check.py implements the check — EITHER substantive direct logic OR `from workflows.<rule_id>.workflow_v1 import verify` plus delegation. It returns concrete verdicts.
+- ✗ DON'T: stub SKILL.md with the methodology in check.py comments (variant 1).
+- ✗ DON'T: substantive SKILL.md with a check.py that returns NOT_APPLICABLE without delegating to a workflow (variant 2).
+- ✗ DON'T: stub check.py returning a null verdict (variant 3, legacy).
+
+A future engine milestone check (v0.8 P2-F) may refuse phase advance if too many check.py files are stub-shaped. Better to author them substantively now.
+
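A minimal sketch of the DO side of the contract. The rule id, keyword, and inline `verify` stand-in are illustrative — in a real folder `verify` would be imported from `workflows/<rule_id>/workflow_v1.py` rather than defined locally:

```python
# Illustrative check.py shape: delegate to the workflow, return a concrete
# verdict — never a stubbed NOT_APPLICABLE or a null "pass".
import json
import sys

def verify(document_text):
    # Stand-in for `from workflows.R01_01.workflow_v1 import verify`;
    # the keyword is a made-up example of direct check logic.
    hit = "annualized rate" in document_text.lower()
    return {"verdict": "PASS" if hit else "FAIL",
            "evidence": "disclosure found" if hit else "no disclosure found"}

def check(document_text):
    return verify(document_text)  # thin delegation, no stub logic

if __name__ == "__main__" and len(sys.argv) > 1:
    text = open(sys.argv[1], encoding="utf-8").read()
    print(json.dumps(check(text), ensure_ascii=False))
```

Run standalone (`python rule_skills/R01-01/check.py document.txt`), this returns a real verdict on every input, which is exactly what the variants above fail to do.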
 ## Writing SKILL.md
 
 ### Frontmatter
@@ -101,6 +123,24 @@ Scripts should be self-contained Python files that can be imported or executed.
 
 Do not put LLM prompts in scripts. LLM interactions belong in the SKILL.md body or in the workflow (later phase).
 
+### Strip reviewer annotations before keyword matching
+
+Sample documents often carry reviewer-annotation footers (`预期命中点: ...` ("expected hit points"), `标注: ...` ("annotation"), `Expected: ...`) that mark the ground-truth verdict for testing. If your check.py uses keyword/regex matching against the document body, these annotations will leak into the match — producing a false-positive PASS on violation samples (your rule "finds" the disclosure keyword inside the annotation itself, not in the actual document content).
+
+The canonical helper ships at `workflows/common/utils.py` and is auto-populated into every workspace at engine init:
+
+```python
+from workflows.common.utils import strip_annotations
+
+def check(document_text):
+    text = strip_annotations(document_text)
+    # ... your real check logic against `text`, not document_text
+```
+
+Recognized prefixes (Chinese + English variants): 预期命中点, 预期结果, 预期判定, 预期验证, 标注, 审核标注, Expected, expected, EXPECTED, Annotation, annotation. Pass `extra_prefixes=("...", "...")` if your project uses different labels.
+
+E2E #11 loan (贷款) v0.8 audit: 4/14 rules had standalone check.py false-positive PASS on violation samples because they matched the `预期命中点: ...年化利率` footer instead of the document body. v0.8.1 ships the helper as a template file so this trap is one import away from being avoided.
+
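The shipped `strip_annotations` lives in `workflows/common/utils.py`; as a rough illustration of the prefix-based line filtering it performs, here is a hedged re-implementation (a guess at the shape, not the actual shipped code, and with a shortened prefix list):

```python
# Sketch of annotation stripping: drop any line whose first token is a
# known reviewer-annotation prefix followed by a colon (ASCII or fullwidth).
DEFAULT_PREFIXES = ("预期命中点", "预期结果", "预期判定", "预期验证",
                    "标注", "审核标注", "Expected", "expected", "EXPECTED",
                    "Annotation", "annotation")

def strip_annotations(text, extra_prefixes=()):
    prefixes = DEFAULT_PREFIXES + tuple(extra_prefixes)
    # Match both "Expected:" and the fullwidth "Expected："
    markers = tuple(p + c for p in prefixes for c in (":", "："))
    kept = [ln for ln in text.splitlines()
            if not ln.lstrip().startswith(markers)]
    return "\n".join(kept)
```

Whatever the real implementation does, the contract to rely on is the one stated above: annotation footer lines are removed, everything else passes through untouched.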
 ## Writing References
 
 `references/` holds content that the coding agent reads on demand:
@@ -1,6 +1,7 @@
 ---
 name: skill-creator
-
+tier: meta
+description: Anthropic's skill-scaffolding toolkit — use for iterating/improving existing skills or running evals on them, NOT as the primary reference for building KC's per-rule verification skills. For KC rule skills, consult `skill-authoring` first (canonical folder layout + granularity rules + KC-specific check.py entry-point conventions) and `work-decomposition` for ordering + grouping decisions. This skill applies once per-rule skills exist and the agent wants to optimize their description/triggering or run formal evals.
 ---
 
 # Skill Creator