kc-beta 0.7.1 → 0.7.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,194 @@
1
+ import { BaseTool, ToolResult } from "./base.js";
2
+
3
+ const TASKS_REL = "tasks.json";
4
+
5
+ /**
6
+ * v0.7.3 — TaskCreate / TaskUpdate / TaskComplete tools.
7
+ *
8
+ * Completes the v0.7.0 "agent owns TaskBoard" design. The engine no longer
9
+ * auto-populates per-rule tasks on phase entry (PER_RULE_PHASES is empty by
10
+ * default — see task-manager.js); the agent reads the rule list via
11
+ * describeState, picks a decomposition (single / grouped / range / non-rule),
12
+ * and calls these tools to populate tasks.json. The Ralph loop in
13
+ * AgentEngine._runTaskLoopSerial then walks pending tasks one at a time.
14
+ *
15
+ * Skill teaching for these tools lives in
16
+ * template/skills/{en,zh}/meta-meta/work-decomposition/SKILL.md.
17
+ *
18
+ * tasks.json is a shared-coordination path (workspace.js
19
+ * SHARED_COORDINATION_PATHS) — every write goes through
20
+ * withSharedLockIfApplicable so two writers (main + subagent) serialize.
21
+ */
22
+
23
+ export class TaskCreateTool extends BaseTool {
24
+ constructor(workspace, taskManager) {
25
+ super();
26
+ this._workspace = workspace;
27
+ this._taskManager = taskManager;
28
+ }
29
+
30
+ get name() { return "TaskCreate"; }
31
+
32
+ get description() {
33
+ return (
34
+ "Add a task to the session task board. Tasks gate the Ralph loop — " +
35
+ "after the current turn ends, the engine pulls the next pending task " +
36
+ "and runs it. Use one task per unit of work you want to iterate on " +
37
+ "(per-rule, per-group, per-document — your decomposition). " +
38
+ "Call this on phase entry after reading describeState."
39
+ );
40
+ }
41
+
42
+ get inputSchema() {
43
+ return {
44
+ type: "object",
45
+ properties: {
46
+ id: {
47
+ type: "string",
48
+ description: "Unique task ID within this session (e.g. 'R001-skill_authoring' or 'group-trust-1').",
49
+ },
50
+ title: {
51
+ type: "string",
52
+ description: "Short human-readable title for the task.",
53
+ },
54
+ phase: {
55
+ type: "string",
56
+ description: "Phase this task belongs to (e.g. 'skill_authoring', 'skill_testing', 'distillation').",
57
+ },
58
+ ruleId: {
59
+ type: "string",
60
+ description: "Optional rule_id if this is a per-rule task. Omit for grouped or non-rule tasks.",
61
+ },
62
+ },
63
+ required: ["id", "title", "phase"],
64
+ };
65
+ }
66
+
67
+ async execute(input) {
68
+ const id = input.id || "";
69
+ const title = input.title || "";
70
+ const phase = input.phase || "";
71
+ const ruleId = input.ruleId || null;
72
+
73
+ if (!id) return new ToolResult("id required", true);
74
+ if (!title) return new ToolResult("title required", true);
75
+ if (!phase) return new ToolResult("phase required", true);
76
+
77
+ return await this._workspace.withSharedLockIfApplicable(TASKS_REL, () => {
78
+ const before = this._taskManager.getAllTasks().some((t) => t.id === id);
79
+ this._taskManager.addTask({ id, title, phase, ruleId });
80
+ if (before) {
81
+ return new ToolResult(`Task ${id} already existed (no-op).`);
82
+ }
83
+ const p = this._taskManager.progress;
84
+ return new ToolResult(
85
+ `Task ${id} created. Board: ${p.pending} pending, ${p.inProgress} in_progress, ${p.completed} completed.`,
86
+ );
87
+ });
88
+ }
89
+ }
90
+
91
+ export class TaskUpdateTool extends BaseTool {
92
+ constructor(workspace, taskManager) {
93
+ super();
94
+ this._workspace = workspace;
95
+ this._taskManager = taskManager;
96
+ }
97
+
98
+ get name() { return "TaskUpdate"; }
99
+
100
+ get description() {
101
+ return (
102
+ "Update a task's status and optional summary. Status: 'pending', " +
103
+ "'in_progress', 'completed', or 'failed'. Use TaskComplete instead " +
104
+ "for the common case of marking a task done with a summary."
105
+ );
106
+ }
107
+
108
+ get inputSchema() {
109
+ return {
110
+ type: "object",
111
+ properties: {
112
+ id: { type: "string", description: "Task ID to update." },
113
+ status: {
114
+ type: "string",
115
+ enum: ["pending", "in_progress", "completed", "failed"],
116
+ description: "New status for the task.",
117
+ },
118
+ summary: {
119
+ type: "string",
120
+ description: "Optional short summary (e.g. why the task failed, what was produced).",
121
+ },
122
+ },
123
+ required: ["id"],
124
+ };
125
+ }
126
+
127
+ async execute(input) {
128
+ const id = input.id || "";
129
+ const status = input.status;
130
+ const summary = input.summary;
131
+
132
+ if (!id) return new ToolResult("id required", true);
133
+
134
+ return await this._workspace.withSharedLockIfApplicable(TASKS_REL, () => {
135
+ const exists = this._taskManager.getAllTasks().some((t) => t.id === id);
136
+ if (!exists) return new ToolResult(`Task ${id} not found.`, true);
137
+ this._taskManager.updateTask(id, { status, summary });
138
+ const p = this._taskManager.progress;
139
+ return new ToolResult(
140
+ `Task ${id} updated${status ? ` to ${status}` : ""}. ` +
141
+ `Board: ${p.pending} pending, ${p.inProgress} in_progress, ${p.completed} completed, ${p.failed} failed.`,
142
+ );
143
+ });
144
+ }
145
+ }
146
+
147
+ export class TaskCompleteTool extends BaseTool {
148
+ constructor(workspace, taskManager) {
149
+ super();
150
+ this._workspace = workspace;
151
+ this._taskManager = taskManager;
152
+ }
153
+
154
+ get name() { return "TaskComplete"; }
155
+
156
+ get description() {
157
+ return (
158
+ "Mark a task as completed with an optional summary. Sugar for " +
159
+ "TaskUpdate({id, status: 'completed', summary}). The Ralph loop " +
160
+ "advances to the next pending task after this returns."
161
+ );
162
+ }
163
+
164
+ get inputSchema() {
165
+ return {
166
+ type: "object",
167
+ properties: {
168
+ id: { type: "string", description: "Task ID to complete." },
169
+ summary: {
170
+ type: "string",
171
+ description: "Optional short summary of what was produced.",
172
+ },
173
+ },
174
+ required: ["id"],
175
+ };
176
+ }
177
+
178
+ async execute(input) {
179
+ const id = input.id || "";
180
+ const summary = input.summary;
181
+
182
+ if (!id) return new ToolResult("id required", true);
183
+
184
+ return await this._workspace.withSharedLockIfApplicable(TASKS_REL, () => {
185
+ const exists = this._taskManager.getAllTasks().some((t) => t.id === id);
186
+ if (!exists) return new ToolResult(`Task ${id} not found.`, true);
187
+ this._taskManager.markDone(id, summary);
188
+ const p = this._taskManager.progress;
189
+ return new ToolResult(
190
+ `Task ${id} completed. Board: ${p.pending} pending, ${p.inProgress} in_progress, ${p.completed} completed.`,
191
+ );
192
+ });
193
+ }
194
+ }
@@ -30,7 +30,9 @@ export class WorkspaceFileTool extends BaseTool {
30
30
  "Read, write, or list files. " +
31
31
  "scope='workspace' (default): KC's working directory for rules, skills, workflows, results. " +
32
32
  "scope='project': the user's project folder where KC was launched — source regulations and samples live here. " +
33
- "Operations: read (returns file content), write (creates/overwrites a file), list (shows directory contents)."
33
+ "Operations: read (returns file content), write (creates/overwrites a file), list (shows directory contents). " +
34
+ "read returns up to 50,000 chars per call; longer files are truncated. " +
35
+ "For full reads of regulation/rule documents (typically smaller than this cap), prefer this tool over sandbox_exec."
34
36
  );
35
37
  }
36
38
 
@@ -87,7 +89,7 @@ export class WorkspaceFileTool extends BaseTool {
87
89
 
88
90
  try {
89
91
  if (op === "read") return this._read(filePath, scope);
90
- if (op === "write") return this._write(filePath, content, scope);
92
+ if (op === "write") return await this._write(filePath, content, scope);
91
93
  if (op === "list") return this._list(filePath, scope);
92
94
  return new ToolResult(`Unknown operation: ${op}`, true);
93
95
  } catch (err) {
@@ -107,56 +109,68 @@ export class WorkspaceFileTool extends BaseTool {
107
109
  return new ToolResult(text);
108
110
  }
109
111
 
110
- _write(filePath, content, scope) {
112
+ async _write(filePath, content, scope) {
111
113
  if (!filePath || filePath === ".") {
112
114
  return new ToolResult("Path required for write operation", true);
113
115
  }
114
116
  const resolved = this._resolveForScope(filePath, scope);
115
- fs.mkdirSync(path.dirname(resolved), { recursive: true });
116
-
117
- // v0.7.0 Group M (#84 remainder): on case-insensitive filesystems
118
- // (macOS/Windows defaults), warn when the target's basename collides
119
- // with an existing sibling differing only in case. Write proceeds
120
- // — agents may legitimately overwrite — but the agent gets visible
121
- // signal so it doesn't end up confused like E2E #5 GLM ("SKILL.md
122
- // disappeared" when the inode was shared with skill.md). Workspace-
123
- // scope only; project-dir scope is the user's territory.
124
- let collisionNote = "";
125
- if (
126
- scope === "workspace" &&
127
- this._workspace.fsCaseSensitive === false
128
- ) {
129
- try {
130
- const parent = path.dirname(resolved);
131
- const targetBase = path.basename(resolved);
132
- const targetLower = targetBase.toLowerCase();
133
- const siblings = fs.readdirSync(parent);
134
- const collision = siblings.find(
135
- (s) => s !== targetBase && s.toLowerCase() === targetLower,
136
- );
137
- if (collision) {
138
- collisionNote =
139
- ` ⚠ case-collision: case-insensitive filesystem already has '${collision}'` +
140
- ` at this path; both names resolve to the same inode. Pick one canonical case` +
141
- ` (lowercase preferred for skill files) and use it consistently — otherwise` +
142
- ` archive_file / Read on either name affects the other.`;
143
- }
144
- } catch { /* readdirSync may fail on a fresh dir; that's fine, no collision possible */ }
145
- }
146
117
 
147
- fs.writeFileSync(resolved, content, "utf-8");
118
+ const doWrite = () => {
119
+ fs.mkdirSync(path.dirname(resolved), { recursive: true });
120
+
121
+ // v0.7.0 Group M (#84 remainder): on case-insensitive filesystems
122
+ // (macOS/Windows defaults), warn when the target's basename collides
123
+ // with an existing sibling differing only in case. Write proceeds
124
+ // — agents may legitimately overwrite — but the agent gets visible
125
+ // signal so it doesn't end up confused like E2E #5 GLM ("SKILL.md
126
+ // disappeared" when the inode was shared with skill.md). Workspace-
127
+ // scope only; project-dir scope is the user's territory.
128
+ let collisionNote = "";
129
+ if (
130
+ scope === "workspace" &&
131
+ this._workspace.fsCaseSensitive === false
132
+ ) {
133
+ try {
134
+ const parent = path.dirname(resolved);
135
+ const targetBase = path.basename(resolved);
136
+ const targetLower = targetBase.toLowerCase();
137
+ const siblings = fs.readdirSync(parent);
138
+ const collision = siblings.find(
139
+ (s) => s !== targetBase && s.toLowerCase() === targetLower,
140
+ );
141
+ if (collision) {
142
+ collisionNote =
143
+ ` ⚠ case-collision: case-insensitive filesystem already has '${collision}'` +
144
+ ` at this path; both names resolve to the same inode. Pick one canonical case` +
145
+ ` (lowercase preferred for skill files) and use it consistently — otherwise` +
146
+ ` archive_file / Read on either name affects the other.`;
147
+ }
148
+ } catch { /* readdirSync may fail on a fresh dir; that's fine, no collision possible */ }
149
+ }
150
+
151
+ fs.writeFileSync(resolved, content, "utf-8");
152
+
153
+ // Auto-commit to git for workspace writes (silently no-ops if gitignored or git unavailable)
154
+ let traceId = null;
155
+ if (scope === "workspace") {
156
+ traceId = this._workspace.autoCommit(filePath, "update");
157
+ }
158
+
159
+ const label = scope === "project" ? `[project] ${filePath}` : filePath;
160
+ let msg = `Wrote ${content.length} chars to ${label}`;
161
+ if (traceId) msg += ` [trace: ${traceId}]`;
162
+ if (collisionNote) msg += collisionNote;
163
+ return new ToolResult(msg);
164
+ };
148
165
 
149
- // Auto-commit to git for workspace writes (silently no-ops if gitignored or git unavailable)
150
- let traceId = null;
166
+ // v0.7.3: route writes to shared coordination paths (rules/catalog.json,
167
+ // tasks.json, refs/manifest.json, etc.) through the workspace lock so
168
+ // concurrent writers serialize. No-op for non-shared paths and for
169
+ // project-scope writes (project dir is the user's, not shared engine state).
151
170
  if (scope === "workspace") {
152
- traceId = this._workspace.autoCommit(filePath, "update");
171
+ return await this._workspace.withSharedLockIfApplicable(filePath, doWrite);
153
172
  }
154
-
155
- const label = scope === "project" ? `[project] ${filePath}` : filePath;
156
- let msg = `Wrote ${content.length} chars to ${label}`;
157
- if (traceId) msg += ` [trace: ${traceId}]`;
158
- if (collisionNote) msg += collisionNote;
159
- return new ToolResult(msg);
173
+ return doWrite();
160
174
  }
161
175
 
162
176
  _list(filePath, scope) {
package/src/config.js CHANGED
@@ -90,10 +90,12 @@ export function loadSettings(workspacePath) {
90
90
  tier3: env.TIER3 || gc.tiers?.tier3 || "",
91
91
  tier4: env.TIER4 || gc.tiers?.tier4 || "",
92
92
 
93
- // VLM tiers (vision/OCR models)
94
- vlmTier1: env.VLM_TIER1 || gc.vlm_tiers?.tier1 || "",
95
- vlmTier2: env.VLM_TIER2 || gc.vlm_tiers?.tier2 || "",
96
- vlmTier3: env.VLM_TIER3 || gc.vlm_tiers?.tier3 || "",
93
+ // VLM tiers (vision/OCR models). v0.7.3: accept OCR_MODEL_TIER* as
94
+ // alias since template/.env.template + initializer.js seed that name.
95
+ // VLM_TIER* takes precedence when both are set.
96
+ vlmTier1: env.VLM_TIER1 || env.OCR_MODEL_TIER1 || gc.vlm_tiers?.tier1 || "",
97
+ vlmTier2: env.VLM_TIER2 || env.OCR_MODEL_TIER2 || gc.vlm_tiers?.tier2 || "",
98
+ vlmTier3: env.VLM_TIER3 || env.OCR_MODEL_TIER3 || gc.vlm_tiers?.tier3 || "",
97
99
 
98
100
  // Worker LLM — optional, defaults to conductor config (process.env wins)
99
101
  workerProvider: penv.KC_WORKER_PROVIDER || gc.worker_provider || "",
@@ -0,0 +1,27 @@
1
+ // Single source of truth for the live KC CLI version string.
2
+ //
3
+ // Reads package.json once. Used by engine.js (passed to ReleaseTool so
4
+ // release manifests stamp the correct version) and by
5
+ // pipelines/finalization.js (anywhere it surfaces "Built by kc-beta X").
6
+ //
7
+ // Before v0.7.2, engine.js hardcoded `kcVersion: "0.5.2"` which leaked
8
+ // into every release manifest's `kc_beta_version` field regardless of
9
+ // the actual package version. Both v0.7.1 audit runs (DS + GLM)
10
+ // surfaced this. Reading package.json closes the gap.
11
+
12
+ import fs from "node:fs";
13
+ import path from "node:path";
14
+ import { fileURLToPath } from "node:url";
15
+
16
+ const __filename = fileURLToPath(import.meta.url);
17
+ const __dirname = path.dirname(__filename);
18
+
19
+ export function readKcVersion() {
20
+ try {
21
+ const pkgPath = path.resolve(__dirname, "../../package.json");
22
+ const pkg = JSON.parse(fs.readFileSync(pkgPath, "utf-8"));
23
+ return pkg.version || "unknown";
24
+ } catch {
25
+ return "unknown";
26
+ }
27
+ }
@@ -23,6 +23,13 @@ skills/ — Meta skills encoding verification methodology
23
23
  .env — Configuration: API keys, model tiers, thresholds, language
24
24
  ```
25
25
 
26
+ Note: KC's session workspace under `~/.kc_agent/workspaces/<sessionId>/`
27
+ uses lowercase counterparts (`rules/`, `samples/`, `input/`, `output/`,
28
+ `logs/`, `workflows/`, `rule_skills/`) — these are runtime-internal and
29
+ separate from this project's user-facing folders above. The asymmetry
30
+ is intentional: title-case for human-facing project dirs, lowercase for
31
+ KC's working state.
32
+
26
33
  ## Your Mission
27
34
 
28
35
  Follow this lifecycle. Each step references the skill(s) to consult:
@@ -93,6 +100,12 @@ skills/ — 编码核查方法论的元技能
93
100
  .env — 配置:API密钥、模型层级、阈值、语言
94
101
  ```
95
102
 
103
+ 注:KC 在 `~/.kc_agent/workspaces/<sessionId>/` 下的会话工作区使用
104
+ 小写对应目录(`rules/`、`samples/`、`input/`、`output/`、`logs/`、
105
+ `workflows/`、`rule_skills/`)—— 这些是运行时内部目录,与本项目上面
106
+ 那些用户可见的目录是分开的。这种大小写不对称是有意的:项目里给人看
107
+ 的目录用首字母大写;KC 自己的工作状态用小写。
108
+
96
109
  ## 你的使命
97
110
 
98
111
  遵循以下生命周期。每一步标注了需要参考的技能:
@@ -133,6 +133,65 @@ conversation or existing catalog. Therefore, when composing the brief:
133
133
  catalog.json.** rule_catalog uses workspace file locking;
134
134
  sandbox_exec bypasses it and races with other writers.
135
135
 
136
+ ## How to read regulation files (default: read whole)
137
+
138
+ Regulations are the audit's authoritative basis. Every `source_ref`
139
+ in your extracted rules must be verifiable against the source text.
140
+ For typical regulation documents (a single file under ~50 KB / under
141
+ ~100 pages), **read each regulation file whole using `workspace_file`
142
+ (operation=read) in a single call**:
143
+
144
+ ```js
145
+ workspace_file({ operation: "read", scope: "project", path: "Rules/01_some_regulation.md" })
146
+ ```
147
+
148
+ `workspace_file.read` is capped at 50,000 chars per call, which
149
+ covers virtually every individual regulation document. This is the
150
+ default. **Read every regulation file whole before you start
151
+ extracting rules from any of them.**
152
+
153
+ ### Tool choice — `workspace_file` vs `sandbox_exec`
154
+
155
+ | Tool | Per-call cap | Use for |
156
+ |---|---:|---|
157
+ | `workspace_file` (read) | 50,000 chars | **full reads of regulation / rule documents** |
158
+ | `sandbox_exec` (cat/head/etc) | 10,000 chars | shell commands, **not** full file reads |
159
+
160
+ `sandbox_exec` is designed for shell commands; its 10K cap is too
161
+ small for most regulations. `cat rules/01_*.md` returns only the
162
+ first ~10 KB followed by `\n[truncated]`. Re-issuing with `head -N` /
163
+ `tail -M` to scroll the window loses positional precision and burns
164
+ turns. **When you see truncation, don't fight the cap — switch
165
+ tools.**
166
+
167
+ ### Asymmetry — regs read whole, samples sampled
168
+
169
+ Regulations are limited (typically 1-10 files), authoritative, and
170
+ read once. Read every regulation whole.
171
+
172
+ Sample documents may number 30 to 1000+, are heterogeneous, and get
173
+ read many times during testing. **Don't try to read every sample
174
+ whole.** Use rule-applicability filters or sampled subsets to focus
175
+ attention.
176
+
177
+ ### Escape valve — when a single reg exceeds ~200K chars
178
+
179
+ Rare in practice. The largest regulation in `test_data_4` is 42 KB;
180
+ typical Chinese banking regs (资管新规, 信披办法, etc.) all fit
181
+ under 50 KB. But if you do encounter a single regulation so large
182
+ that reading it whole would crowd the context window — heuristic:
183
+ the file exceeds ~200,000 chars or ~25% of your context budget —
184
+ use your own judgment:
185
+
186
+ - Read by chapter (e.g., `第X章` / `Chapter X`) using `document_parse`
187
+ or paginated `workspace_file` reads
188
+ - Or build an in-workspace index file pointing to chapter offsets and
189
+ read on-demand per rule being extracted
190
+
191
+ The 50,000-char cap is high enough that this almost never triggers. **The
192
+ default is read whole; deviate only when the file genuinely doesn't
193
+ fit.**
194
+
136
195
  ## Extraction Strategies
137
196
 
138
197
  ### Strategy 1: Structured Input (Developer User Provides Rules)
@@ -223,6 +282,24 @@ Regulations are often ambiguous. When you encounter ambiguity:
223
282
 
224
283
  Do not skip ambiguous rules. They are often the most important ones.
225
284
 
285
+ ## Sanity-check applicability against the sample corpus
286
+
287
+ After extracting your rule catalog and before authoring skills, do this 5-minute check: project each rule's applicability filter against the sample corpus.
288
+
289
+ The procedure:
290
+ 1. Walk `samples/`, classify each by product type / report type / document format
291
+ 2. For each rule, count how many samples it would apply to (per the rule's `applicability` field, scope filter, or whatever shape your catalog uses)
292
+ 3. Flag rules that apply to **0 samples** — they're either genuinely test-corpus-irrelevant (acceptable) or over-constrained (bug)
293
+
294
+ E2E #7 GLM produced a 97-rule catalog where 36 rules (37%) had `PASS=0 FAIL=0 NOT_APPLICABLE=90` across all 90 documents — they never fired. Some were legit (rules for cash-management products with no cash-management samples in corpus), but 36 inactive of 97 was high enough to suggest scope-too-narrow drift.
295
+
296
+ If many rules are 0-sample, either:
297
+ - **Reframe their applicability** — broaden product types, look for evidence in headers/footers not just body, relax the scope filter
298
+ - **Document them as "future scope"** and remove from this iteration's catalog (still capture them in a `rules/future_scope.md` so they're not forgotten)
299
+ - **Update the test corpus** to include matching samples (work with the developer user)
300
+
301
+ Catching this in `rule_extraction` is much cheaper than authoring 36 skills that then test as inactive in `skill_testing`. The cheap projection here is worth the time it saves later.
302
+
226
303
  ## When Rules Change
227
304
 
228
305
  Regulations evolve. When the developer user adds new or updated regulation documents:
@@ -45,6 +45,32 @@ If yes, design a worker LLM prompt. Use the smallest model tier that maintains a
45
45
  ### The hybrid approach (most common)
46
46
  Most rules are a mix: regex extracts the number, Python compares it to the threshold, LLM handles the exceptional cases. Design the workflow as a pipeline where cheap steps run first and expensive steps run only when needed.
47
47
 
48
+ ### When regex alone isn't enough — decision rubric
49
+
50
+ Before declaring distillation complete, audit each rule's `verification_type` / `metric` / `evidence_type` (or equivalent fields in your catalog). For rules where the required verification is one of:
51
+
52
+ - **Semantic** ("is this a positive guarantee or a disclaimer?")
53
+ - **Contextual** ("interpret this in light of the document's product type")
54
+ - **Counterfactual** ("what should this value be, given the other fields?")
55
+ - **Cross-field arithmetic** ("does 期初 + 收益 - 分配 = 期末?")
56
+
57
+ regex alone rarely suffices. Three acceptable forms:
58
+
59
+ 1. **Pure regex with documented limits** — write the regex check, include a comment explaining the fragility (e.g., "matches syntactic pattern only; cannot detect semantic guarantees")
60
+ 2. **Hybrid regex + LLM** — regex baseline catches obvious cases, `worker_llm_call` (tier1-2) handles ambiguous ones. The hybrid workflow declares which rule_ids escalate.
61
+ 3. **Pure LLM via `worker_llm_call`** — for fully semantic rules where no regex baseline is meaningful.
62
+
63
+ Don't ship pure regex for a rule whose `verification_type` is `judgment` / `semantic` without the documented-limits note. Future-you or a colleague will assume the regex is sufficient and that bug will hide for months.
64
+
65
+ ### Worker LLM cost-aware tier choice
66
+
67
+ If you do escalate to LLM:
68
+ - **tier1** (most capable, ~¥0.001-0.002/doc): cross-field reasoning, ambiguity resolution, rules that benefit from chain-of-thought
69
+ - **tier2-3**: bulk extraction with simple semantic checks
70
+ - **tier4** (cheapest): high-volume keyword-spotting that regex can't handle. Note: tier4 models on SiliconFlow are Qwen3.5 thinking-mode — `content` can return empty if `reasoning_content` consumes max_tokens. Test with realistic prompts before relying. If you see empty responses, either bump max_tokens to ≥8192, shorten your prompt, or fall back to tier1-2.
71
+
72
+ Both v0.7.1 audit conductors (DS and GLM) defaulted to all-regex distillation and only added LLM escalation when the human user explicitly asked for "V2 with worker LLM". If your rule catalog has any rules where the verification is genuinely semantic, you should reach for `worker_llm_call` yourself — don't wait to be asked.
73
+
48
74
  ## Workflow Structure
49
75
 
50
76
  A workflow is a Python file (or small set of files) in `workflows/`:
@@ -85,7 +85,7 @@ Bundle multiple rules into a single task (and a single check_r###_r###.py file)
85
85
  - The judgment logic for one rule is a substring or close variant of the next
86
86
  - A single failure typically implies multiple failures (you can't pass R013 if R015 fails)
87
87
 
88
- Example: R013 / R015 / R017 all check that a specific table on page 3 of the report contains certain mandatory fields. Same chunk, same parse, same verdict shape. Bundle as `check_r013_r015_r017.py` and create a single TaskCreate task `R013/R015/R017 — required-fields table`. The engine's filesystem-derived milestones recognize the grouped check.py and credit all three rule_ids.
88
+ Example: R013 / R015 / R017 all check that a specific table on page 3 of the report contains certain mandatory fields. Same chunk, same parse, same verdict shape. Bundle as `check_r013_r015_r017.py` and create a single task: `TaskCreate({id: "R013-R015-R017-skill_authoring", title: "R013/R015/R017 — required-fields table", phase: "skill_authoring"})`. The engine's filesystem-derived milestones recognize the grouped check.py and credit all three rule_ids.
89
89
 
90
90
  ### When to keep separate
91
91
 
@@ -147,6 +147,41 @@ E2E #6 v070 surfaced this pattern (DS bundled-skill check.py files
147
147
  all returned `{"pass": null, "method": "stub"}` deferring to
148
148
  workflows/). v0.7.1 added this anti-pattern explicitly.
149
149
 
150
+ E2E #7 v071 showed the teaching prevented the stub anti-pattern in
151
+ both conductors (no `{"pass": null}` patterns in either run), but
152
+ **DS still inverted the canonical-vs-distilled relationship**: DS's
153
+ 6 thematic skill folders had SKILL.md only (no check.py), with the
154
+ real verification code living in `workflows/<skill>/check.py`. The
155
+ absence of stubs is good; the inversion is not — editing a rule then
156
+ requires touching both SKILL.md (the doc) and the workflow check.py
157
+ (the code). Single source of truth is lost.
158
+
159
+ GLM v071 by contrast landed the canonical pattern: 97/97 skills had
160
+ both SKILL.md AND a real `check.py` (median 143 LOC of regex +
161
+ applicability logic), and `workflows/<id>/workflow_v1.py` was a
162
+ 50-line thin wrapper that imported and called it:
163
+
164
+ ```python
165
+ # workflows/D01-01/workflow_v1.py — thin wrapper, 52 LOC
166
+ import importlib.util, json
167
+ from pathlib import Path
168
+
169
+ def run(doc_text: str, meta: dict = None) -> dict:
170
+ check_path = Path(__file__).parent.parent.parent / "rule_skills" / "D01-01" / "check.py"
171
+ spec = importlib.util.spec_from_file_location("check", check_path)
172
+ mod = importlib.util.module_from_spec(spec)
173
+ spec.loader.exec_module(mod)
174
+ result = mod.check(doc_text, meta)
175
+ result["_workflow"] = "D01-01_v1"
176
+ return result
177
+ ```
178
+
179
+ This is the v0.7.2+ canonical pattern: workflow is a shim that
180
+ points at the skill's check.py. To iterate on a rule's verification,
181
+ edit `rule_skills/<id>/check.py`. The workflow doesn't change. v0.7.2
182
+ clarifies the teaching: avoid stubs AND keep the canonical
183
+ relationship (skill is canonical, workflow is distilled wrapper).
184
+
150
185
  ### Naming convention for grouped checks
151
186
 
152
187
  When you do bundle, name the file with the explicit range:
@@ -309,18 +344,50 @@ When entering skill_authoring with an empty TaskBoard:
309
344
  5. **Pick the first task.** Work it to completion (skill + check + at least one local test). Update PATTERNS.md with whatever you learned. Move to the next task.
310
345
  6. **At task ~5 and task ~10:** stop and re-read PATTERNS.md. If patterns suggest a refactor of earlier work, do it now (cheap) rather than later (expensive).
311
346
 
312
- ### Why PATTERNS.md FIRST, before any skill code
347
+ ### Calling TaskCreate / TaskUpdate / TaskComplete
348
+
349
+ The engine registers three task-board tools (v0.7.3+):
350
+
351
+ - `TaskCreate({id, title, phase, ruleId?})` — adds a task to `tasks.json`. `id` must be unique within the session; pick a stable shape like `<rule_id>-<phase>` for per-rule tasks or `<group-name>-<phase>` for grouped / non-rule tasks. `phase` is the phase the task belongs to (current phase or a future phase you're pre-populating). `ruleId` is optional — set it for per-rule tasks so the engine can credit the rule_id in milestone derivation.
352
+ - `TaskUpdate({id, status?, summary?})` — updates a task's status to `pending` / `in_progress` / `completed` / `failed`, optionally with a short summary.
353
+ - `TaskComplete({id, summary?})` — sugar for `TaskUpdate({id, status:"completed", summary})`. Use this for the common path after finishing a unit of work.
354
+
355
+ After you call `TaskCreate` for your decomposition and exit the current turn, the Ralph loop pulls the next pending task and runs it. Finish the work, call `TaskComplete`, and the loop advances. If a task can't be completed (irrecoverable error), call `TaskUpdate({id, status:"failed", summary:"reason"})` so the queue moves on rather than blocking on the failed task.
356
+
357
+ Examples:
358
+
359
+ ```
360
+ TaskCreate({ id: "R001-skill_authoring", title: "Author skill for R001",
361
+ phase: "skill_authoring", ruleId: "R001" })
362
+
363
+ TaskCreate({ id: "trust-bundle-skill_authoring",
364
+ title: "R013/R015/R017 — required-fields table",
365
+ phase: "skill_authoring" })
366
+
367
+ TaskComplete({ id: "R001-skill_authoring",
368
+ summary: "regex check passes 89/90; R001 done" })
369
+ ```
370
+
371
+ ### Persisted methodology — PATTERNS.md OR phase logs OR AGENT.md decisions
372
+
373
+ The principle: capture framework-level decisions to disk before each phase advance. The conversation will compact, agents will restart, the next phase will lose grounding. Whichever format you pick, write to disk — don't rely on conversation context that disappears.
374
+
375
+ Three formats, each defensible. Pick one and stick with it:
376
+
377
+ - **`rules/PATTERNS.md`** — concise, framework-only, updated as the project progresses. Best for greenfield projects with clear hypothesis-up-front structure. Capped at ~5 KB; entries are transferable shapes / project constraints / anti-patterns with rationale (see "What to write" above).
313
378
 
314
- If you start writing skill code (rule_skills/<id>/check.py) before PATTERNS.md exists, **stop**. Even a 200-byte initial PATTERNS.md ("decided Shannon-Huffman; first hard rule R028 will dictate verdict shape; sample corpus has bilingual table headings") sets the framework. You'll save the time later not re-deriving the same shapes per rule.
379
+ - **`logs/phase_<name>_complete.md` per phase** — incremental, captures what each phase produced + decisions made + what the next phase inherits. Best for iterative discovery work where the framework crystallizes mid-run. E2E #7 GLM used this pattern across 6 phase docs and an `evolution_summary_v1.2.md`; the methodology was captured even though PATTERNS.md was never written.
315
380
 
316
- "I'll write the skills first, then PATTERNS.md when I have insights."
381
+ - **`AGENT.md` decisions section + domain notes** — narrative-style, living document of "what we know" and "why". Best for projects with rich domain context to capture (regulations, edge cases, thresholds, sample format distributions). E2E #7 GLM's AGENT.md included regulation enforcement dates, product type taxonomies, threshold values, and sample format counts — this is fine; it's a different idiom for the same goal.
317
382
 
318
- By the time you have N skills, you've made N implicit decisions about verdict shape, chunker boundaries, worker tier each rule re-derives from scratch. Refactoring requires touching N files instead of one.
383
+ What you should NOT do: skip persistence and rely only on the live conversation context. By the time you have N skills authored without any persisted methodology, you've made N implicit decisions about verdict shape, chunker boundaries, and worker tier. Each rule re-derives from scratch. Refactoring requires touching N files instead of one.
319
384
 
320
- "Write PATTERNS.md, even tentatively, then re-read it before each new rule. Update it when discoveries change the framework."
385
+ "I'll capture insights when I have time."
321
386
 
322
- PATTERNS.md is your project's index card. Build it before the work, update it during the work, harvest it after.
387
+ "Before each phase advance, write what I learned to whichever persistence file matches this project's idiom — even if it's tentative."
323
388
 
324
- E2E #6 v070 surfaced this: DS only wrote PATTERNS.md after a rollback intervention; the per-skill design decisions before that point were already locked in and had to be re-touched. v0.7.1 reinforced this guidance.
389
+ E2E history:
390
+ - E2E #6 v070 DS wrote PATTERNS.md only after a rollback. Per-skill decisions before that point had to be re-touched. v0.7.1 added "PATTERNS.md FIRST" reinforcement.
391
+ - E2E #7 v071 neither DS nor GLM wrote PATTERNS.md, but GLM wrote 6 rich phase-completion logs and a comprehensive AGENT.md — the methodology WAS captured, just in different files. v0.7.2 blesses the broader principle: persist before you advance, format flexible.
325
392
 
326
- The engine's filesystem-derived milestones (Group A v0.7.0) verify coverage on disk regardless of how you split the work. The TaskBoard is your scratchpad; the disk is the contract.
393
+ The engine's filesystem-derived milestones (Group A v0.7.0) verify coverage on disk regardless of how you split the work. The TaskBoard is your scratchpad; the disk is the contract; the persistence file is your project's memory.