kc-beta 0.7.1 → 0.7.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +15 -8
- package/package.json +1 -1
- package/src/agent/engine.js +32 -2
- package/src/agent/pipelines/_milestone-derive.js +65 -42
- package/src/agent/pipelines/finalization.js +2 -6
- package/src/agent/pipelines/initializer.js +13 -0
- package/src/agent/tools/copy-to-workspace.js +17 -12
- package/src/agent/tools/release.js +151 -1
- package/src/agent/tools/sandbox-exec.js +4 -1
- package/src/agent/tools/task-board.js +194 -0
- package/src/agent/tools/workspace-file.js +58 -44
- package/src/config.js +6 -4
- package/src/util/kc-version.js +27 -0
- package/template/CLAUDE.md +13 -0
- package/template/skills/en/meta-meta/rule-extraction/SKILL.md +77 -0
- package/template/skills/en/meta-meta/skill-to-workflow/SKILL.md +26 -0
- package/template/skills/en/meta-meta/work-decomposition/SKILL.md +76 -9
- package/template/skills/zh/meta-meta/rule-extraction/SKILL.md +65 -0
- package/template/skills/zh/meta-meta/skill-to-workflow/SKILL.md +26 -0
- package/template/skills/zh/meta-meta/work-decomposition/SKILL.md +74 -9
package/src/agent/tools/task-board.js
ADDED
@@ -0,0 +1,194 @@
+import { BaseTool, ToolResult } from "./base.js";
+
+const TASKS_REL = "tasks.json";
+
+/**
+ * v0.7.3 — TaskCreate / TaskUpdate / TaskComplete tools.
+ *
+ * Completes the v0.7.0 "agent owns TaskBoard" design. The engine no longer
+ * auto-populates per-rule tasks on phase entry (PER_RULE_PHASES is empty by
+ * default — see task-manager.js); the agent reads the rule list via
+ * describeState, picks a decomposition (single / grouped / range / non-rule),
+ * and calls these tools to populate tasks.json. The Ralph loop in
+ * AgentEngine._runTaskLoopSerial then walks pending tasks one at a time.
+ *
+ * Skill teaching for these tools lives in
+ * template/skills/{en,zh}/meta-meta/work-decomposition/SKILL.md.
+ *
+ * tasks.json is a shared-coordination path (workspace.js
+ * SHARED_COORDINATION_PATHS) — every write goes through
+ * withSharedLockIfApplicable so two writers (main + subagent) serialize.
+ */
+
+export class TaskCreateTool extends BaseTool {
+  constructor(workspace, taskManager) {
+    super();
+    this._workspace = workspace;
+    this._taskManager = taskManager;
+  }
+
+  get name() { return "TaskCreate"; }
+
+  get description() {
+    return (
+      "Add a task to the session task board. Tasks gate the Ralph loop — " +
+      "after the current turn ends, the engine pulls the next pending task " +
+      "and runs it. Use one task per unit of work you want to iterate on " +
+      "(per-rule, per-group, per-document — your decomposition). " +
+      "Call this on phase entry after reading describeState."
+    );
+  }
+
+  get inputSchema() {
+    return {
+      type: "object",
+      properties: {
+        id: {
+          type: "string",
+          description: "Unique task ID within this session (e.g. 'R001-skill_authoring' or 'group-trust-1').",
+        },
+        title: {
+          type: "string",
+          description: "Short human-readable title for the task.",
+        },
+        phase: {
+          type: "string",
+          description: "Phase this task belongs to (e.g. 'skill_authoring', 'skill_testing', 'distillation').",
+        },
+        ruleId: {
+          type: "string",
+          description: "Optional rule_id if this is a per-rule task. Omit for grouped or non-rule tasks.",
+        },
+      },
+      required: ["id", "title", "phase"],
+    };
+  }
+
+  async execute(input) {
+    const id = input.id || "";
+    const title = input.title || "";
+    const phase = input.phase || "";
+    const ruleId = input.ruleId || null;
+
+    if (!id) return new ToolResult("id required", true);
+    if (!title) return new ToolResult("title required", true);
+    if (!phase) return new ToolResult("phase required", true);
+
+    return await this._workspace.withSharedLockIfApplicable(TASKS_REL, () => {
+      const before = this._taskManager.getAllTasks().some((t) => t.id === id);
+      this._taskManager.addTask({ id, title, phase, ruleId });
+      if (before) {
+        return new ToolResult(`Task ${id} already existed (no-op).`);
+      }
+      const p = this._taskManager.progress;
+      return new ToolResult(
+        `Task ${id} created. Board: ${p.pending} pending, ${p.inProgress} in_progress, ${p.completed} completed.`,
+      );
+    });
+  }
+}
+
+export class TaskUpdateTool extends BaseTool {
+  constructor(workspace, taskManager) {
+    super();
+    this._workspace = workspace;
+    this._taskManager = taskManager;
+  }
+
+  get name() { return "TaskUpdate"; }
+
+  get description() {
+    return (
+      "Update a task's status and optional summary. Status: 'pending', " +
+      "'in_progress', 'completed', or 'failed'. Use TaskComplete instead " +
+      "for the common case of marking a task done with a summary."
+    );
+  }
+
+  get inputSchema() {
+    return {
+      type: "object",
+      properties: {
+        id: { type: "string", description: "Task ID to update." },
+        status: {
+          type: "string",
+          enum: ["pending", "in_progress", "completed", "failed"],
+          description: "New status for the task.",
+        },
+        summary: {
+          type: "string",
+          description: "Optional short summary (e.g. why the task failed, what was produced).",
+        },
+      },
+      required: ["id"],
+    };
+  }
+
+  async execute(input) {
+    const id = input.id || "";
+    const status = input.status;
+    const summary = input.summary;
+
+    if (!id) return new ToolResult("id required", true);
+
+    return await this._workspace.withSharedLockIfApplicable(TASKS_REL, () => {
+      const exists = this._taskManager.getAllTasks().some((t) => t.id === id);
+      if (!exists) return new ToolResult(`Task ${id} not found.`, true);
+      this._taskManager.updateTask(id, { status, summary });
+      const p = this._taskManager.progress;
+      return new ToolResult(
+        `Task ${id} updated${status ? ` to ${status}` : ""}. ` +
+        `Board: ${p.pending} pending, ${p.inProgress} in_progress, ${p.completed} completed, ${p.failed} failed.`,
+      );
+    });
+  }
+}
+
+export class TaskCompleteTool extends BaseTool {
+  constructor(workspace, taskManager) {
+    super();
+    this._workspace = workspace;
+    this._taskManager = taskManager;
+  }
+
+  get name() { return "TaskComplete"; }
+
+  get description() {
+    return (
+      "Mark a task as completed with an optional summary. Sugar for " +
+      "TaskUpdate({id, status: 'completed', summary}). The Ralph loop " +
+      "advances to the next pending task after this returns."
+    );
+  }
+
+  get inputSchema() {
+    return {
+      type: "object",
+      properties: {
+        id: { type: "string", description: "Task ID to complete." },
+        summary: {
+          type: "string",
+          description: "Optional short summary of what was produced.",
+        },
+      },
+      required: ["id"],
+    };
+  }
+
+  async execute(input) {
+    const id = input.id || "";
+    const summary = input.summary;
+
+    if (!id) return new ToolResult("id required", true);
+
+    return await this._workspace.withSharedLockIfApplicable(TASKS_REL, () => {
+      const exists = this._taskManager.getAllTasks().some((t) => t.id === id);
+      if (!exists) return new ToolResult(`Task ${id} not found.`, true);
+      this._taskManager.markDone(id, summary);
+      const p = this._taskManager.progress;
+      return new ToolResult(
+        `Task ${id} completed. Board: ${p.pending} pending, ${p.inProgress} in_progress, ${p.completed} completed.`,
+      );
+    });
+  }
+}
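For orientation, a minimal sketch of the call sequence these tools imply. The input shapes and result strings come from the schemas and message templates above; the `taskCreate` / `taskComplete` instances and the surrounding harness are assumed, not part of the diff.

```js
// Hypothetical harness: tool instances already wired with a workspace and
// task manager. Payloads follow the inputSchema above; result strings
// follow the ToolResult templates above.
const r1 = await taskCreate.execute({
  id: "R001-skill_authoring",
  title: "Author skill for R001",
  phase: "skill_authoring",
  ruleId: "R001",
});
// → "Task R001-skill_authoring created. Board: 1 pending, 0 in_progress, 0 completed."

const r2 = await taskCreate.execute({
  id: "R001-skill_authoring",
  title: "Author skill for R001",
  phase: "skill_authoring",
});
// → "Task R001-skill_authoring already existed (no-op)." — creation is idempotent by id

const r3 = await taskComplete.execute({
  id: "R001-skill_authoring",
  summary: "check.py passes on local samples",
});
// → "Task R001-skill_authoring completed. Board: 0 pending, 0 in_progress, 1 completed."
```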
package/src/agent/tools/workspace-file.js
CHANGED
@@ -30,7 +30,9 @@ export class WorkspaceFileTool extends BaseTool {
       "Read, write, or list files. " +
       "scope='workspace' (default): KC's working directory for rules, skills, workflows, results. " +
       "scope='project': the user's project folder where KC was launched — source regulations and samples live here. " +
-      "Operations: read (returns file content), write (creates/overwrites a file), list (shows directory contents)."
+      "Operations: read (returns file content), write (creates/overwrites a file), list (shows directory contents). " +
+      "read returns up to 50,000 chars per call; longer files are truncated. " +
+      "For full reads of regulation/rule documents (typically smaller than this cap), prefer this tool over sandbox_exec."
     );
   }

@@ -87,7 +89,7 @@ export class WorkspaceFileTool extends BaseTool {

     try {
       if (op === "read") return this._read(filePath, scope);
-      if (op === "write") return this._write(filePath, content, scope);
+      if (op === "write") return await this._write(filePath, content, scope);
       if (op === "list") return this._list(filePath, scope);
       return new ToolResult(`Unknown operation: ${op}`, true);
     } catch (err) {
@@ -107,56 +109,68 @@ export class WorkspaceFileTool extends BaseTool {
     return new ToolResult(text);
   }

-  _write(filePath, content, scope) {
+  async _write(filePath, content, scope) {
     if (!filePath || filePath === ".") {
       return new ToolResult("Path required for write operation", true);
     }
     const resolved = this._resolveForScope(filePath, scope);
-    fs.mkdirSync(path.dirname(resolved), { recursive: true });
-
-    // v0.7.0 Group M (#84 remainder): on case-insensitive filesystems
-    // (macOS/Windows defaults), warn when the target's basename collides
-    // with an existing sibling differing only in case. Write proceeds
-    // — agents may legitimately overwrite — but the agent gets visible
-    // signal so it doesn't end up confused like E2E #5 GLM ("SKILL.md
-    // disappeared" when the inode was shared with skill.md). Workspace-
-    // scope only; project-dir scope is the user's territory.
-    let collisionNote = "";
-    if (
-      scope === "workspace" &&
-      this._workspace.fsCaseSensitive === false
-    ) {
-      try {
-        const parent = path.dirname(resolved);
-        const targetBase = path.basename(resolved);
-        const targetLower = targetBase.toLowerCase();
-        const siblings = fs.readdirSync(parent);
-        const collision = siblings.find(
-          (s) => s !== targetBase && s.toLowerCase() === targetLower,
-        );
-        if (collision) {
-          collisionNote =
-            ` ⚠ case-collision: case-insensitive filesystem already has '${collision}'` +
-            ` at this path; both names resolve to the same inode. Pick one canonical case` +
-            ` (lowercase preferred for skill files) and use it consistently — otherwise` +
-            ` archive_file / Read on either name affects the other.`;
-        }
-      } catch { /* readdirSync may fail on a fresh dir; that's fine, no collision possible */ }
-    }

-    fs.writeFileSync(resolved, content, "utf-8");
+    const doWrite = () => {
+      fs.mkdirSync(path.dirname(resolved), { recursive: true });
+
+      // v0.7.0 Group M (#84 remainder): on case-insensitive filesystems
+      // (macOS/Windows defaults), warn when the target's basename collides
+      // with an existing sibling differing only in case. Write proceeds
+      // — agents may legitimately overwrite — but the agent gets visible
+      // signal so it doesn't end up confused like E2E #5 GLM ("SKILL.md
+      // disappeared" when the inode was shared with skill.md). Workspace-
+      // scope only; project-dir scope is the user's territory.
+      let collisionNote = "";
+      if (
+        scope === "workspace" &&
+        this._workspace.fsCaseSensitive === false
+      ) {
+        try {
+          const parent = path.dirname(resolved);
+          const targetBase = path.basename(resolved);
+          const targetLower = targetBase.toLowerCase();
+          const siblings = fs.readdirSync(parent);
+          const collision = siblings.find(
+            (s) => s !== targetBase && s.toLowerCase() === targetLower,
+          );
+          if (collision) {
+            collisionNote =
+              ` ⚠ case-collision: case-insensitive filesystem already has '${collision}'` +
+              ` at this path; both names resolve to the same inode. Pick one canonical case` +
+              ` (lowercase preferred for skill files) and use it consistently — otherwise` +
+              ` archive_file / Read on either name affects the other.`;
+          }
+        } catch { /* readdirSync may fail on a fresh dir; that's fine, no collision possible */ }
+      }
+
+      fs.writeFileSync(resolved, content, "utf-8");
+
+      // Auto-commit to git for workspace writes (silently no-ops if gitignored or git unavailable)
+      let traceId = null;
+      if (scope === "workspace") {
+        traceId = this._workspace.autoCommit(filePath, "update");
+      }
+
+      const label = scope === "project" ? `[project] ${filePath}` : filePath;
+      let msg = `Wrote ${content.length} chars to ${label}`;
+      if (traceId) msg += ` [trace: ${traceId}]`;
+      if (collisionNote) msg += collisionNote;
+      return new ToolResult(msg);
+    };

-    // Auto-commit to git for workspace writes (silently no-ops if gitignored or git unavailable)
-    let traceId = null;
+    // v0.7.3: route writes to shared coordination paths (rules/catalog.json,
+    // tasks.json, refs/manifest.json, etc.) through the workspace lock so
+    // concurrent writers serialize. No-op for non-shared paths and for
+    // project-scope writes (project dir is the user's, not shared engine state).
     if (scope === "workspace") {
-      traceId = this._workspace.autoCommit(filePath, "update");
+      return await this._workspace.withSharedLockIfApplicable(filePath, doWrite);
     }
-
-    const label = scope === "project" ? `[project] ${filePath}` : filePath;
-    let msg = `Wrote ${content.length} chars to ${label}`;
-    if (traceId) msg += ` [trace: ${traceId}]`;
-    if (collisionNote) msg += collisionNote;
-    return new ToolResult(msg);
+    return doWrite();
   }

   _list(filePath, scope) {
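The `withSharedLockIfApplicable` contract both files rely on is not shown in this diff. A minimal sketch of the assumed semantics — per-path promise-chain serialization; the real implementation lives in workspace.js and may differ:

```js
// Assumed sketch, NOT the actual workspace.js code. Shared coordination
// paths (per the comment above: rules/catalog.json, tasks.json,
// refs/manifest.json, ...) serialize through a per-path queue; every
// other path runs its callback immediately.
const SHARED_COORDINATION_PATHS = new Set([
  "rules/catalog.json",
  "tasks.json",
  "refs/manifest.json",
]);
const tails = new Map(); // relPath -> tail of the in-process lock queue

async function withSharedLockIfApplicable(relPath, fn) {
  if (!SHARED_COORDINATION_PATHS.has(relPath)) return fn();
  const tail = tails.get(relPath) ?? Promise.resolve();
  const next = tail.then(() => fn());
  tails.set(relPath, next.catch(() => {})); // keep the queue alive after a failed writer
  return next;
}
```

This also explains the `_write` → `async _write` change above: once a write can wait on the lock, its caller must `await` it.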
package/src/config.js
CHANGED
@@ -90,10 +90,12 @@ export function loadSettings(workspacePath) {
     tier3: env.TIER3 || gc.tiers?.tier3 || "",
     tier4: env.TIER4 || gc.tiers?.tier4 || "",

-    // VLM tiers (vision/OCR models)
-    vlmTier1: env.VLM_TIER1 || gc.vlm_tiers?.tier1 || "",
-    vlmTier2: env.VLM_TIER2 || gc.vlm_tiers?.tier2 || "",
-    vlmTier3: env.VLM_TIER3 || gc.vlm_tiers?.tier3 || "",
+    // VLM tiers (vision/OCR models). v0.7.3: accept OCR_MODEL_TIER* as
+    // alias since template/.env.template + initializer.js seed that name.
+    // VLM_TIER* takes precedence when both are set.
+    vlmTier1: env.VLM_TIER1 || env.OCR_MODEL_TIER1 || gc.vlm_tiers?.tier1 || "",
+    vlmTier2: env.VLM_TIER2 || env.OCR_MODEL_TIER2 || gc.vlm_tiers?.tier2 || "",
+    vlmTier3: env.VLM_TIER3 || env.OCR_MODEL_TIER3 || gc.vlm_tiers?.tier3 || "",

     // Worker LLM — optional, defaults to conductor config (process.env wins)
     workerProvider: penv.KC_WORKER_PROVIDER || gc.worker_provider || "",
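The alias is plain short-circuit fallback; a two-line sketch (the model value is hypothetical):

```js
// env as loaded from .env — initializer.js / .env.template seed OCR_MODEL_TIER1.
const env = { OCR_MODEL_TIER1: "some-vlm-model" };

// Precedence per the diff: explicit VLM_TIER1 first, then the OCR_MODEL_TIER1 alias.
const vlmTier1 = env.VLM_TIER1 || env.OCR_MODEL_TIER1 || ""; // → "some-vlm-model"
```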
package/src/util/kc-version.js
ADDED
@@ -0,0 +1,27 @@
+// Single source of truth for the live KC CLI version string.
+//
+// Reads package.json once. Used by engine.js (passed to ReleaseTool so
+// release manifests stamp the correct version) and by
+// pipelines/finalization.js (anywhere it surfaces "Built by kc-beta X").
+//
+// Before v0.7.2, engine.js hardcoded `kcVersion: "0.5.2"` which leaked
+// into every release manifest's `kc_beta_version` field regardless of
+// the actual package version. Both v0.7.1 audit runs (DS + GLM)
+// surfaced this. Reading package.json closes the gap.
+
+import fs from "node:fs";
+import path from "node:path";
+import { fileURLToPath } from "node:url";
+
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = path.dirname(__filename);
+
+export function readKcVersion() {
+  try {
+    const pkgPath = path.resolve(__dirname, "../../package.json");
+    const pkg = JSON.parse(fs.readFileSync(pkgPath, "utf-8"));
+    return pkg.version || "unknown";
+  } catch {
+    return "unknown";
+  }
+}
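Per the header comment, the consumers are engine.js and pipelines/finalization.js; a usage sketch (the wiring itself is assumed, not shown in this diff):

```js
import { readKcVersion } from "./src/util/kc-version.js";

const kcVersion = readKcVersion(); // "0.7.3" for this release; "unknown" if package.json is unreadable
// engine.js passes this into ReleaseTool so each manifest's kc_beta_version
// matches the actual package instead of the pre-v0.7.2 hardcoded "0.5.2".
console.log(`Built by kc-beta ${kcVersion}`);
```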
package/template/CLAUDE.md
CHANGED
@@ -23,6 +23,13 @@ skills/ — Meta skills encoding verification methodology
 .env — Configuration: API keys, model tiers, thresholds, language
 ```
 
+Note: KC's session workspace under `~/.kc_agent/workspaces/<sessionId>/`
+uses lowercase counterparts (`rules/`, `samples/`, `input/`, `output/`,
+`logs/`, `workflows/`, `rule_skills/`) — these are runtime-internal and
+separate from this project's user-facing folders above. The asymmetry
+is intentional: title-case for human-facing project dirs, lowercase for
+KC's working state.
+
 ## Your Mission
 
 Follow this lifecycle. Each step references the skill(s) to consult:
@@ -93,6 +100,12 @@ skills/ — 编码核查方法论的元技能
 .env — 配置:API密钥、模型层级、阈值、语言
 ```
 
+注:KC 在 `~/.kc_agent/workspaces/<sessionId>/` 下的会话工作区使用
+小写对应目录(`rules/`、`samples/`、`input/`、`output/`、`logs/`、
+`workflows/`、`rule_skills/`)—— 这些是运行时内部目录,与本项目上面
+那些用户可见的目录是分开的。这种大小写不对称是有意的:项目里给人看
+的目录用首字母大写;KC 自己的工作状态用小写。
+
 ## 你的使命
 
 遵循以下生命周期。每一步标注了需要参考的技能:
package/template/skills/en/meta-meta/rule-extraction/SKILL.md
CHANGED
@@ -133,6 +133,65 @@ conversation or existing catalog. Therefore, when composing the brief:
 catalog.json.** rule_catalog uses workspace file locking;
 sandbox_exec bypasses it and races with other writers.
 
+## How to read regulation files (default: read whole)
+
+Regulations are the audit's authoritative basis. Every `source_ref`
+in your extracted rules must be verifiable against the source text.
+For typical regulation documents (a single file under ~50 KB / under
+~100 pages), **read each regulation file whole using `workspace_file`
+(operation=read) in a single call**:
+
+```js
+workspace_file({ operation: "read", scope: "project", path: "Rules/01_some_regulation.md" })
+```
+
+`workspace_file.read` is capped at 50,000 chars per call, which
+covers virtually every individual regulation document. This is the
+default. **Read every regulation file whole before you start
+extracting rules from any of them.**
+
+### Tool choice — `workspace_file` vs `sandbox_exec`
+
+| Tool | Per-call cap | Use for |
+|---|---:|---|
+| `workspace_file` (read) | 50,000 chars | **full reads of regulation / rule documents** |
+| `sandbox_exec` (cat/head/etc) | 10,000 chars | shell commands, **not** full file reads |
+
+`sandbox_exec` is designed for shell commands; its 10K cap is too
+small for most regulations. `cat rules/01_*.md` returns only the
+first ~10 KB followed by `\n[truncated]`. Re-issuing with `head -N` /
+`tail -M` to scroll the window loses positional precision and burns
+turns. **When you see truncation, don't fight the cap — switch
+tools.**
+
+### Asymmetry — regs read whole, samples sampled
+
+Regulations are limited (typically 1-10 files), authoritative, and
+read once. Read every regulation whole.
+
+Sample documents may number 30 to 1000+, are heterogeneous, and get
+read many times during testing. **Don't try to read every sample
+whole.** Use rule-applicability filters or sampled subsets to focus
+attention.
+
+### Escape valve — when a single reg exceeds ~200K chars
+
+Rare in practice. The largest regulation in `test_data_4` is 42 KB;
+typical Chinese banking regs (资管新规, 信披办法, etc.) all fit
+under 50 KB. But if you do encounter a single regulation so large
+that reading it whole would crowd the context window — heuristic:
+the file exceeds ~200,000 chars or ~25% of your context budget —
+use your own judgment:
+
+- Read by chapter (e.g., `第X章` / `Chapter X`) using `document_parse`
+  or paginated `workspace_file` reads
+- Or build an in-workspace index file pointing to chapter offsets and
+  read on-demand per rule being extracted
+
+The 50 KB cap is high enough that this almost never triggers. **The
+default is read whole; deviate only when the file genuinely doesn't
+fit.**
+
 ## Extraction Strategies
 
 ### Strategy 1: Structured Input (Developer User Provides Rules)
@@ -223,6 +282,24 @@ Regulations are often ambiguous. When you encounter ambiguity:
 
 Do not skip ambiguous rules. They are often the most important ones.
 
+## Sanity-check applicability against the sample corpus
+
+After extracting your rule catalog and before authoring skills, do this 5-minute check: project each rule's applicability filter against the sample corpus.
+
+For every rule:
+1. Walk `samples/`, classify each by product type / report type / document format
+2. For each rule, count how many samples it would apply to (per the rule's `applicability` field, scope filter, or whatever shape your catalog uses)
+3. Flag rules that apply to **0 samples** — they're either genuinely test-corpus-irrelevant (acceptable) or over-constrained (bug)
+
+E2E #7 GLM produced a 97-rule catalog where 36 rules (37%) had `PASS=0 FAIL=0 NOT_APPLICABLE=90` across all 90 documents — they never fired. Some were legit (rules for cash-management products with no cash-management samples in corpus), but 36 inactive of 97 was high enough to suggest scope-too-narrow drift.
+
+If many rules are 0-sample, either:
+- **Reframe their applicability** — broaden product types, look for evidence in headers/footers not just body, relax the scope filter
+- **Document them as "future scope"** and remove from this iteration's catalog (still capture them in a `rules/future_scope.md` so they're not forgotten)
+- **Update the test corpus** to include matching samples (work with the developer user)
+
+Catching this in `rule_extraction` is much cheaper than authoring 36 skills that then test as inactive in `skill_testing`. The cheap projection here is worth the time it saves later.
+
 ## When Rules Change
 
 Regulations evolve. When the developer user adds new or updated regulation documents:
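The 0-sample projection the new section describes is a counting pass; a minimal sketch in JS (the catalog shape and the `appliesTo` classifier are assumptions — adapt to whatever shape your catalog actually uses):

```js
import fs from "node:fs";

// Assumed catalog shape: { rules: [{ rule_id, applicability, ... }] }.
const { rules } = JSON.parse(fs.readFileSync("rules/catalog.json", "utf-8"));
const samples = fs.readdirSync("samples");

// Stub classifier — replace with real matching of each rule's applicability
// field against the sample's product type / report type / format (hypothetical).
const appliesTo = (rule, sampleName) => true;

for (const rule of rules) {
  const hits = samples.filter((name) => appliesTo(rule, name)).length;
  if (hits === 0) {
    console.log(`0-sample rule ${rule.rule_id}: reframe, move to future_scope, or extend the corpus`);
  }
}
```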
package/template/skills/en/meta-meta/skill-to-workflow/SKILL.md
CHANGED
@@ -45,6 +45,32 @@ If yes, design a worker LLM prompt. Use the smallest model tier that maintains a
 ### The hybrid approach (most common)
 Most rules are a mix: regex extracts the number, Python compares it to the threshold, LLM handles the exceptional cases. Design the workflow as a pipeline where cheap steps run first and expensive steps run only when needed.
 
+### When regex alone isn't enough — decision rubric
+
+Before declaring distillation complete, audit each rule's `verification_type` / `metric` / `evidence_type` (or equivalent fields in your catalog). For rules where the required verification is one of:
+
+- **Semantic** ("is this a positive guarantee or a disclaimer?")
+- **Contextual** ("interpret this in light of the document's product type")
+- **Counterfactual** ("what should this value be, given the other fields?")
+- **Cross-field arithmetic** ("does 期初 + 收益 - 分配 = 期末?")
+
+regex alone rarely suffices. Three acceptable forms:
+
+1. **Pure regex with documented limits** — write the regex check, include a comment explaining the fragility (e.g., "matches syntactic pattern only; cannot detect semantic guarantees")
+2. **Hybrid regex + LLM** — regex baseline catches obvious cases, `worker_llm_call` (tier1-2) handles ambiguous ones. The hybrid workflow declares which rule_ids escalate.
+3. **Pure LLM via `worker_llm_call`** — for fully semantic rules where no regex baseline is meaningful.
+
+Don't ship pure regex for a rule whose `verification_type` is `judgment` / `semantic` without the documented-limits note. Future-you or a colleague will assume the regex is sufficient and that bug will hide for months.
+
+### Worker LLM cost-aware tier choice
+
+If you do escalate to LLM:
+- **tier1** (most capable, ~¥0.001-0.002/doc): cross-field reasoning, ambiguity resolution, rules that benefit from chain-of-thought
+- **tier2-3**: bulk extraction with simple semantic checks
+- **tier4** (cheapest): high-volume keyword-spotting that regex can't handle. Note: tier4 models on SiliconFlow are Qwen3.5 thinking-mode — `content` can return empty if `reasoning_content` consumes max_tokens. Test with realistic prompts before relying. If you see empty responses, either bump max_tokens to ≥8192, shorten your prompt, or fall back to tier1-2.
+
+Both v0.7.1 audit conductors (DS and GLM) defaulted to all-regex distillation and only added LLM escalation when the human user explicitly asked for "V2 with worker LLM". If your rule catalog has any rules where the verification is genuinely semantic, you should reach for `worker_llm_call` yourself — don't wait to be asked.
+
 ## Workflow Structure
 
 A workflow is a Python file (or small set of files) in `workflows/`:
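The three acceptable forms reduce to a cheap-first pipeline. A sketch of form 2 (hybrid) — written in JS to match this page's other examples, though KC workflows themselves are Python; the regex pattern and the `workerLlmCall` helper are hypothetical stand-ins, not KC APIs:

```js
// Hybrid check: regex baseline first, worker LLM only for the ambiguous remainder.
async function checkGuaranteeRule(docText) {
  // Cheap step: syntactic match for an explicit guarantee phrase (hypothetical pattern).
  if (/保证.{0,6}(本金|收益)/.test(docText)) {
    return { pass: false, method: "regex", evidence: "explicit guarantee phrase" };
  }
  // No syntactic hit — the document may still guarantee semantically; escalate to tier2.
  const verdict = await workerLlmCall("tier2",
    `Does this excerpt promise guaranteed principal or returns? Answer yes/no.\n${docText.slice(0, 2000)}`);
  return { pass: !/yes/i.test(verdict), method: "llm-tier2" };
}
```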
package/template/skills/en/meta-meta/work-decomposition/SKILL.md
CHANGED
@@ -85,7 +85,7 @@ Bundle multiple rules into a single task (and a single check_r###_r###.py file)
 - The judgment logic for one rule is a substring or close variant of the next
 - A single failure typically implies multiple failures (you can't pass R013 if R015 fails)
 
-Example: R013 / R015 / R017 all check that a specific table on page 3 of the report contains certain mandatory fields. Same chunk, same parse, same verdict shape. Bundle as `check_r013_r015_r017.py` and create a single
+Example: R013 / R015 / R017 all check that a specific table on page 3 of the report contains certain mandatory fields. Same chunk, same parse, same verdict shape. Bundle as `check_r013_r015_r017.py` and create a single task: `TaskCreate({id: "R013-R015-R017-skill_authoring", title: "R013/R015/R017 — required-fields table", phase: "skill_authoring"})`. The engine's filesystem-derived milestones recognize the grouped check.py and credit all three rule_ids.
 
 ### When to keep separate
 
@@ -147,6 +147,41 @@ E2E #6 v070 surfaced this pattern (DS bundled-skill check.py files
 all returned `{"pass": null, "method": "stub"}` deferring to
 workflows/). v0.7.1 added this anti-pattern explicitly.
 
+E2E #7 v071 showed the teaching prevented the stub anti-pattern in
+both conductors (no `{"pass": null}` patterns in either run), but
+**DS still inverted the canonical-vs-distilled relationship**: DS's
+6 thematic skill folders had SKILL.md only (no check.py), with the
+real verification code living in `workflows/<skill>/check.py`. The
+absence of stubs is good; the inversion is not — editing a rule then
+requires touching both SKILL.md (the doc) and the workflow check.py
+(the code). Single source of truth is lost.
+
+GLM v071 by contrast landed the canonical pattern: 97/97 skills had
+both SKILL.md AND a real `check.py` (median 143 LOC of regex +
+applicability logic), and `workflows/<id>/workflow_v1.py` was a
+50-line thin wrapper that imported and called it:
+
+```python
+# workflows/D01-01/workflow_v1.py — thin wrapper, 52 LOC
+import importlib.util, json
+from pathlib import Path
+
+def run(doc_text: str, meta: dict = None) -> dict:
+    check_path = Path(__file__).parent.parent.parent / "rule_skills" / "D01-01" / "check.py"
+    spec = importlib.util.spec_from_file_location("check", check_path)
+    mod = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(mod)
+    result = mod.check(doc_text, meta)
+    result["_workflow"] = "D01-01_v1"
+    return result
+```
+
+This is the v0.7.2+ canonical pattern: workflow is a shim that
+points at the skill's check.py. To iterate on a rule's verification,
+edit `rule_skills/<id>/check.py`. The workflow doesn't change. v0.7.2
+clarifies the teaching: avoid stubs AND keep the canonical
+relationship (skill is canonical, workflow is distilled wrapper).
+
 ### Naming convention for grouped checks
 
 When you do bundle, name the file with the explicit range:
@@ -309,18 +344,50 @@ When entering skill_authoring with an empty TaskBoard:
 5. **Pick the first task.** Work it to completion (skill + check + at least one local test). Update PATTERNS.md with whatever you learned. Move to the next task.
 6. **At task ~5 and task ~10:** stop and re-read PATTERNS.md. If patterns suggest a refactor of earlier work, do it now (cheap) rather than later (expensive).
 
-###
+### Calling TaskCreate / TaskUpdate / TaskComplete
+
+The engine registers three task-board tools (v0.7.3+):
+
+- `TaskCreate({id, title, phase, ruleId?})` — adds a task to `tasks.json`. `id` must be unique within the session; pick a stable shape like `<rule_id>-<phase>` for per-rule tasks or `<group-name>-<phase>` for grouped / non-rule tasks. `phase` is the phase the task belongs to (current phase or a future phase you're pre-populating). `ruleId` is optional — set it for per-rule tasks so the engine can credit the rule_id in milestone derivation.
+- `TaskUpdate({id, status?, summary?})` — updates a task's status to `pending` / `in_progress` / `completed` / `failed`, optionally with a short summary.
+- `TaskComplete({id, summary?})` — sugar for `TaskUpdate({id, status:"completed", summary})`. Use this for the common path after finishing a unit of work.
+
+After you call `TaskCreate` for your decomposition and exit the current turn, the Ralph loop pulls the next pending task and runs it. Finish the work, call `TaskComplete`, and the loop advances. If a task can't be completed (irrecoverable error), call `TaskUpdate({id, status:"failed", summary:"reason"})` so the queue moves on rather than blocking on the failed task.
+
+Examples:
+
+```
+TaskCreate({ id: "R001-skill_authoring", title: "Author skill for R001",
+             phase: "skill_authoring", ruleId: "R001" })
+
+TaskCreate({ id: "trust-bundle-skill_authoring",
+             title: "R013/R015/R017 — required-fields table",
+             phase: "skill_authoring" })
+
+TaskComplete({ id: "R001-skill_authoring",
+               summary: "regex check passes 89/90; R001 done" })
+```
+
+### Persisted methodology — PATTERNS.md OR phase logs OR AGENT.md decisions
+
+The principle: capture framework-level decisions to disk before each phase advance. The conversation will compact, agents will restart, the next phase will lose grounding. Whichever format you pick, write to disk — don't rely on conversation context that disappears.
+
+Three formats, each defensible. Pick one and stick with it:
+
+- **`rules/PATTERNS.md`** — concise, framework-only, updated as the project progresses. Best for greenfield projects with clear hypothesis-up-front structure. Capped at ~5 KB; entries are transferable shapes / project constraints / anti-patterns with rationale (see "What to write" above).
 
-
+- **`logs/phase_<name>_complete.md` per phase** — incremental, captures what each phase produced + decisions made + what the next phase inherits. Best for iterative discovery work where the framework crystallizes mid-run. E2E #7 GLM used this pattern across 6 phase docs and an `evolution_summary_v1.2.md`; the methodology was captured even though PATTERNS.md was never written.
 
-
+- **`AGENT.md` decisions section + domain notes** — narrative-style, living document of "what we know" and "why". Best for projects with rich domain context to capture (regulations, edge cases, thresholds, sample format distributions). E2E #7 GLM's AGENT.md included regulation enforcement dates, product type taxonomies, threshold values, and sample format counts — this is fine; it's a different idiom for the same goal.
 
-By the time you have N skills, you've made N implicit decisions about verdict shape, chunker boundaries, worker tier
+What you should NOT do: skip persistence and rely only on the live conversation context. By the time you have N skills authored without any persisted methodology, you've made N implicit decisions about verdict shape, chunker boundaries, and worker tier. Each rule re-derives from scratch. Refactoring requires touching N files instead of one.
 
-
+❌ "I'll capture insights when I have time."
 
-
+✅ "Before each phase advance, write what I learned to whichever persistence file matches this project's idiom — even if it's tentative."
 
-E2E
+E2E history:
+- E2E #6 v070 DS wrote PATTERNS.md only after a rollback. Per-skill decisions before that point had to be re-touched. v0.7.1 added "PATTERNS.md FIRST" reinforcement.
+- E2E #7 v071 neither DS nor GLM wrote PATTERNS.md, but GLM wrote 6 rich phase-completion logs and a comprehensive AGENT.md — the methodology WAS captured, just in different files. v0.7.2 blesses the broader principle: persist before you advance, format flexible.
 
-The engine's filesystem-derived milestones (Group A v0.7.0) verify coverage on disk regardless of how you split the work. The TaskBoard is your scratchpad; the disk is the contract.
+The engine's filesystem-derived milestones (Group A v0.7.0) verify coverage on disk regardless of how you split the work. The TaskBoard is your scratchpad; the disk is the contract; the persistence file is your project's memory.
|