kc-beta 0.6.2 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +81 -0
- package/LICENSE-COMMERCIAL.md +125 -0
- package/README.md +21 -3
- package/package.json +14 -5
- package/src/agent/context-window.js +9 -12
- package/src/agent/context.js +14 -1
- package/src/agent/document-parser.js +169 -0
- package/src/agent/engine.js +367 -18
- package/src/agent/history/event-history.js +222 -0
- package/src/agent/llm-client.js +55 -0
- package/src/agent/message-utils.js +63 -0
- package/src/agent/pipelines/_milestone-derive.js +511 -0
- package/src/agent/pipelines/base.js +21 -0
- package/src/agent/pipelines/distillation.js +28 -15
- package/src/agent/pipelines/extraction.js +103 -36
- package/src/agent/pipelines/finalization.js +178 -11
- package/src/agent/pipelines/index.js +6 -1
- package/src/agent/pipelines/initializer.js +74 -8
- package/src/agent/pipelines/production-qc.js +31 -44
- package/src/agent/pipelines/skill-authoring.js +97 -80
- package/src/agent/pipelines/skill-testing.js +67 -23
- package/src/agent/retry.js +10 -2
- package/src/agent/scheduler.js +14 -2
- package/src/agent/session-state.js +18 -1
- package/src/agent/skill-loader.js +13 -7
- package/src/agent/skill-validator.js +19 -5
- package/src/agent/task-manager.js +61 -5
- package/src/agent/tools/document-chunk.js +21 -9
- package/src/agent/tools/phase-advance.js +18 -3
- package/src/agent/tools/release.js +51 -9
- package/src/agent/tools/rule-catalog.js +11 -1
- package/src/agent/tools/workspace-file.js +32 -0
- package/src/agent/workspace.js +39 -1
- package/src/cli/components.js +64 -14
- package/src/cli/index.js +62 -3
- package/src/cli/meme.js +26 -25
- package/src/config.js +65 -22
- package/src/model-tiers.json +24 -8
- package/src/providers.js +42 -0
- package/template/release/v1/README.md.tmpl +108 -0
- package/template/release/v1/catalog.json.tmpl +4 -0
- package/template/release/v1/kc_runtime/__init__.py +11 -0
- package/template/release/v1/kc_runtime/confidence.py +63 -0
- package/template/release/v1/kc_runtime/doc_parser.py +127 -0
- package/template/release/v1/manifest.json.tmpl +11 -0
- package/template/release/v1/render_dashboard.py +117 -0
- package/template/release/v1/run.py +212 -0
- package/template/release/v1/serve.sh +17 -0
- package/template/skills/en/meta-meta/work-decomposition/SKILL.md +266 -0
- package/template/skills/en/skill-creator/SKILL.md +1 -1
- package/template/skills/zh/meta-meta/work-decomposition/SKILL.md +264 -0
- package/template/skills/zh/skill-creator/SKILL.md +1 -1
package/template/release/v1/run.py (new file, +212 lines):

```python
#!/usr/bin/env python3
"""
KC release runner v1.

Entry point for a self-contained KC release bundle. Loads the manifest,
iterates rules, dispatches each rule's workflow against the supplied
input documents, writes per-document verdict JSONs to output/results/.

Usage:
    python3 run.py <input_dir>
    python3 run.py <input_dir> --rules R001,R005,R012
    python3 run.py --doc <single_file>   # single-doc smoke test

The bundle is shipped from KC's finalization phase. KC's run-in-CLI
mode is the source of truth; this is the ship-as-artifact form for
re-running verification on new document batches without the full KC
toolchain installed.
"""

from __future__ import annotations

import argparse
import json
import os
import sys
import subprocess
from pathlib import Path

HERE = Path(__file__).resolve().parent

# Add kc_runtime to path so submodules import cleanly when run from any cwd.
sys.path.insert(0, str(HERE))

from kc_runtime import doc_parser, confidence  # noqa: E402


def _load_json(path: Path, *, required: bool = False, default=None):
    if not path.exists():
        if required:
            raise SystemExit(
                f"required file missing: {path}\n"
                f"this release bundle was shipped without a complete manifest.\n"
                f"re-run KC finalization or contact the bundle author."
            )
        return default
    return json.loads(path.read_text(encoding="utf-8"))


def _select_rules(catalog, rule_filter):
    if not rule_filter:
        return catalog
    wanted = set(rule_filter.split(","))
    return [r for r in catalog if r.get("id") in wanted]


def _list_input_docs(input_dir: Path):
    if not input_dir.is_dir():
        raise SystemExit(f"input_dir not a directory: {input_dir}")
    docs = []
    for entry in sorted(input_dir.iterdir()):
        if entry.is_file() and not entry.name.startswith("."):
            docs.append(entry)
    return docs


def _run_workflow(rule_id: str, workflow_path: Path, doc_path: Path) -> dict:
    """
    Dispatch a single workflow against a single document.

    Each workflow is a stand-alone Python script that takes a document
    path on argv and emits a JSON verdict on stdout. Workflows are
    sandbox-runnable: no shared module state, no special imports beyond
    stdlib + kc_runtime.
    """
    if not workflow_path.exists():
        return {
            "rule_id": rule_id,
            "verdict": "ERROR",
            "confidence": 0.0,
            "error_type": "workflow_missing",
            "reason": f"workflow not found: {workflow_path.name}",
        }
    try:
        proc = subprocess.run(
            [sys.executable, str(workflow_path), str(doc_path)],
            capture_output=True,
            text=True,
            timeout=180,
        )
        if proc.returncode != 0:
            return {
                "rule_id": rule_id,
                "verdict": "ERROR",
                "confidence": 0.0,
                "error_type": "workflow_exit_nonzero",
                "reason": (proc.stderr or proc.stdout or "").strip()[:500],
            }
        # Workflow contract: last stdout line is the verdict JSON.
        last = next(
            (line for line in reversed(proc.stdout.splitlines()) if line.strip()),
            None,
        )
        if not last:
            return {
                "rule_id": rule_id,
                "verdict": "ERROR",
                "confidence": 0.0,
                "error_type": "empty_workflow_output",
            }
        verdict = json.loads(last)
        verdict.setdefault("rule_id", rule_id)
        return verdict
    except subprocess.TimeoutExpired:
        return {
            "rule_id": rule_id,
            "verdict": "ERROR",
            "confidence": 0.0,
            "error_type": "workflow_timeout",
        }
    except json.JSONDecodeError as exc:
        return {
            "rule_id": rule_id,
            "verdict": "ERROR",
            "confidence": 0.0,
            "error_type": "workflow_output_not_json",
            "reason": str(exc),
        }


def main():
    parser = argparse.ArgumentParser(prog="run.py", description="KC release runner")
    parser.add_argument("input_dir", nargs="?", help="Directory of input documents")
    parser.add_argument("--doc", help="Single document path (smoke-test mode)")
    parser.add_argument("--rules", help="Comma-separated rule_ids to run (default: all)")
    parser.add_argument("--output-dir", default=str(HERE / "output" / "results"))
    args = parser.parse_args()

    if not args.input_dir and not args.doc:
        parser.error("either input_dir or --doc is required")

    manifest = _load_json(HERE / "manifest.json", required=True)
    catalog = _load_json(HERE / "catalog.json", required=False, default=[])
    historical = _load_json(
        HERE / "confidence_calibration.json",
        required=False,
        default={"historical_accuracy": {}},
    )

    rules = _select_rules(catalog, args.rules)
    if not rules:
        raise SystemExit("no rules to run (catalog empty or filter excluded all)")

    if args.doc:
        docs = [Path(args.doc).resolve()]
    else:
        docs = _list_input_docs(Path(args.input_dir).resolve())

    output_dir = Path(args.output_dir).resolve()
    output_dir.mkdir(parents=True, exist_ok=True)

    workflows = manifest.get("workflows", {})
    summary = {"total_runs": 0, "by_verdict": {}, "errors": 0}

    for doc in docs:
        # Lightweight parse — let the workflow do its own parse if needed,
        # but offer a doc_parser preflight so workflows can rely on the
        # text being available.
        try:
            doc_parser.preflight(doc)
        except Exception as exc:
            print(f"[run.py] preflight failed for {doc.name}: {exc}", file=sys.stderr)

        results = {}
        for rule in rules:
            rule_id = rule.get("id")
            if not rule_id:
                continue
            wf_relpath = workflows.get(rule_id)
            if not wf_relpath:
                results[rule_id] = {
                    "rule_id": rule_id,
                    "verdict": "NO_WORKFLOW",
                    "confidence": 0.0,
                }
                continue
            wf_path = HERE / wf_relpath
            verdict = _run_workflow(rule_id, wf_path, doc)
            verdict = confidence.calibrate(verdict, historical)
            results[rule_id] = verdict
            summary["total_runs"] += 1
            v = verdict.get("verdict", "UNKNOWN")
            summary["by_verdict"][v] = summary["by_verdict"].get(v, 0) + 1
            if v == "ERROR":
                summary["errors"] += 1

        out_file = output_dir / f"{doc.stem}.json"
        out_file.write_text(
            json.dumps(
                {"document": str(doc), "results": results}, ensure_ascii=False, indent=2
            ),
            encoding="utf-8",
        )

    summary_path = output_dir / "summary.json"
    summary_path.write_text(
        json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8"
    )
    print(json.dumps(summary, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()
```
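The docstring fixes the workflow contract: a stand-alone script, document path on argv, verdict JSON as the last non-empty stdout line. For illustration only, a minimal workflow honoring that contract might look like the sketch below; the rule id and predicate are invented, not part of any shipped bundle:

```python
#!/usr/bin/env python3
# Hypothetical workflow for an invented rule R001. Illustrates the contract
# run.py relies on: doc path on argv, verdict JSON as the last stdout line.
import json
import sys
from pathlib import Path

def main() -> None:
    doc = Path(sys.argv[1]).read_text(encoding="utf-8", errors="replace")
    # Diagnostic chatter on earlier lines is fine; run.py parses only the
    # LAST non-empty stdout line as the verdict.
    print(f"scanned {len(doc)} chars", flush=True)
    ok = "industry-unified channel" in doc  # stand-in predicate
    print(json.dumps({
        "rule_id": "R001",
        "verdict": "PASS" if ok else "FAIL",
        "confidence": 0.9,
        "reason": "channel phrase present" if ok else "channel phrase missing",
    }))

if __name__ == "__main__":
    main()
```

Note that a FAIL finding still exits 0: run.py maps any nonzero exit to an ERROR verdict, so the exit code is reserved for genuine workflow failures.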
package/template/release/v1/serve.sh (new file, +17 lines):

```sh
#!/bin/sh
# Minimal local-preview server for the KC release dashboard.
# Renders dashboard.html if missing, then serves on PORT (default 8765).

set -e
HERE="$(cd "$(dirname "$0")" && pwd)"
cd "$HERE"

PORT="${PORT:-8765}"

if [ ! -f dashboard.html ] || [ output/results/summary.json -nt dashboard.html ]; then
    echo "rendering dashboard.html..."
    python3 render_dashboard.py output/results/ > dashboard.html
fi

echo "serving on http://localhost:$PORT/dashboard.html"
exec python3 -m http.server "$PORT"
```
package/template/skills/en/meta-meta/work-decomposition/SKILL.md (new file, +266 lines):

---
name: work-decomposition
description: Decide how to decompose the rule set into TaskBoard tasks during the rule_extraction → skill_authoring transition. Covers ordering methodologies (difficulty-first / Shannon–Huffman, breadth-first, depth-first, binary partition), grouping rules (when to bundle multiple rules into one task vs. keep separate), three-axis difficulty estimation, and how to write PATTERNS.md project memory that stays useful across the run. Use when entering rule_extraction, when entering skill_authoring, or whenever the TaskBoard feels wrong and you want to re-decompose.
---

# Work Decomposition

KC's main agent is the conductor. The conductor decides what work to do next — and that decision is upstream of every other choice that follows. Wrong decomposition makes the rest of the run expensive: if rules are processed in the wrong order, the agent re-designs the same shape three times. If unrelated rules are bundled into one skill, the resulting check.py becomes the unified-runner anti-pattern from E2E #4. If related rules are split across separate skills, the agent re-derives the shared chunker logic 17 times.

This skill is the conductor's playbook for that decision. It ships under `meta-meta/` because work decomposition is a system-level discipline, not a per-rule technique. The complementary `task-decomposition` skill (also under `meta-meta/`) covers the *internal* structure of one rule's check — locate, extract, normalize, judge, comment. This skill covers how the rule **set** should be split into TaskBoard items.

## When to use this skill

- **Entering rule_extraction.** Read the regulation, decompose into rules, then decide how those rules will be ordered and grouped before declaring the phase done. Coverage audit + chunk refs are downstream of these decisions.
- **Entering skill_authoring.** The TaskBoard is empty (the engine no longer auto-populates per-rule tasks). Read the rule list from `describeState`, decide grouping + order, then call `TaskCreate` for each unit of work.
- **Mid-run re-decomposition.** If the TaskBoard feels wrong (rules accumulating in the wrong order, an obviously-bundled pair split across two tasks), stop adding work and re-decompose. The cost of pausing 5 minutes to re-plan is recovered within 2 rules of better-shaped work.

## Locked principles

1. **Hard tracking, soft executing.** The engine derives milestones from disk facts (`rule_skills/<id>/SKILL.md`, `check_*.py`, `workflows/<id>/...`) regardless of how you grouped your tasks. Coverage is engine-verified; grouping is your call. You cannot bypass the floor by clever task naming, but the floor doesn't dictate task shape.
2. **The hardest rule contains the most information.** Hard rules force the chunker, classifier, verdict shape, and worker LLM tier you'll need. Easy rules can compile down to a subset of that machinery cheaply. Encode the hard cases first; let the easy cases inherit.
3. **PATTERNS.md is the load-bearing memory.** Without an accumulating reference, every rule starts from a blank slate and you re-design the same shape repeatedly. With it, work compounds.

---

## Ordering methodologies

Pick one explicitly and write it into your first PATTERNS.md entry. "I'm going Shannon–Huffman because R028's multi-party verdict shape will dictate the chunker for everyone else" is a valid decision; "I started at the top of catalog.json and kept going" is not — it's just the absence of a decision.

### Shannon–Huffman (difficulty-first) — recommended default

Process the **hardest** rule first. Use the chunker, verdict shape, and worker tier that hard rule demands as the design floor. Process subsequent rules in descending difficulty, each one a degenerate case of the machinery already built.

**When to pick:** the rule set has uneven complexity and you suspect a few hard rules will dictate the shape (almost always true for compliance / regulatory work). In E2E #5, GLM accidentally followed this path and produced 0.6% ERROR on real LLM-driven workflows; DS started bottom-up and shipped 78% NOT_APPLICABLE.

**Why "Huffman", not "Shannon", for the analogy:** Huffman builds optimal prefix codes by processing low-frequency symbols first. KC's analogue is the high-cost-per-rule, low-frequency rules — the R028s that dominate the design space even though there are few of them. Touch them first. The easy rules inherit the framework cheaply.

**The compiler-design parallel:** don't optimize the common case until you've handled the worst case correctly. The common case being fast doesn't matter if the worst case requires a redesign.

### Breadth-first (round-robin)

Process every rule to a shallow depth (skill skeleton + first regex pass), then go back and deepen each one. Useful when:

- The full set's quality matters more than per-rule depth (e.g., you need a coverage report fast)
- You don't yet know which rules are hard
- You're piloting a new methodology and want to validate the pipeline shape across many rules cheaply

**Trap:** you may declare rule_extraction done with shallow skills that never deepen. Worse than depth-first because the gate appears satisfied from coverage alone.

### Depth-first (one rule at a time, fully done)

Process rule 1 to completion (SKILL.md + check.py + tests passing) before touching rule 2. Useful when:

- Rules are largely independent (rare in compliance work)
- The conductor model has a small context and re-loading shape between rules is cheap
- You're proving the end-to-end pipeline before scaling

**Trap:** the first rule's shape gets locked in; refactoring after rule 5 means rewriting rules 1-4. Combine with PATTERNS.md to mitigate.

### Binary partition

Split the rule set into two halves on a meaningful axis (public/private products, document type, regulation chapter), then recurse. Useful when:

- The split axis is structural (e.g., banking rules vs trust rules) — you can build separate tools per partition
- Some partitions can be skipped entirely (the D6 applicability filter says "not applicable for this corpus")

**Trap:** premature partitioning when the axis isn't real. The agent commits to two tools that turn out to need a shared base. Validate the split with 2-3 rules per side before committing.

### "Easiest first" — what NOT to default to

Tempting because it builds confidence and ships something visible quickly. Do not default to it for regulatory rule sets — the easy rules teach you nothing transferable about the hard ones, and the framework crystallizes around the wrong shape. Use it only when you're piloting tooling on a brand-new project and need to prove the pipeline can produce ANY output before sizing the real work.

---

## Grouping rules

The default is **one rule per task → one rule per skill directory**. This keeps coverage measurable and the TaskBoard clear. Group only when grouping reduces total work without coupling unrelated concerns.

### When to bundle

Bundle multiple rules into a single task (and a single check_r###_r###.py file) when ALL of the following hold:

- The rules share the same source chunk(s) — looking at the same paragraph of the same regulation
- They share the same input format (e.g., a required-fields table)
- The judgment logic for one rule is a substring or close variant of the next
- A single failure typically implies multiple failures (you can't pass R013 if R015 fails)

Example: R013 / R015 / R017 all check that a specific table on page 3 of the report contains certain mandatory fields. Same chunk, same parse, same verdict shape. Bundle as `check_r013_r015_r017.py` and create a single TaskCreate task `R013/R015/R017 — required-fields table`. The engine's filesystem-derived milestones recognize the grouped check.py and credit all three rule_ids.
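For illustration, a minimal sketch of the shape such a grouped check tends to reduce to. The field names and the table parser are hypothetical stand-ins; only `rule_id` / `verdict` / `confidence` mirror the verdict fields run.py expects:

```python
# check_r013_r015_r017.py (sketch): one shared parse, one verdict per rule.
import json
import sys
from pathlib import Path

# Hypothetical required-field sets; real ones come from the regulation text.
REQUIRED = {
    "R013": {"net_asset_value", "reporting_period"},
    "R015": {"custodian_name"},
    "R017": {"manager_identity", "contact_channel"},
}

def parse_required_fields_table(doc_path: str) -> set[str]:
    # Stand-in for the shared chunker/parser the three rules reuse; a real
    # check would locate the page-3 table and extract its field names.
    text = Path(doc_path).read_text(encoding="utf-8", errors="replace")
    return {line.strip().lower().replace(" ", "_")
            for line in text.splitlines() if line.strip()}

def main() -> None:
    fields = parse_required_fields_table(sys.argv[1])  # parsed once, reused thrice
    verdicts = []
    for rule_id, wanted in REQUIRED.items():
        missing = sorted(wanted - fields)
        verdicts.append({
            "rule_id": rule_id,
            "verdict": "PASS" if not missing else "FAIL",
            "confidence": 0.85,
            "reason": "all fields present" if not missing else f"missing: {missing}",
        })
    print(json.dumps(verdicts, ensure_ascii=False))

if __name__ == "__main__":
    main()
```

The win is the single shared parse; the per-rule predicates stay separable, so the bundle never drifts toward the unified-runner monolith.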
### When to keep separate

Keep separate when ANY of the following holds:

- Rules cite different regulation chapters — even if conceptually related (e.g., R013 disclosure-content and R028 custodian-responsibility — both about reports, but different chapters / different evidence chains)
- Rules need different worker LLM tiers (R005 needs a flagship for nuanced judgment, R001 is regex)
- Rules apply to different document types (one applies only to public-fund reports, another only to private-fund reports)
- One rule's failure mode is a specific failure mode of another (don't bundle parent + child rules — the child's check redundantly re-runs the parent's)

The v0.6.2 D2 anti-pattern wording captures the failure case clearly:

> If you find yourself writing a unified_qc.py-style monolith that bypasses individual skills, your per-rule skills are wrong. Fix them, don't replace them.

That came from E2E #4, where one conductor wrote a 2,400-line `unified_qc.py` that ran all rules at once. It produced 1,150 ERROR verdicts (16.6%) because every rule's failure cascaded into every other rule's verdict. Per-rule skills are KC's unit of granularity for a reason.

### Naming convention for grouped checks

When you do bundle, name the file with the explicit rule list or range:

- `check_r013_r015_r017.py` — three specific rules
- `check_r002_r007.py` — two specific rules (underscores always list individual ids)
- `check_r013-r017.py` — contiguous range (R013 through R017)

The engine's filesystem-derived milestones parse these names and credit each constituent rule_id. The grouping is documentation as much as code organization — downstream consumers (workflow-run, dashboards, the release tool) read the filename to know coverage.
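The actual crediting lives in the engine's milestone derivation (`src/agent/pipelines/_milestone-derive.js` in this release). As a rough sketch of the mapping the convention implies, not the engine's code:

```python
import re

def rule_ids_from_check_name(filename: str) -> list[str]:
    """Expand a grouped check filename into the rule ids it covers.
    Sketch of the naming convention above, not the engine's implementation."""
    m = re.fullmatch(r"check_(r[\d_r-]+)\.py", filename, re.IGNORECASE)
    if not m:
        return []
    ids: list[str] = []
    for part in m.group(1).lower().split("_"):
        if "-" in part:  # hyphen form: contiguous range, e.g. r013-r017
            lo, hi = part.split("-")
            ids += [f"R{n:03d}" for n in range(int(lo.lstrip("r")), int(hi.lstrip("r")) + 1)]
        else:            # underscore form: one specific rule per segment
            ids.append(f"R{int(part.lstrip('r')):03d}")
    return ids

assert rule_ids_from_check_name("check_r013_r015_r017.py") == ["R013", "R015", "R017"]
assert rule_ids_from_check_name("check_r013-r017.py") == ["R013", "R014", "R015", "R016", "R017"]
```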
---

## Difficulty estimation — three-axis triage

Before you commit to an order, score each extracted rule on three axes. One quick worker LLM call per rule (tier3 is sufficient — this is not a deep judgment) writes a `rules/difficulty.json` that the conductor then reads when deciding TaskBoard order.

### Axis 1 — Chain-of-thought depth

How many sequential judgments does the rule require? Count the operations the agent has to chain together:

- 1: `text contains "industry-unified channel"` (regex)
- 2: classify product type, then check channel (two-step)
- 3+: classify product type, locate disclosure section, parse table, compare against another section's table (multi-step)

Score: 1 / 2 / 3+ on this axis.

### Axis 2 — Module count

How many distinct sub-checks does the rule encompass? A "module" is a logically separable predicate.

- 1: single predicate ("must mention channel A")
- 2-3: a small required-fields list ("must mention A, B, C, D")
- 4+: a large checklist or conditional branch ("if public fund, then channels X+Y; if private, then channel Z; in all cases also include the manager identity")

Score: 1 / 2-3 / 4+ on this axis.

### Axis 3 — Cross-rule interaction

Does the rule reference another rule, depend on its output, or have to resolve consistency with it?

- 0: standalone (most rules)
- 1: cross-references one other rule (e.g., R007 references R013's table existence)
- 2+: tightly coupled with multiple rules, requires consistency reasoning across them

Score: 0 / 1 / 2+ on this axis.

### Total difficulty

Sum the three axes (1+1+0 = 2 minimum, 3+3+2 = 8 maximum). Sort descending. The 2-3 highest are your design-floor cases — work them first.

For a 70-rule corpus, expect a difficulty distribution of roughly:

- 10-15 hard (sum 5-8)
- 30-40 medium (sum 3-4)
- 20-30 easy (sum 2)

Don't over-engineer the triage. It's a planning aid, not a contract. If during work you discover a rule scored 2 was actually a 6, update PATTERNS.md and re-sort the remaining queue.
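A minimal sketch of the triage bookkeeping, with a placeholder where the per-rule tier3 worker call would go; the axis key names are invented for illustration:

```python
import json
from pathlib import Path

def score_rule(rule: dict) -> dict:
    # Stand-in for the one-per-rule tier3 worker call; a real triage would
    # ask the worker LLM to score the three axes from the rule text.
    return {"cot_depth": 1, "module_count": 1, "cross_rule": 0}  # placeholder

def write_difficulty(catalog: list[dict],
                     out: Path = Path("rules/difficulty.json")) -> None:
    rows = []
    for rule in catalog:
        axes = score_rule(rule)
        rows.append({"id": rule["id"], **axes, "total": sum(axes.values())})
    rows.sort(key=lambda r: r["total"], reverse=True)  # hardest first = design floor
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(json.dumps(rows, indent=2), encoding="utf-8")
```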
---

## PATTERNS.md — project memory discipline

KC's main agent does not have continuous memory across phases. Every time the agent re-reads `describeState`, it sees the same rule list and the same milestones. Without an external accumulating reference, every rule's design starts from scratch.

`rules/PATTERNS.md` is that reference. The agent owns it (writes via `workspace_file`, not via any tool wrapper). The engine surfaces it in every system prompt of skill_authoring + skill_testing. Capped at ~5 KB so token cost stays trivial.

### What to write — patterns that transfer

A good PATTERNS.md entry captures something that will SAVE work on the next rule. Three legitimate categories:

✅ **Transferable shape** — a verdict shape, chunker granularity, or interface decision that subsequent rules will reuse.

```
R028 (custodian responsibility) needed multi-party verdict shape:
  { primary_party: PASS|FAIL, secondary_parties: [...], reasons: [...] }
Adopting as default for any rule with multiple liable entities.
Confirmed reusable on R029, R031.
```

✅ **Project-level constraint** — a fact about the corpus or environment that affects multiple rules.

```
Sample corpus has bilingual table headings (EN+ZH).
Chunker MUST split on the ZH heading boundary, not the EN one —
verified on 5 sample docs. Without this, R013 / R015 / R017 all
under-extract.
```

✅ **Anti-pattern with rationale** — a thing you tried, why it failed, what to do instead.

```
Tried tier4 for JSON-output verdicts → empty responses 80% of the time.
tier3 (Qwen3.5) hallucinates field names. Settled on tier2 (DeepSeek-V3.2)
for any structured-output rule. Tier1 reserved for verdict reasoning under
ambiguous evidence (R005, R024).
```

### What NOT to write — log-dump anti-patterns

These add token cost without adding decision value. Future-you reading PATTERNS.md is trying to figure out what to do, not reconstruct what already happened.

❌ **Completion log** — already in tasks.json + the filesystem.

```
R001 done. R002 done. R003 partial pass. R004 done.
```

❌ **Tool history echo** — already in events.jsonl.

```
Called workspace_file to write check_R013.py. Then called sandbox_exec.
Then ran the result through worker_llm_call.
```

❌ **Filesystem-authoritative facts** — the engine derives these from disk.

```
Workflows live under workflows/R001_workflow.py. There are 28 of them.
```

❌ **Conversation summary** — neither agent nor user reads PATTERNS.md as narrative.

```
After discussing with the user, we decided to focus on banking rules first.
The user mentioned that trust products are out of scope.
```

If a project-level decision came out of conversation, write it as a constraint:

```
Trust products excluded from this run (D6 applicability NO).
Skip R078, R092, R104 — their skills exist as stubs only.
```

### When to update vs. append

- **Append** when you discover something new and transferable.
- **Update an existing entry** when work on a later rule reveals a better abstraction. Don't lock yourself into the first hard rule's shape — JIT compilers recompile when profile data invalidates the original assumption; PATTERNS.md should evolve the same way.
- **Delete an entry** when you discover it was wrong. Mark the deletion with a brief rationale at the bottom of the file:

```
[DELETED 2026-04-29] "Always use tier1 for FAIL verdicts"
Why: R005 + R007 work fine on tier2; tier1 reserved for genuinely
ambiguous evidence cases only (3 rules across the set).
```

### Sizing

Keep PATTERNS.md under ~5 KB total. If it grows past that, prune the least-actionable entries (the ones that haven't influenced any decision in the last 5 rules). Memory is for what you're using, not what you've seen.

---

## Putting it together — opening sequence

When entering skill_authoring with an empty TaskBoard:

1. **Read `describeState`.** Look at the rule list, the milestones (rules with chunk refs / coverage audited), and any existing PATTERNS.md.
2. **If PATTERNS.md is empty:** spend ~2 turns deciding the ordering methodology + the first 3-5 patterns. Write PATTERNS.md as your first artifact, before any skill code.
3. **If `rules/difficulty.json` exists:** sort rules by difficulty descending. Group where appropriate per the rules above (a sketch of this step follows the list). Call `TaskCreate` for each unit.
4. **If `rules/difficulty.json` doesn't exist:** decide whether to spend the worker LLM calls to triage (almost always yes for a corpus of >20 rules). Run the triage step (one tier3 call per rule, batched in groups of 10 if you want), write `rules/difficulty.json`, then proceed to step 3.
5. **Pick the first task.** Work it to completion (skill + check + at least one local test). Update PATTERNS.md with whatever you learned. Move to the next task.
6. **At task ~5 and task ~10:** stop and re-read PATTERNS.md. If patterns suggest a refactor of earlier work, do it now (cheap) rather than later (expensive).

The engine's filesystem-derived milestones (Group A, v0.7.0) verify coverage on disk regardless of how you split the work. The TaskBoard is your scratchpad; the disk is the contract.
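A compact sketch of step 3, assuming the difficulty.json rows sketched earlier plus a hypothetical `chunk_refs` map for spotting bundle candidates; actual task creation still goes through the `TaskCreate` tool, so this only produces the ordered titles:

```python
import json
from collections import defaultdict
from pathlib import Path

def plan_tasks(difficulty_path: str = "rules/difficulty.json",
               chunk_refs: dict[str, str] | None = None) -> list[str]:
    """Order rules hardest-first and bundle those sharing a source chunk.
    chunk_refs maps rule id -> chunk ref; both shapes are assumptions."""
    rows = json.loads(Path(difficulty_path).read_text(encoding="utf-8"))
    refs = chunk_refs or {}
    groups: dict[str, list[dict]] = defaultdict(list)
    for row in rows:  # rows are already sorted hardest-first on disk
        groups[refs.get(row["id"], row["id"])].append(row)
    titles, seen = [], set()
    for row in rows:
        key = refs.get(row["id"], row["id"])
        if key in seen:
            continue
        seen.add(key)
        ids = "/".join(r["id"] for r in groups[key])
        hardest = max(r["total"] for r in groups[key])
        titles.append(f"{ids} (difficulty {hardest})")
    return titles  # one TaskCreate call per title
```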
package/template/skills/en/skill-creator/SKILL.md (description filled in, +1 -1):

```diff
@@ -1,6 +1,6 @@
 ---
 name: skill-creator
-description:
+description: Anthropic's skill-scaffolding toolkit — use for iterating/improving existing skills or running evals on them, NOT as the primary reference for building KC's per-rule verification skills. For KC rule skills, read `meta-meta/skill-authoring` first (canonical folder layout + granularity rules + KC-specific check.py entry-point conventions) and `meta-meta/work-decomposition` for ordering + grouping decisions. This skill applies once per-rule skills exist and the agent wants to optimize their description/triggering or run formal evals.
 ---

 # Skill Creator
```
|