kc-beta 0.3.1 → 0.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/agent/confidence-scorer.js +8 -0
- package/src/agent/context.js +25 -0
- package/src/agent/corner-case-registry.js +5 -0
- package/src/agent/engine.js +514 -75
- package/src/agent/event-log.js +15 -2
- package/src/agent/history.js +91 -23
- package/src/agent/pipelines/initializer.js +3 -6
- package/src/agent/retry.js +9 -1
- package/src/agent/scheduler.js +276 -0
- package/src/agent/session-state.js +11 -2
- package/src/agent/task-manager.js +5 -0
- package/src/agent/tools/agent-tool.js +57 -14
- package/src/agent/tools/archive-file.js +94 -0
- package/src/agent/tools/copy-to-workspace.js +140 -0
- package/src/agent/tools/phase-advance.js +60 -0
- package/src/agent/tools/release.js +322 -0
- package/src/agent/tools/schedule-fetch.js +118 -0
- package/src/agent/tools/snapshot.js +101 -0
- package/src/agent/tools/workspace-file.js +10 -7
- package/src/agent/version-manager.js +29 -120
- package/src/agent/workspace.js +127 -4
- package/src/cli/components.js +4 -1
- package/src/cli/index.js +57 -4
- package/src/config.js +10 -1
- package/template/release-runtime/README.md.tmpl +84 -0
- package/template/release-runtime/kc_runtime/__init__.py +2 -0
- package/template/release-runtime/kc_runtime/confidence.py +93 -0
- package/template/release-runtime/kc_runtime/dashboard.py +208 -0
- package/template/release-runtime/render_dashboard.py +49 -0
- package/template/release-runtime/run.py +230 -0
- package/template/release-runtime/serve.sh +15 -0
- package/template/skills/en/meta/entity-extraction/SKILL.md +6 -0
- package/template/skills/en/meta-meta/bootstrap-workspace/SKILL.md +11 -0
- package/template/skills/en/meta-meta/quality-control/SKILL.md +13 -1
- package/template/skills/en/meta-meta/rule-extraction/SKILL.md +35 -0
- package/template/skills/en/meta-meta/rule-graph/SKILL.md +16 -0
- package/template/skills/en/meta-meta/skill-to-workflow/SKILL.md +8 -0
- package/template/skills/en/meta-meta/task-decomposition/SKILL.md +13 -0
- package/template/skills/en/meta-meta/version-control/SKILL.md +13 -0
- package/template/skills/zh/meta/entity-extraction/SKILL.md +6 -0
- package/template/skills/zh/meta-meta/bootstrap-workspace/SKILL.md +11 -0
- package/template/skills/zh/meta-meta/quality-control/SKILL.md +12 -0
- package/template/skills/zh/meta-meta/rule-extraction/SKILL.md +35 -0
- package/template/skills/zh/meta-meta/rule-graph/SKILL.md +16 -0
- package/template/skills/zh/meta-meta/skill-to-workflow/SKILL.md +8 -0
- package/template/skills/zh/meta-meta/task-decomposition/SKILL.md +16 -0
- package/template/skills/zh/meta-meta/version-control/SKILL.md +13 -0
- package/template/workspace.gitignore +22 -0
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
"""
|
|
2
|
+
End-user dashboard renderer — Python port of DashboardRenderTool._renderHtml.
|
|
3
|
+
|
|
4
|
+
Takes a release-run result JSON (the output of run.py) and emits a static
|
|
5
|
+
HTML dashboard. Dark theme, two tabs (Summary + Per-Rule), no external
|
|
6
|
+
dependencies, no JS framework — vanilla JS for tab switching only.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import html as _html
|
|
10
|
+
import json
|
|
11
|
+
from datetime import datetime, timezone
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def render(result, manifest):
    """
    Render a release-run result as a complete, static HTML dashboard.

    result: dict from run.py — keys: release, snapshot_tag, input,
      started_at, duration_ms, results: [{rule_id, value, confidence,
      confidence_band, extraction_method, exit_code, raw}]
    manifest: dict from the bundle's manifest.json (for header info)
    Returns: a complete HTML string.

    Keys that are missing and keys that are present with a null value are
    treated the same way, because run.py writes `release`, `snapshot_tag`
    and `rule_id` with whatever the manifest holds (possibly None).
    Everything taken from the two dicts is converted to text and
    HTML-escaped before it is placed in the page, including the band that
    ends up inside a `class='...'` attribute.
    """
    # Header fields: the manifest wins, the result JSON is the fallback.
    label = _first_text(manifest.get("label"), result.get("release"))
    snap_tag = _first_text(manifest.get("snapshot_tag"), result.get("snapshot_tag"))
    input_doc = _first_text(result.get("input"))
    started = _first_text(result.get("started_at"))
    duration_ms = _as_float(result.get("duration_ms"))

    # Titles are keyed by the textual rule id so that a rule without an
    # "id" key cannot raise KeyError.
    rule_titles = {
        _first_text(rule.get("id")): _first_text(rule.get("title"))
        for rule in (manifest.get("rules") or [])
        if isinstance(rule, dict)
    }
    results = [r for r in (result.get("results") or []) if isinstance(r, dict)]
    generated_at = datetime.now(timezone.utc).isoformat()

    # Aggregates and per-rule markup are built in one pass.
    total = len(results)
    by_band = {"high": 0, "medium": 0, "low": 0}
    failed = 0
    summary_rows = []
    detail_blocks = []
    for r in results:
        rid = _html.escape(_first_text(r.get("rule_id")))
        title = _html.escape(rule_titles.get(_first_text(r.get("rule_id")), ""))
        value = _html.escape(_display_value(r))
        conf = _as_float(r.get("confidence"))
        raw_band = r.get("confidence_band")
        band = str(raw_band) if raw_band else "low"
        method = _html.escape(str(r.get("extraction_method") or "?"))
        exit_code = r.get("exit_code", 0)
        succeeded = exit_code == 0

        by_band[band] = by_band.get(band, 0) + 1
        if not succeeded:
            failed += 1

        status_icon = "✓" if succeeded else "✗"
        # html.escape also escapes quotes, so a band containing "'" cannot
        # break out of the single-quoted class attribute.
        status_class = _html.escape(f"band-{band}" if succeeded else "band-fail")
        summary_rows.append(
            f"<tr class='{status_class}'>"
            f"<td>{status_icon}</td>"
            f"<td><code>{rid}</code></td>"
            f"<td>{title}</td>"
            f"<td>{value}</td>"
            f"<td>{conf:.3f}</td>"
            f"<td>{_html.escape(band)}</td>"
            f"<td>{method}</td>"
            f"</tr>"
        )

        # The detail view shows the band exactly as recorded (blank when
        # absent) rather than the "low" default used for colouring.
        detail_band = _html.escape(str(raw_band) if raw_band else "")
        raw_json = json.dumps(r.get("raw") or {}, ensure_ascii=False, indent=2)
        detail_blocks.append(
            f"<div class='detail-card'>"
            f"<h3><code>{rid}</code> · {title}</h3>"
            f"<dl>"
            f"<dt>Value</dt><dd>{value}</dd>"
            f"<dt>Confidence</dt><dd>{conf:.3f} ({detail_band})</dd>"
            f"<dt>Method</dt><dd>{method}</dd>"
            f"<dt>Exit code</dt><dd>{_html.escape(str(exit_code))}</dd>"
            f"</dl>"
            f"<details><summary>Raw workflow output</summary>"
            f"<pre>{_html.escape(raw_json)}</pre>"
            f"</details>"
            f"</div>"
        )

    return TEMPLATE.format(
        label=_html.escape(label),
        snap_tag=_html.escape(snap_tag),
        input_doc=_html.escape(input_doc),
        started=_html.escape(started),
        duration_s=f"{duration_ms / 1000:.2f}",
        total=total,
        high=by_band["high"],
        medium=by_band["medium"],
        low=by_band["low"],
        failed=failed,
        summary_rows="\n".join(summary_rows) or "<tr><td colspan='7'>(no results)</td></tr>",
        detail_blocks="\n".join(detail_blocks) or "<p>(no results)</p>",
        generated_at=generated_at,
    )


def _first_text(*candidates):
    """Return the first candidate that is not None as a string, or ""."""
    for candidate in candidates:
        if candidate is not None:
            return str(candidate)
    return ""


def _as_float(value, default=0.0):
    """Convert *value* to float, returning *default* for None or non-numeric input."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return default


def _display_value(r):
    """Return the shortened display text for one result entry.

    Only a missing/None/empty "value" falls back to the raw workflow
    output; legitimate falsy values such as 0 or False are shown as-is.
    """
    value = r.get("value")
    if value is None or value == "":
        value = _value_from_raw(r.get("raw"))
    return _short(value)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _short(s, n=80):
    """Stringify *s* (None becomes "") and cap it at *n* characters.

    Text longer than *n* is cut to n - 1 characters and finished with a
    single ellipsis character, so the result is never longer than *n*.
    """
    text = str(s) if s is not None else ""
    if len(text) > n:
        return text[: n - 1] + "…"
    return text
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def _value_from_raw(raw):
    """Pull a displayable value out of a workflow's raw output.

    Looks for the keys "extracted_value", "value" and "result" in that
    order and returns the value of the first one present. Returns "" when
    *raw* is not a dict or holds none of those keys.
    """
    if not isinstance(raw, dict):
        return ""
    candidates = ("extracted_value", "value", "result")
    return next((raw[key] for key in candidates if key in raw), "")
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
TEMPLATE = """<!DOCTYPE html>
|
|
117
|
+
<html lang="en">
|
|
118
|
+
<head>
|
|
119
|
+
<meta charset="UTF-8">
|
|
120
|
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
121
|
+
<title>KC Release {label} — Verification Result</title>
|
|
122
|
+
<style>
|
|
123
|
+
body {{ font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
|
|
124
|
+
max-width: 1100px; margin: 0 auto; padding: 24px;
|
|
125
|
+
background: #0a0a0a; color: #e5e5e5; }}
|
|
126
|
+
h1 {{ color: #f4f4f5; font-size: 1.5em; margin-bottom: 4px; }}
|
|
127
|
+
.meta {{ color: #737373; font-size: 0.85em; margin-bottom: 24px; }}
|
|
128
|
+
.meta code {{ color: #a3a3a3; }}
|
|
129
|
+
.card {{ background: #171717; border: 1px solid #262626; border-radius: 8px;
|
|
130
|
+
padding: 16px; margin: 12px 0; }}
|
|
131
|
+
.metrics {{ display: flex; gap: 32px; flex-wrap: wrap; }}
|
|
132
|
+
.metric .value {{ font-size: 2em; font-weight: 600; }}
|
|
133
|
+
.metric .label {{ font-size: 0.8em; color: #737373; text-transform: uppercase; letter-spacing: .03em; }}
|
|
134
|
+
.v-high {{ color: #22c55e; }}
|
|
135
|
+
.v-med {{ color: #eab308; }}
|
|
136
|
+
.v-low {{ color: #f97316; }}
|
|
137
|
+
.v-fail {{ color: #ef4444; }}
|
|
138
|
+
.tabs {{ display: flex; gap: 0; border-bottom: 1px solid #262626; margin: 24px 0 12px; }}
|
|
139
|
+
.tab {{ padding: 8px 16px; cursor: pointer; color: #737373; border-bottom: 2px solid transparent; user-select: none; }}
|
|
140
|
+
.tab.active {{ color: #f4f4f5; border-bottom-color: #22c55e; }}
|
|
141
|
+
table {{ width: 100%; border-collapse: collapse; }}
|
|
142
|
+
th, td {{ text-align: left; padding: 8px 10px; border-bottom: 1px solid #262626; font-size: 0.92em; }}
|
|
143
|
+
th {{ color: #737373; font-weight: 500; font-size: 0.78em; text-transform: uppercase; letter-spacing: .04em; }}
|
|
144
|
+
td code {{ color: #a3a3a3; }}
|
|
145
|
+
tr.band-high td:nth-child(6) {{ color: #22c55e; }}
|
|
146
|
+
tr.band-medium td:nth-child(6) {{ color: #eab308; }}
|
|
147
|
+
tr.band-low td:nth-child(6) {{ color: #f97316; }}
|
|
148
|
+
tr.band-fail td:nth-child(6) {{ color: #ef4444; }}
|
|
149
|
+
.detail-card {{ background: #171717; border: 1px solid #262626; border-radius: 8px;
|
|
150
|
+
padding: 14px 18px; margin: 14px 0; }}
|
|
151
|
+
.detail-card h3 {{ margin: 0 0 10px; font-size: 1em; color: #e5e5e5; }}
|
|
152
|
+
.detail-card dl {{ display: grid; grid-template-columns: 100px 1fr; gap: 4px 16px; margin: 0; }}
|
|
153
|
+
.detail-card dt {{ color: #737373; font-size: 0.85em; }}
|
|
154
|
+
.detail-card dd {{ margin: 0; color: #e5e5e5; }}
|
|
155
|
+
details summary {{ cursor: pointer; color: #a3a3a3; font-size: 0.85em; margin-top: 8px; }}
|
|
156
|
+
pre {{ background: #0d0d0d; border: 1px solid #262626; border-radius: 4px;
|
|
157
|
+
padding: 10px; overflow-x: auto; font-size: 0.82em; color: #d4d4d4; }}
|
|
158
|
+
.footer {{ color: #525252; font-size: 0.78em; margin-top: 32px; text-align: center; }}
|
|
159
|
+
</style>
|
|
160
|
+
</head>
|
|
161
|
+
<body>
|
|
162
|
+
<h1>KC Release <code>{label}</code></h1>
|
|
163
|
+
<p class="meta">
|
|
164
|
+
Snapshot: <code>{snap_tag}</code> ·
|
|
165
|
+
Input: <code>{input_doc}</code> ·
|
|
166
|
+
Started: <code>{started}</code> ·
|
|
167
|
+
Duration: <code>{duration_s}s</code>
|
|
168
|
+
</p>
|
|
169
|
+
|
|
170
|
+
<div class="card metrics">
|
|
171
|
+
<div class="metric"><div class="value">{total}</div><div class="label">Rules run</div></div>
|
|
172
|
+
<div class="metric"><div class="value v-high">{high}</div><div class="label">High confidence</div></div>
|
|
173
|
+
<div class="metric"><div class="value v-med">{medium}</div><div class="label">Medium</div></div>
|
|
174
|
+
<div class="metric"><div class="value v-low">{low}</div><div class="label">Low</div></div>
|
|
175
|
+
<div class="metric"><div class="value v-fail">{failed}</div><div class="label">Failed</div></div>
|
|
176
|
+
</div>
|
|
177
|
+
|
|
178
|
+
<div class="tabs">
|
|
179
|
+
<div class="tab active" data-target="summary" onclick="kcShow('summary', this)">Summary</div>
|
|
180
|
+
<div class="tab" data-target="detail" onclick="kcShow('detail', this)">Per-rule detail</div>
|
|
181
|
+
</div>
|
|
182
|
+
|
|
183
|
+
<div id="summary" class="view">
|
|
184
|
+
<div class="card">
|
|
185
|
+
<table>
|
|
186
|
+
<tr><th></th><th>Rule</th><th>Title</th><th>Value</th><th>Conf.</th><th>Band</th><th>Method</th></tr>
|
|
187
|
+
{summary_rows}
|
|
188
|
+
</table>
|
|
189
|
+
</div>
|
|
190
|
+
</div>
|
|
191
|
+
|
|
192
|
+
<div id="detail" class="view" style="display:none">
|
|
193
|
+
{detail_blocks}
|
|
194
|
+
</div>
|
|
195
|
+
|
|
196
|
+
<p class="footer">Generated {generated_at} — KC Agent CLI</p>
|
|
197
|
+
|
|
198
|
+
<script>
|
|
199
|
+
function kcShow(id, tab) {{
|
|
200
|
+
document.querySelectorAll('.view').forEach(v => v.style.display = 'none');
|
|
201
|
+
document.getElementById(id).style.display = '';
|
|
202
|
+
document.querySelectorAll('.tab').forEach(t => t.classList.remove('active'));
|
|
203
|
+
tab.classList.add('active');
|
|
204
|
+
}}
|
|
205
|
+
</script>
|
|
206
|
+
</body>
|
|
207
|
+
</html>
|
|
208
|
+
"""
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Re-render an HTML dashboard from an existing run.py result JSON.
|
|
4
|
+
|
|
5
|
+
Useful when run.py was invoked without --dashboard, or when the dashboard
|
|
6
|
+
template is updated and you want to re-render past results.
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
python render_dashboard.py <result.json> [--output dashboard.html]
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import argparse
|
|
13
|
+
import json
|
|
14
|
+
import sys
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
HERE = Path(__file__).resolve().parent
|
|
18
|
+
sys.path.insert(0, str(HERE))
|
|
19
|
+
|
|
20
|
+
from kc_runtime import dashboard as kc_dash
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def main():
    """Re-render the HTML dashboard for an existing run.py result file.

    Reads the result JSON named on the command line and the manifest.json
    that sits next to this script, renders the dashboard, and writes it to
    --output (default: the result path with an .html suffix).

    Exits with status 2 and a one-line `error:` message on stderr when
    either file is missing, unreadable, or not valid JSON.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("result", help="Path to a result.json produced by run.py")
    ap.add_argument("--output", "-o", help="HTML output path (default: alongside result)")
    args = ap.parse_args()

    result_path = Path(args.result).resolve()
    if not result_path.is_file():
        print(f"error: result file not found: {result_path}", file=sys.stderr)
        sys.exit(2)

    manifest_path = HERE / "manifest.json"
    if not manifest_path.is_file():
        print("error: manifest.json not found alongside this script", file=sys.stderr)
        sys.exit(2)

    result = _read_json_or_exit(result_path)
    manifest = _read_json_or_exit(manifest_path)
    html = kc_dash.render(result, manifest)

    out_path = Path(args.output) if args.output else result_path.with_suffix(".html")
    out_path.write_text(html, encoding="utf-8")
    print(f"Wrote {out_path}")


def _read_json_or_exit(path):
    """Load a UTF-8 JSON file, exiting with status 2 instead of a traceback on failure."""
    try:
        return json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        print(f"error: cannot read JSON from {path}: {e}", file=sys.stderr)
        sys.exit(2)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
if __name__ == "__main__":
|
|
49
|
+
main()
|
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
KC release runner — standalone, no kc-beta dependency.
|
|
4
|
+
|
|
5
|
+
Loads the bundled release manifest, runs each rule's workflow against an
|
|
6
|
+
input document, scores confidence, aggregates results.
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
python run.py <input-doc> [--rule R001] [--output result.json] [--dashboard]
|
|
10
|
+
|
|
11
|
+
Required env vars (same conventions as KC's .env):
|
|
12
|
+
LLM_API_KEY, LLM_BASE_URL
|
|
13
|
+
TIER1, TIER2, TIER3, TIER4 (any subset of model lists, comma-separated)
|
|
14
|
+
|
|
15
|
+
Workflows are invoked as `python <workflow_path> <input-doc>` and must emit
|
|
16
|
+
their result as a single JSON object on the last line of stdout.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
import argparse
|
|
20
|
+
import json
|
|
21
|
+
import os
|
|
22
|
+
import subprocess
|
|
23
|
+
import sys
|
|
24
|
+
import time
|
|
25
|
+
from datetime import datetime, timezone
|
|
26
|
+
from pathlib import Path
|
|
27
|
+
|
|
28
|
+
# kc_runtime is bundled next to this file
|
|
29
|
+
HERE = Path(__file__).resolve().parent
|
|
30
|
+
sys.path.insert(0, str(HERE))
|
|
31
|
+
|
|
32
|
+
from kc_runtime import confidence as kc_conf
|
|
33
|
+
from kc_runtime import dashboard as kc_dash
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def main():
    """Command-line entry point: run the release's rules on one document.

    Steps, in order: parse arguments, check the input file exists, load the
    bundled manifest plus the optional catalog / calibration / corner-case
    files, narrow to --rule when given, verify the environment, run every
    selected rule, then emit the aggregated JSON (to --output or stdout)
    and, with --dashboard, an HTML dashboard.

    Exit status: 2 for usage/setup errors, 1 if any rule reported a
    non-zero exit code, 0 otherwise.
    """
    parser = argparse.ArgumentParser(description="Run a KC release on a document.")
    parser.add_argument("input", help="Path to the input document (PDF, DOCX, TXT, ...)")
    parser.add_argument("--rule", help="Run only this rule id (default: all rules in catalog)")
    parser.add_argument("--output", "-o", help="Write aggregated JSON here (default: stdout)")
    parser.add_argument("--dashboard", action="store_true",
                        help="Also emit an HTML dashboard next to the JSON output")
    opts = parser.parse_args()

    doc_path = Path(opts.input).resolve()
    if not doc_path.is_file():
        _die(f"Input file not found: {doc_path}")

    # Bundle files live next to this script; only the manifest is mandatory.
    manifest = _load_json(HERE / "manifest.json", required=True)
    catalog = _load_json(HERE / "catalog.json", required=False) or []
    historical = _load_calibration(HERE / "confidence_calibration.json")
    corner_cases = _load_json(HERE / "corner_cases.json", required=False)

    selected = manifest.get("rules", [])
    if opts.rule:
        selected = [r for r in selected if r.get("id") == opts.rule]
        if not selected:
            _die(f"No rule '{opts.rule}' in manifest")

    if not _check_env():
        sys.exit(2)

    started = datetime.now(timezone.utc).isoformat()
    clock_start = time.monotonic()

    results = [
        _run_one(rule, doc_path, catalog,
                 historical=historical, corner_cases=corner_cases)
        for rule in selected
    ]
    any_failure = any(entry.get("exit_code", 0) != 0 for entry in results)

    duration_ms = int((time.monotonic() - clock_start) * 1000)
    aggregated = {
        "release": manifest.get("label"),
        "snapshot_tag": manifest.get("snapshot_tag"),
        "input": str(doc_path),
        "started_at": started,
        "duration_ms": duration_ms,
        "results": results,
    }

    payload = json.dumps(aggregated, ensure_ascii=False, indent=2)
    if not opts.output:
        print(payload)
    else:
        json_path = Path(opts.output).resolve()
        json_path.write_text(payload, encoding="utf-8")
        # Status messages go to stderr so stdout stays reserved for the JSON.
        print(f"Wrote {json_path}", file=sys.stderr)

    if opts.dashboard:
        page = kc_dash.render(aggregated, manifest)
        if opts.output:
            html_path = Path(opts.output).with_suffix(".html")
        else:
            html_path = HERE / f"result_{int(time.time())}.html"
        html_path.write_text(page, encoding="utf-8")
        print(f"Dashboard: {html_path}", file=sys.stderr)

    sys.exit(1 if any_failure else 0)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def _run_one(rule, input_path, catalog, *, historical, corner_cases):
    """Run one rule's workflow script on the input document and score it.

    rule: manifest entry; its "id" and "workflow" (path relative to this
      script's directory) keys are read.
    input_path: path of the document, passed to the workflow as its only
      argument.
    catalog: accepted for interface compatibility; not used here.
    historical, corner_cases: forwarded unchanged to the confidence scorer.

    Returns a result dict with keys rule_id, value, confidence,
    confidence_band, extraction_method, exit_code and raw. Setup problems
    (no workflow path, missing file, timeout, interpreter not found) are
    returned as an error result rather than raised.
    """
    rule_id = rule.get("id")
    workflow_rel = rule.get("workflow")
    if not workflow_rel:
        return _error_result(rule_id, "no workflow path in manifest")

    workflow_abs = (HERE / workflow_rel).resolve()
    if not workflow_abs.is_file():
        return _error_result(rule_id, f"workflow not found: {workflow_rel}")

    # Run the workflow with the interpreter that is running this script.
    # A bare "python" is absent on systems that only install "python3",
    # which is what this script's own shebang asks for.
    interpreter = sys.executable or "python"
    try:
        proc = subprocess.run(
            [interpreter, str(workflow_abs), str(input_path)],
            capture_output=True, text=True, timeout=300,
        )
    except subprocess.TimeoutExpired:
        return _error_result(rule_id, "workflow timed out (300s)")
    except FileNotFoundError:
        return _error_result(rule_id, f"Python interpreter not found: {interpreter}")

    raw_stdout = (proc.stdout or "").strip()
    raw_data = _parse_last_json_line(raw_stdout)

    # Only a JSON object carries named fields; anything else (nothing
    # parsed, or a JSON array) is treated as having none, instead of
    # failing on a missing .get().
    fields = raw_data if isinstance(raw_data, dict) else {}
    extracted_value = _extract_value(raw_data)
    method = fields.get("extraction_method") or "llm"
    source_text = fields.get("raw_text") or ""

    # NOTE(review): a missing value is passed to the scorer as the string
    # "None" — confirm that is what kc_conf.score expects.
    conf = kc_conf.score(
        rule_id=rule_id,
        extracted_value=str(extracted_value),
        source_text=source_text,
        method=method,
        document=str(input_path),
        historical=historical,
        corner_cases=corner_cases,
    )

    return {
        "rule_id": rule_id,
        "value": extracted_value,
        "confidence": conf,
        "confidence_band": kc_conf.band(conf),
        "extraction_method": method,
        "exit_code": proc.returncode,
        # With no parsable JSON, keep the head of stderr for diagnosis.
        "raw": raw_data if raw_data is not None else {"stderr": (proc.stderr or "")[:2000]},
    }
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def _error_result(rule_id, msg):
    """Build the result entry for a rule that could not be run.

    The entry has the same keys, in the same order, as a normal result:
    no value, zero confidence in the "low" band, method "fallback",
    exit code 2, and *msg* stored under raw["error"].
    """
    return dict(
        rule_id=rule_id,
        value=None,
        confidence=0.0,
        confidence_band="low",
        extraction_method="fallback",
        exit_code=2,
        raw=dict(error=msg),
    )
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def _parse_last_json_line(text):
    """Return the last line of *text* that parses as a JSON object, or None.

    Lines are examined from the bottom up. Blank lines, lines that do not
    start with "{", and lines that fail to parse are skipped, so log output
    printed around the result is tolerated. JSON arrays are skipped as
    well: the result is always a dict or None.
    """
    if not text:
        return None
    # Split on "\n" only: str.splitlines() would also break on separators
    # such as U+2028, which may appear unescaped inside a JSON string.
    for line in reversed(text.split("\n")):
        candidate = line.strip()
        if not candidate.startswith("{"):
            continue
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            return parsed
    return None
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def _extract_value(raw):
    """Return the extracted value from a workflow's parsed output.

    Checks the keys "extracted_value", "value" and "result" in that order
    and returns the value of the first one present. Returns None when
    *raw* is not a dict or holds none of those keys.
    """
    if not isinstance(raw, dict):
        return None
    candidates = ("extracted_value", "value", "result")
    return next((raw[key] for key in candidates if key in raw), None)
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def _load_json(path, *, required):
    """Read and parse a UTF-8 JSON file.

    path: pathlib-style path of the file.
    required: when true, a missing file is fatal (reported through _die);
      when false, a missing file yields None.

    A file that exists but is not valid JSON is always fatal.
    """
    if path.is_file():
        try:
            return json.loads(path.read_text(encoding="utf-8"))
        except json.JSONDecodeError as e:
            _die(f"Invalid JSON in {path.name}: {e}")
            return None
    if required:
        _die(f"Required file missing: {path.name}")
    return None
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def _load_calibration(path):
    """Load the optional confidence-calibration file as a dict.

    Returns the file's "historical_accuracy" entry when it is present and
    non-empty, otherwise the whole document. Returns {} when the file is
    missing, empty, or does not contain a JSON object (for example a JSON
    array), so an unexpected shape in this optional file cannot abort the
    run with an AttributeError.
    """
    data = _load_json(path, required=False)
    if not isinstance(data, dict):
        return {}
    return data.get("historical_accuracy") or data
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def _check_env():
    """Check that the environment variables the workflows need are set.

    LLM_API_KEY must be non-empty, and at least one of TIER1..TIER4 must be
    non-empty. Returns True when both hold; otherwise prints what is
    missing to stderr and returns False.
    """
    missing = [name for name in ("LLM_API_KEY",) if not os.environ.get(name)]
    if not any(os.environ.get(tier) for tier in ("TIER1", "TIER2", "TIER3", "TIER4")):
        missing.append("at least one of TIER1..TIER4")
    if not missing:
        return True
    print("Missing env vars: " + ", ".join(missing), file=sys.stderr)
    print("Workflows in this release call worker LLMs and need these set.", file=sys.stderr)
    return False
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def _die(msg):
    """Print *msg* to stderr with an "error: " prefix and exit with status 2."""
    print(f"error: {msg}", file=sys.stderr)
    raise SystemExit(2)
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
if __name__ == "__main__":
|
|
230
|
+
main()
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
#!/bin/sh
# Serve this release directory locally so dashboards open in a browser.
# Generated HTML files (e.g. result_*.html, dashboard.html) become reachable
# at http://localhost:<port>/...
#
# Usage:
#   ./serve.sh              # default port 8080
#   ./serve.sh 9000         # custom port
#
# The server listens on 127.0.0.1 only, so the release directory is not
# exposed to other machines on the network. Set KC_BIND to another address
# (e.g. KC_BIND=0.0.0.0) to change that deliberately.
#
# Stop with Ctrl-C.

PORT="${1:-8080}"
BIND="${KC_BIND:-127.0.0.1}"
cd -- "$(dirname "$0")" || exit 1

# Prefer python3: many systems do not install a bare "python" command.
if command -v python3 >/dev/null 2>&1; then
    PYTHON=python3
elif command -v python >/dev/null 2>&1; then
    PYTHON=python
else
    echo "error: no python3 or python interpreter found on PATH" >&2
    exit 1
fi

echo "Serving $(pwd) on http://localhost:${PORT}/"
exec "$PYTHON" -m http.server --bind "$BIND" "$PORT"
|
|
@@ -49,6 +49,12 @@ Many real verification tasks require semantic understanding — "is this descrip
|
|
|
49
49
|
|
|
50
50
|
If a method's results fall below the accuracy threshold, try a different method or a more capable model. If regex works and meets accuracy — keep it, it's free. If regex produces results below threshold, escalate to worker LLM. If a cheap worker LLM isn't accurate enough, try a more capable tier. Record what works for each extraction type in AGENT.md for future reference.
|
|
51
51
|
|
|
52
|
+
## Project Glossary
|
|
53
|
+
|
|
54
|
+
The project glossary (built and maintained by `rule-extraction`, stored at `rules/glossary.json`) is a useful resource when designing extraction. It records canonical names and known aliases for entities that appear across rules. Reading it before extracting helps keep entity names schema-aligned and avoids parallel labels for the same thing.
|
|
55
|
+
|
|
56
|
+
Whether the glossary becomes more than a naming convention — for instance, driving cheap pattern matching for entities with stable surface forms — is a per-project judgment. Apply the same cost-accuracy logic as elsewhere: whatever method meets the accuracy threshold for the task at hand.
|
|
57
|
+
|
|
52
58
|
## Schema Design
|
|
53
59
|
|
|
54
60
|
Define the expected output for each extraction. Keep it simple and JIT:
|
|
@@ -61,6 +61,17 @@ After the conversation:
|
|
|
61
61
|
5. Initialize version tracking (a `versions.json` manifest).
|
|
62
62
|
6. Log the bootstrap conversation summary for future reference.
|
|
63
63
|
|
|
64
|
+
## Scheduled Ingestion (Production)
|
|
65
|
+
|
|
66
|
+
Once a project is past bootstrap and into production, fresh documents often arrive on a regular cadence — daily regulator drops, hourly API pulls, batch uploads from upstream systems. Use the `schedule_fetch` tool to register ingestion jobs the OS scheduler runs while kc-beta is closed:
|
|
67
|
+
|
|
68
|
+
- Each job is a shell command (rsync, curl, custom script) that lands files in `$INPUT_DIR`.
|
|
69
|
+
- KC writes a wrapper script under `scripts/ingest/<job-id>.sh`; the user installs the script line into their crontab via `crontab -e`.
|
|
70
|
+
- Newly-arrived files are auto-prefixed with `<job-id>_<UTC-timestamp>_` so origin and arrival time are visible in the filename.
|
|
71
|
+
- View status with `/schedule` or `schedule_fetch list`. The tail of `logs/ingest.log` shows recent runs.
|
|
72
|
+
|
|
73
|
+
Discuss the cadence with the developer user during bootstrap — knowing the production input rhythm shapes how skills and workflows should be written (batch vs streaming, idempotency requirements, etc.).
|
|
74
|
+
|
|
64
75
|
## When to Re-Bootstrap
|
|
65
76
|
|
|
66
77
|
Return to this skill when:
|
|
@@ -106,7 +106,19 @@ For production Input/ documents:
|
|
|
106
106
|
4. Review the selected results (LLM-as-Judge or manual review by the developer user).
|
|
107
107
|
5. Compute batch accuracy from reviewed results.
|
|
108
108
|
6. Log batch QC report.
|
|
109
|
-
7.
|
|
109
|
+
7. Move processed input docs to `input/archived/` via `archive_file` so the next session sees only fresh arrivals.
|
|
110
|
+
8. If accuracy is acceptable, finalize the batch. If not, trigger evolution loop.
|
|
111
|
+
|
|
112
|
+
Production input often arrives on a schedule (see `bootstrap-workspace` → "Scheduled Ingestion"). Files in `input/` are auto-prefixed with `<job-id>_<UTC-timestamp>_` by the ingestion wrapper, so each batch carries provenance in its filenames. When a batch fails QC, the prefixes let you trace which scheduled run produced the bad data.
|
|
113
|
+
|
|
114
|
+
## Two Dashboard Surfaces
|
|
115
|
+
|
|
116
|
+
There are two distinct dashboards in this system:
|
|
117
|
+
|
|
118
|
+
- **Developer dashboard** — `dashboard_render` tool, generated inside the workspace from `output/results/`, `logs/evolution/`, and `output/qc/`. For your audit and the developer user's day-to-day monitoring during BUILD and DISTILL.
|
|
119
|
+
- **End-user dashboard** — the `render_dashboard.py` script bundled inside a release (built via the `release` tool). For non-developer recipients of a packaged release. It renders results from a single `run.py` invocation; no workspace dependency.
|
|
120
|
+
|
|
121
|
+
When a release is built, point end users at the bundled dashboard, not the workspace one. The workspace dashboard remains your developer surface.
|
|
110
122
|
|
|
111
123
|
## Developer User Involvement
|
|
112
124
|
|
|
@@ -104,6 +104,41 @@ Maintain a lightweight catalog of all extracted rules. This is your index, not t
|
|
|
104
104
|
|
|
105
105
|
Format: a simple markdown table or JSON file. Do not over-engineer this. The catalog exists to give you and the developer user an overview of progress.
|
|
106
106
|
|
|
107
|
+
## Project Glossary
|
|
108
|
+
|
|
109
|
+
Alongside the rule catalog, build a project glossary — a living vocabulary of the entities, terms, and patterns the verification system encounters. The glossary is what keeps entity names consistent across rules: without it, the same balance-sheet item might be named "注册资本", "registered capital", and "paid-in capital" by three different rule skills, breaking shared-entity matching and producing inconsistent extraction outputs.
|
|
110
|
+
|
|
111
|
+
The glossary is not frozen at the end of extraction. It is a living document. Update it when you discover new aliases in samples, when a worker LLM extraction reveals a variant phrasing, when corner cases surface unfamiliar terminology. Both the coding agent and any operator can edit it.
|
|
112
|
+
|
|
113
|
+
### When to seed it
|
|
114
|
+
|
|
115
|
+
During rule extraction. As you decompose each rule, note the entities the rule references — capital ratios, signature pages, related-party transactions, dates, parties, monetary values. Seed the glossary with the canonical name and any aliases already visible in the source documents.
|
|
116
|
+
|
|
117
|
+
### Storage and shape
|
|
118
|
+
|
|
119
|
+
Save as `rules/glossary.json` next to `catalog.json`. Each entry is small:
|
|
120
|
+
|
|
121
|
+
```json
|
|
122
|
+
{
|
|
123
|
+
"canonical": "registered_capital",
|
|
124
|
+
"aliases": ["注册资本", "registered capital", "实收资本"],
|
|
125
|
+
"definition": "The capital amount registered with regulators",
|
|
126
|
+
"entity_type": "monetary_value",
|
|
127
|
+
"seen_in": ["rules/regulation_A.pdf:p12", "samples/annual_report_2024.pdf:p3"],
|
|
128
|
+
"status": "extracted"
|
|
129
|
+
}
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
Status field tracks maturity: `extracted` (from rules), `validated` (confirmed in samples), `production` (used by deployed workflows). Add or drop fields as the project demands — same JIT philosophy as the rule schema.
|
|
133
|
+
|
|
134
|
+
### How it integrates
|
|
135
|
+
|
|
136
|
+
- `rule-graph` consumes the glossary so `shares_entity` edges reference canonical labels rather than free-text strings.
|
|
137
|
+
- `entity-extraction` references the glossary for canonical names and known aliases when designing extraction logic.
|
|
138
|
+
- Skills authored under `skill-authoring` should use canonical names in their schemas.
|
|
139
|
+
|
|
140
|
+
How the glossary is used downstream is a per-project judgment. A mature glossary may enable cheap pattern-based matching for some entities; for others it just keeps naming consistent. Let the cost-accuracy logic in `entity-extraction` decide per case.
|
|
141
|
+
|
|
107
142
|
## Handling Ambiguity
|
|
108
143
|
|
|
109
144
|
Regulations are often ambiguous. When you encounter ambiguity:
|
|
@@ -43,6 +43,22 @@ Two rules that can produce contradictory guidance. Regulation A requires disclos
|
|
|
43
43
|
|
|
44
44
|
Edge cases that affect multiple rules. A document with an unusual structure (merged cells in a table, non-standard date format) may cause extraction failures across several rules. The graph links these rules to the shared corner case so a fix in one propagates awareness to others.
|
|
45
45
|
|
|
46
|
+
## Project Glossary
|
|
47
|
+
|
|
48
|
+
The glossary (built and owned by `rule-extraction`, stored at `rules/glossary.json`) is the canonical-label registry that makes `shares_entity` edges meaningful. Without it, two rules can target the same entity under different names and the edge between them never gets drawn.
|
|
49
|
+
|
|
50
|
+
Edges that reference entities should use the glossary's canonical labels, not free-text strings copied from rule descriptions:
|
|
51
|
+
|
|
52
|
+
```json
|
|
53
|
+
{"from": "R001", "to": "R004", "type": "shares_entity", "entity": "registered_capital"}
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
Here, `registered_capital` is the canonical name in `rules/glossary.json`, with aliases such as `注册资本` and `paid-in capital` recorded under it.
|
|
57
|
+
|
|
58
|
+
When the glossary is updated — new aliases discovered in samples, two entries merged, a definition refined — revisit affected `shares_entity` edges. New aliases may surface previously hidden cross-rule connections; merged entries collapse parallel edges into one.
|
|
59
|
+
|
|
60
|
+
The glossary is built and owned by `rule-extraction`; `rule-graph` just consumes it.
|
|
61
|
+
|
|
46
62
|
## Three Uses
|
|
47
63
|
|
|
48
64
|
### 1. Impact Analysis
|
|
@@ -135,6 +135,14 @@ The coding agent's skill-based results are the ground truth. For each document i
|
|
|
135
135
|
|
|
136
136
|
Each iteration of a workflow is a new version file: `workflow_v1.py`, `workflow_v2.py`, etc. Track which version is active in `config.json`. See `version-control` skill for the full methodology.
|
|
137
137
|
|
|
138
|
+
## Releasing Workflows
|
|
139
|
+
|
|
140
|
+
Once workflows hit their accuracy threshold, they can be packaged for end users via the `release` tool. Each release is a self-contained directory under `output/releases/<slug>/` with the pinned workflows, a Python runner, a confidence scorer, an HTML dashboard generator, and a `serve.sh` helper. The bundle has no kc-beta dependency — anyone with Python and a worker LLM API key can run `python run.py <doc>` and produce verification results.
|
|
141
|
+
|
|
142
|
+
What to include is your call: all rules in the catalog, or a curated subset via the `include` parameter. You can also bundle 1-3 representative samples as `fixtures/` if you want the recipient to be able to dry-run without their own data.
|
|
143
|
+
|
|
144
|
+
The `release` tool snapshots the workspace first (git tag `snap/release-<slug>`), so the bundle is regenerable from git even if `output/releases/` is later cleaned. Decide when to release — there's no automation, no forced cadence. Typical triggers: workflows reach SKILL/WORKFLOW_ACCURACY thresholds, a stakeholder needs a hand-off, a production cron should run pinned versions instead of latest. Discuss with the developer user.
|
|
145
|
+
|
|
138
146
|
## Cost Tracking
|
|
139
147
|
|
|
140
148
|
Track the cost of each workflow run:
|
|
@@ -96,6 +96,19 @@ Tags enable three capabilities that you cannot afford to lose:
|
|
|
96
96
|
|
|
97
97
|
Tag format: a simple string field on every intermediate output. Example values: `regex`, `python_calc`, `llm_tier2`, `manual_review`. Be consistent within a project. Define the tag vocabulary once at project setup and enforce it across all skills and workflows.
|
|
98
98
|
|
|
99
|
+
## Multi-agent coordination — keep it lock-free
|
|
100
|
+
|
|
101
|
+
When a task is large enough that you reach for `agent_tool` to spawn parallel sub-agents, partition by an independent unit (one rule per sub-agent, one document per sub-agent, etc.) so the sub-agents never need to coordinate through a shared mutable file.
|
|
102
|
+
|
|
103
|
+
Lesson from a peer-team failure: they tried equal-status agents claiming work via a shared coordination file with locks. Two predictable failures emerged. (1) Agents held locks too long or forgot to release them; even with locks working, twenty agents' throughput dropped to that of two or three because most time went to waiting. (2) Fragility — agents could fail while holding a lock, try to acquire a lock they already held, or update the coordination file without acquiring a lock at all.
|
|
104
|
+
|
|
105
|
+
KC's preferred patterns:
|
|
106
|
+
|
|
107
|
+
- **Single-dispatcher** — `TaskManager` hands tasks out one at a time to the conductor. No locks, no peer coordination. This is the default ralph-loop architecture.
|
|
108
|
+
- **Partition-by-unit** — when spawning sub-agents via `agent_tool`, give each one a non-overlapping slice (per-rule, per-document). Sub-agents write to their own `sub_agents/<taskId>/` for state, and to per-rule paths in `rule_skills/<id>/` or `workflows/<id>/` for shared artifacts. Block 11's git auto-commit serializes the shared writes; partition-by-rule keeps last-writer-wins from being a problem.
|
|
109
|
+
|
|
110
|
+
If two would-be sub-agents need to talk to each other to make progress, they should probably be one task (run sequentially) or a sequence (the parent dispatches the second after the first finishes), not concurrent peers.
|
|
111
|
+
|
|
99
112
|
## Anti-Patterns
|
|
100
113
|
|
|
101
114
|
Five failure modes recur across projects. Learn to recognize them early.
|