kc-beta 0.3.2 → 0.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/agent/confidence-scorer.js +8 -0
- package/src/agent/context.js +25 -0
- package/src/agent/corner-case-registry.js +5 -0
- package/src/agent/engine.js +514 -75
- package/src/agent/event-log.js +15 -2
- package/src/agent/history.js +91 -23
- package/src/agent/pipelines/initializer.js +3 -6
- package/src/agent/retry.js +9 -1
- package/src/agent/scheduler.js +276 -0
- package/src/agent/session-state.js +11 -2
- package/src/agent/task-manager.js +5 -0
- package/src/agent/tools/agent-tool.js +57 -14
- package/src/agent/tools/archive-file.js +94 -0
- package/src/agent/tools/copy-to-workspace.js +140 -0
- package/src/agent/tools/phase-advance.js +60 -0
- package/src/agent/tools/release.js +322 -0
- package/src/agent/tools/schedule-fetch.js +118 -0
- package/src/agent/tools/snapshot.js +101 -0
- package/src/agent/tools/workspace-file.js +10 -7
- package/src/agent/version-manager.js +29 -120
- package/src/agent/workspace.js +127 -4
- package/src/cli/components.js +4 -1
- package/src/cli/index.js +57 -4
- package/src/config.js +10 -1
- package/template/release-runtime/README.md.tmpl +84 -0
- package/template/release-runtime/kc_runtime/__init__.py +2 -0
- package/template/release-runtime/kc_runtime/confidence.py +93 -0
- package/template/release-runtime/kc_runtime/dashboard.py +208 -0
- package/template/release-runtime/render_dashboard.py +49 -0
- package/template/release-runtime/run.py +230 -0
- package/template/release-runtime/serve.sh +15 -0
- package/template/skills/en/meta-meta/bootstrap-workspace/SKILL.md +11 -0
- package/template/skills/en/meta-meta/quality-control/SKILL.md +13 -1
- package/template/skills/en/meta-meta/skill-to-workflow/SKILL.md +8 -0
- package/template/skills/en/meta-meta/task-decomposition/SKILL.md +13 -0
- package/template/skills/en/meta-meta/version-control/SKILL.md +13 -0
- package/template/skills/zh/meta-meta/bootstrap-workspace/SKILL.md +11 -0
- package/template/skills/zh/meta-meta/quality-control/SKILL.md +12 -0
- package/template/skills/zh/meta-meta/skill-to-workflow/SKILL.md +8 -0
- package/template/skills/zh/meta-meta/task-decomposition/SKILL.md +16 -0
- package/template/skills/zh/meta-meta/version-control/SKILL.md +13 -0
- package/template/workspace.gitignore +22 -0
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
"""
|
|
2
|
+
End-user dashboard renderer — Python port of DashboardRenderTool._renderHtml.
|
|
3
|
+
|
|
4
|
+
Takes a release-run result JSON (the output of run.py) and emits a static
|
|
5
|
+
HTML dashboard. Dark theme, two tabs (Summary + Per-Rule), no external
|
|
6
|
+
dependencies, no JS framework — vanilla JS for tab switching only.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import html as _html
|
|
10
|
+
import json
|
|
11
|
+
from datetime import datetime, timezone
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def render(result, manifest):
    """
    Render a release-run result as a complete, self-contained HTML page.

    result: dict from run.py — keys: release, snapshot_tag, input,
        started_at, duration_ms, results: [{rule_id, value, confidence,
        confidence_band, extraction_method, exit_code, raw}]
    manifest: dict from the bundle's manifest.json (for header info)
    Returns: a complete HTML string.

    Every field is optional. Missing or null values render as empty text
    (or 0 for numbers) instead of raising, and all data-derived text is
    HTML-escaped before being placed into the page.
    """
    # `or` chains (rather than dict.get defaults) also cover keys that are
    # present but null, which is what run.py writes when the manifest lacks
    # a label or snapshot tag.
    label = manifest.get("label") or result.get("release") or ""
    snap_tag = manifest.get("snapshot_tag") or result.get("snapshot_tag") or ""
    rule_titles = {
        rule.get("id"): rule.get("title") or ""
        for rule in manifest.get("rules") or []
        if isinstance(rule, dict)
    }
    results = [r for r in result.get("results") or [] if isinstance(r, dict)]
    generated_at = datetime.now(timezone.utc).isoformat()

    # Aggregates and per-rule markup are built in a single pass.
    by_band = {"high": 0, "medium": 0, "low": 0}
    failed = 0
    summary_rows = []
    detail_blocks = []
    for r in results:
        rid = r.get("rule_id")
        title = rule_titles.get(rid, "")
        value = r.get("value")
        if value is None or value == "":
            # Fall back to the raw workflow output only when no value was
            # recorded; legitimate falsy values such as 0 or False are kept.
            value = _value_from_raw(r.get("raw"))
        value_html = _esc(_short(value))
        conf = _as_float(r.get("confidence"))
        raw_band = r.get("confidence_band")
        band = str(raw_band or "low")
        method = r.get("extraction_method") or "?"
        exit_code = r.get("exit_code", 0)
        succeeded = exit_code == 0

        by_band[band] = by_band.get(band, 0) + 1
        if not succeeded:
            failed += 1

        status_icon = "✓" if succeeded else "✗"
        status_class = f"band-{band}" if succeeded else "band-fail"
        summary_rows.append(
            # The class name comes from result data, so it is escaped too.
            f"<tr class='{_esc(status_class)}'>"
            f"<td>{status_icon}</td>"
            f"<td><code>{_esc(rid)}</code></td>"
            f"<td>{_esc(title)}</td>"
            f"<td>{value_html}</td>"
            f"<td>{conf:.3f}</td>"
            f"<td>{_esc(band)}</td>"
            f"<td>{_esc(method)}</td>"
            f"</tr>"
        )

        raw_json = json.dumps(r.get("raw") or {}, ensure_ascii=False, indent=2)
        detail_blocks.append(
            f"<div class='detail-card'>"
            f"<h3><code>{_esc(rid)}</code> · {_esc(title)}</h3>"
            f"<dl>"
            f"<dt>Value</dt><dd>{value_html}</dd>"
            f"<dt>Confidence</dt><dd>{conf:.3f} ({_esc(raw_band or '')})</dd>"
            f"<dt>Method</dt><dd>{_esc(method)}</dd>"
            f"<dt>Exit code</dt><dd>{_html.escape(str(exit_code))}</dd>"
            f"</dl>"
            f"<details><summary>Raw workflow output</summary>"
            f"<pre>{_esc(raw_json)}</pre>"
            f"</details>"
            f"</div>"
        )

    return TEMPLATE.format(
        label=_esc(label),
        snap_tag=_esc(snap_tag),
        input_doc=_esc(result.get("input")),
        started=_esc(result.get("started_at")),
        duration_s=f"{_as_float(result.get('duration_ms')) / 1000:.2f}",
        total=len(results),
        high=by_band["high"],
        medium=by_band["medium"],
        low=by_band["low"],
        failed=failed,
        summary_rows="\n".join(summary_rows) or "<tr><td colspan='7'>(no results)</td></tr>",
        detail_blocks="\n".join(detail_blocks) or "<p>(no results)</p>",
        generated_at=generated_at,
    )


def _esc(value):
    """HTML-escape *value* (quotes included); None becomes an empty string."""
    return _html.escape("" if value is None else str(value))


def _as_float(value):
    """Coerce *value* to float; None or non-numeric input becomes 0.0."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return 0.0
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _short(s, n=80):
    """Return *s* as text, truncated to at most *n* characters.

    None becomes an empty string; any other value is converted with str().
    When truncation happens the last kept character is replaced by an
    ellipsis, so the result is exactly *n* characters long. A non-positive
    *n* yields an empty string (a negative slice bound would otherwise keep
    almost the whole input).
    """
    text = "" if s is None else str(s)
    if n <= 0:
        return ""
    if len(text) <= n:
        return text
    return text[: n - 1] + "…"
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def _value_from_raw(raw):
    """Pull a displayable value out of a raw workflow output.

    Looks for the keys "extracted_value", "value" and "result", in that
    order, and returns the value stored under the first key that is present
    (even if that value is None). Returns "" when *raw* is not a dict or has
    none of those keys.
    """
    if isinstance(raw, dict):
        candidate_keys = ("extracted_value", "value", "result")
        return next((raw[name] for name in candidate_keys if name in raw), "")
    return ""
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
TEMPLATE = """<!DOCTYPE html>
|
|
117
|
+
<html lang="en">
|
|
118
|
+
<head>
|
|
119
|
+
<meta charset="UTF-8">
|
|
120
|
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
121
|
+
<title>KC Release {label} — Verification Result</title>
|
|
122
|
+
<style>
|
|
123
|
+
body {{ font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
|
|
124
|
+
max-width: 1100px; margin: 0 auto; padding: 24px;
|
|
125
|
+
background: #0a0a0a; color: #e5e5e5; }}
|
|
126
|
+
h1 {{ color: #f4f4f5; font-size: 1.5em; margin-bottom: 4px; }}
|
|
127
|
+
.meta {{ color: #737373; font-size: 0.85em; margin-bottom: 24px; }}
|
|
128
|
+
.meta code {{ color: #a3a3a3; }}
|
|
129
|
+
.card {{ background: #171717; border: 1px solid #262626; border-radius: 8px;
|
|
130
|
+
padding: 16px; margin: 12px 0; }}
|
|
131
|
+
.metrics {{ display: flex; gap: 32px; flex-wrap: wrap; }}
|
|
132
|
+
.metric .value {{ font-size: 2em; font-weight: 600; }}
|
|
133
|
+
.metric .label {{ font-size: 0.8em; color: #737373; text-transform: uppercase; letter-spacing: .03em; }}
|
|
134
|
+
.v-high {{ color: #22c55e; }}
|
|
135
|
+
.v-med {{ color: #eab308; }}
|
|
136
|
+
.v-low {{ color: #f97316; }}
|
|
137
|
+
.v-fail {{ color: #ef4444; }}
|
|
138
|
+
.tabs {{ display: flex; gap: 0; border-bottom: 1px solid #262626; margin: 24px 0 12px; }}
|
|
139
|
+
.tab {{ padding: 8px 16px; cursor: pointer; color: #737373; border-bottom: 2px solid transparent; user-select: none; }}
|
|
140
|
+
.tab.active {{ color: #f4f4f5; border-bottom-color: #22c55e; }}
|
|
141
|
+
table {{ width: 100%; border-collapse: collapse; }}
|
|
142
|
+
th, td {{ text-align: left; padding: 8px 10px; border-bottom: 1px solid #262626; font-size: 0.92em; }}
|
|
143
|
+
th {{ color: #737373; font-weight: 500; font-size: 0.78em; text-transform: uppercase; letter-spacing: .04em; }}
|
|
144
|
+
td code {{ color: #a3a3a3; }}
|
|
145
|
+
tr.band-high td:nth-child(6) {{ color: #22c55e; }}
|
|
146
|
+
tr.band-medium td:nth-child(6) {{ color: #eab308; }}
|
|
147
|
+
tr.band-low td:nth-child(6) {{ color: #f97316; }}
|
|
148
|
+
tr.band-fail td:nth-child(6) {{ color: #ef4444; }}
|
|
149
|
+
.detail-card {{ background: #171717; border: 1px solid #262626; border-radius: 8px;
|
|
150
|
+
padding: 14px 18px; margin: 14px 0; }}
|
|
151
|
+
.detail-card h3 {{ margin: 0 0 10px; font-size: 1em; color: #e5e5e5; }}
|
|
152
|
+
.detail-card dl {{ display: grid; grid-template-columns: 100px 1fr; gap: 4px 16px; margin: 0; }}
|
|
153
|
+
.detail-card dt {{ color: #737373; font-size: 0.85em; }}
|
|
154
|
+
.detail-card dd {{ margin: 0; color: #e5e5e5; }}
|
|
155
|
+
details summary {{ cursor: pointer; color: #a3a3a3; font-size: 0.85em; margin-top: 8px; }}
|
|
156
|
+
pre {{ background: #0d0d0d; border: 1px solid #262626; border-radius: 4px;
|
|
157
|
+
padding: 10px; overflow-x: auto; font-size: 0.82em; color: #d4d4d4; }}
|
|
158
|
+
.footer {{ color: #525252; font-size: 0.78em; margin-top: 32px; text-align: center; }}
|
|
159
|
+
</style>
|
|
160
|
+
</head>
|
|
161
|
+
<body>
|
|
162
|
+
<h1>KC Release <code>{label}</code></h1>
|
|
163
|
+
<p class="meta">
|
|
164
|
+
Snapshot: <code>{snap_tag}</code> ·
|
|
165
|
+
Input: <code>{input_doc}</code> ·
|
|
166
|
+
Started: <code>{started}</code> ·
|
|
167
|
+
Duration: <code>{duration_s}s</code>
|
|
168
|
+
</p>
|
|
169
|
+
|
|
170
|
+
<div class="card metrics">
|
|
171
|
+
<div class="metric"><div class="value">{total}</div><div class="label">Rules run</div></div>
|
|
172
|
+
<div class="metric"><div class="value v-high">{high}</div><div class="label">High confidence</div></div>
|
|
173
|
+
<div class="metric"><div class="value v-med">{medium}</div><div class="label">Medium</div></div>
|
|
174
|
+
<div class="metric"><div class="value v-low">{low}</div><div class="label">Low</div></div>
|
|
175
|
+
<div class="metric"><div class="value v-fail">{failed}</div><div class="label">Failed</div></div>
|
|
176
|
+
</div>
|
|
177
|
+
|
|
178
|
+
<div class="tabs">
|
|
179
|
+
<div class="tab active" data-target="summary" onclick="kcShow('summary', this)">Summary</div>
|
|
180
|
+
<div class="tab" data-target="detail" onclick="kcShow('detail', this)">Per-rule detail</div>
|
|
181
|
+
</div>
|
|
182
|
+
|
|
183
|
+
<div id="summary" class="view">
|
|
184
|
+
<div class="card">
|
|
185
|
+
<table>
|
|
186
|
+
<tr><th></th><th>Rule</th><th>Title</th><th>Value</th><th>Conf.</th><th>Band</th><th>Method</th></tr>
|
|
187
|
+
{summary_rows}
|
|
188
|
+
</table>
|
|
189
|
+
</div>
|
|
190
|
+
</div>
|
|
191
|
+
|
|
192
|
+
<div id="detail" class="view" style="display:none">
|
|
193
|
+
{detail_blocks}
|
|
194
|
+
</div>
|
|
195
|
+
|
|
196
|
+
<p class="footer">Generated {generated_at} — KC Agent CLI</p>
|
|
197
|
+
|
|
198
|
+
<script>
|
|
199
|
+
function kcShow(id, tab) {{
|
|
200
|
+
document.querySelectorAll('.view').forEach(v => v.style.display = 'none');
|
|
201
|
+
document.getElementById(id).style.display = '';
|
|
202
|
+
document.querySelectorAll('.tab').forEach(t => t.classList.remove('active'));
|
|
203
|
+
tab.classList.add('active');
|
|
204
|
+
}}
|
|
205
|
+
</script>
|
|
206
|
+
</body>
|
|
207
|
+
</html>
|
|
208
|
+
"""
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Re-render an HTML dashboard from an existing run.py result JSON.
|
|
4
|
+
|
|
5
|
+
Useful when run.py was invoked without --dashboard, or when the dashboard
|
|
6
|
+
template is updated and you want to re-render past results.
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
python render_dashboard.py <result.json> [--output dashboard.html]
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import argparse
|
|
13
|
+
import json
|
|
14
|
+
import sys
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
HERE = Path(__file__).resolve().parent
|
|
18
|
+
sys.path.insert(0, str(HERE))
|
|
19
|
+
|
|
20
|
+
from kc_runtime import dashboard as kc_dash
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def main():
    """Re-render the HTML dashboard for an existing run.py result file.

    Reads the result JSON named on the command line and the manifest.json
    that sits next to this script, renders them with kc_dash.render(), and
    writes the HTML to --output (default: the result path with an .html
    suffix). Exits with status 2 when either file is missing, unreadable,
    not valid JSON, or not a JSON object.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("result", help="Path to a result.json produced by run.py")
    ap.add_argument("--output", "-o", help="HTML output path (default: alongside result)")
    args = ap.parse_args()

    result_path = Path(args.result).resolve()
    if not result_path.is_file():
        print(f"error: result file not found: {result_path}", file=sys.stderr)
        sys.exit(2)

    manifest_path = HERE / "manifest.json"
    if not manifest_path.is_file():
        print("error: manifest.json not found alongside this script", file=sys.stderr)
        sys.exit(2)

    loaded = []
    for path in (result_path, manifest_path):
        try:
            # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
            data = json.loads(path.read_text(encoding="utf-8"))
        except (OSError, ValueError) as exc:
            print(f"error: cannot read JSON from {path}: {exc}", file=sys.stderr)
            sys.exit(2)
        if not isinstance(data, dict):
            # render() looks fields up with .get(), so a JSON object is required.
            print(f"error: {path} must contain a JSON object", file=sys.stderr)
            sys.exit(2)
        loaded.append(data)
    result, manifest = loaded

    html = kc_dash.render(result, manifest)

    out_path = Path(args.output) if args.output else result_path.with_suffix(".html")
    out_path.write_text(html, encoding="utf-8")
    print(f"Wrote {out_path}")
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
if __name__ == "__main__":
|
|
49
|
+
main()
|
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
KC release runner — standalone, no kc-beta dependency.
|
|
4
|
+
|
|
5
|
+
Loads the bundled release manifest, runs each rule's workflow against an
|
|
6
|
+
input document, scores confidence, aggregates results.
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
python run.py <input-doc> [--rule R001] [--output result.json] [--dashboard]
|
|
10
|
+
|
|
11
|
+
Required env vars (same conventions as KC's .env):
|
|
12
|
+
LLM_API_KEY, LLM_BASE_URL
|
|
13
|
+
TIER1, TIER2, TIER3, TIER4 (any subset of model lists, comma-separated)
|
|
14
|
+
|
|
15
|
+
Workflows are invoked as `python <workflow_path> <input-doc>` and must emit
|
|
16
|
+
their result as a single JSON object on the last line of stdout.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
import argparse
|
|
20
|
+
import json
|
|
21
|
+
import os
|
|
22
|
+
import subprocess
|
|
23
|
+
import sys
|
|
24
|
+
import time
|
|
25
|
+
from datetime import datetime, timezone
|
|
26
|
+
from pathlib import Path
|
|
27
|
+
|
|
28
|
+
# kc_runtime is bundled next to this file
|
|
29
|
+
HERE = Path(__file__).resolve().parent
|
|
30
|
+
sys.path.insert(0, str(HERE))
|
|
31
|
+
|
|
32
|
+
from kc_runtime import confidence as kc_conf
|
|
33
|
+
from kc_runtime import dashboard as kc_dash
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def main():
    """CLI entry point: run the release's rules against one input document.

    Loads manifest.json (required) plus the optional catalog, calibration and
    corner-case files from the script directory, runs every rule in the
    manifest (or only --rule) through _run_one(), and emits the aggregated
    result as JSON to --output or stdout. With --dashboard an HTML dashboard
    is written as well.

    Exit status: 0 when every rule exited with code 0, 1 when at least one
    rule failed, 2 for usage or configuration errors (missing input, missing
    or invalid manifest, unknown rule, missing environment variables).
    """
    ap = argparse.ArgumentParser(description="Run a KC release on a document.")
    ap.add_argument("input", help="Path to the input document (PDF, DOCX, TXT, ...)")
    ap.add_argument("--rule", help="Run only this rule id (default: all rules in catalog)")
    ap.add_argument("--output", "-o", help="Write aggregated JSON here (default: stdout)")
    ap.add_argument("--dashboard", action="store_true",
                    help="Also emit an HTML dashboard next to the JSON output")
    args = ap.parse_args()

    input_path = Path(args.input).resolve()
    if not input_path.is_file():
        _die(f"Input file not found: {input_path}")

    manifest = _load_json(HERE / "manifest.json", required=True)
    if not isinstance(manifest, dict):
        # Everything below reads the manifest with .get().
        _die("manifest.json must contain a JSON object")
    catalog = _load_json(HERE / "catalog.json", required=False) or []
    historical = _load_calibration(HERE / "confidence_calibration.json")
    corner_cases = _load_json(HERE / "corner_cases.json", required=False)

    rules = manifest.get("rules") or []
    if args.rule:
        rules = [r for r in rules if r.get("id") == args.rule]
        if not rules:
            _die(f"No rule '{args.rule}' in manifest")

    if not _check_env():
        sys.exit(2)

    started = datetime.now(timezone.utc).isoformat()
    t0 = time.monotonic()

    results = [
        _run_one(rule, input_path, catalog,
                 historical=historical, corner_cases=corner_cases)
        for rule in rules
    ]
    any_failure = any(r.get("exit_code", 0) != 0 for r in results)

    duration_ms = int((time.monotonic() - t0) * 1000)
    aggregated = {
        "release": manifest.get("label"),
        "snapshot_tag": manifest.get("snapshot_tag"),
        "input": str(input_path),
        "started_at": started,
        "duration_ms": duration_ms,
        "results": results,
    }

    out_text = json.dumps(aggregated, ensure_ascii=False, indent=2)
    out_path = Path(args.output).resolve() if args.output else None
    if out_path is not None:
        # Create the target directory so a fresh output location does not
        # fail after all rules have already been run.
        out_path.parent.mkdir(parents=True, exist_ok=True)
        out_path.write_text(out_text, encoding="utf-8")
        print(f"Wrote {out_path}", file=sys.stderr)
    else:
        print(out_text)

    if args.dashboard:
        html = kc_dash.render(aggregated, manifest)
        if out_path is not None:
            html_path = out_path.with_suffix(".html")
            if html_path == out_path:
                # --output itself ends in .html: do not overwrite the JSON
                # that was just written.
                html_path = out_path.with_name(out_path.stem + ".dashboard.html")
        else:
            html_path = HERE / f"result_{int(time.time())}.html"
        html_path.write_text(html, encoding="utf-8")
        print(f"Dashboard: {html_path}", file=sys.stderr)

    sys.exit(1 if any_failure else 0)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def _run_one(rule, input_path, catalog, *, historical, corner_cases):
    """Run one rule's workflow on the input document and score the result.

    rule: manifest entry; its "id" and "workflow" (path relative to this
        script's directory) keys are read.
    input_path: path of the document, passed to the workflow as its only
        argument.
    catalog: accepted for interface compatibility; not used by this function.
    historical, corner_cases: forwarded unchanged to kc_conf.score().

    Returns a result dict with keys rule_id, value, confidence,
    confidence_band, extraction_method, exit_code and raw. Problems that
    prevent the workflow from running (no path, missing file, launch failure,
    timeout) are reported through _error_result() instead of raising.
    """
    rule_id = rule.get("id")
    workflow_rel = rule.get("workflow")
    if not workflow_rel:
        return _error_result(rule_id, "no workflow path in manifest")

    workflow_abs = (HERE / workflow_rel).resolve()
    if not workflow_abs.is_file():
        return _error_result(rule_id, f"workflow not found: {workflow_rel}")

    # Use the interpreter that is running this script: a bare "python" may be
    # absent (python3-only systems) or belong to a different environment.
    interpreter = sys.executable or "python"
    try:
        proc = subprocess.run(
            [interpreter, str(workflow_abs), str(input_path)],
            capture_output=True, text=True, errors="replace", timeout=300,
        )
    except subprocess.TimeoutExpired:
        return _error_result(rule_id, "workflow timed out (300s)")
    except FileNotFoundError:
        return _error_result(rule_id, f"Python interpreter not found: {interpreter}")
    except OSError as exc:
        return _error_result(rule_id, f"could not launch workflow: {exc}")

    raw_stdout = (proc.stdout or "").strip()
    raw_data = _parse_last_json_line(raw_stdout)
    # Only a JSON object carries named fields; anything else (None, or an
    # array on the last line) is treated as "no structured output".
    fields = raw_data if isinstance(raw_data, dict) else {}

    extracted_value = _extract_value(raw_data)
    method = fields.get("extraction_method") or "llm"
    source_text = fields.get("raw_text") or ""

    conf = kc_conf.score(
        rule_id=rule_id,
        # Avoid handing the scorer the literal string "None" when nothing
        # was extracted.
        extracted_value="" if extracted_value is None else str(extracted_value),
        source_text=source_text,
        method=method,
        document=str(input_path),
        historical=historical,
        corner_cases=corner_cases,
    )

    return {
        "rule_id": rule_id,
        "value": extracted_value,
        "confidence": conf,
        "confidence_band": kc_conf.band(conf),
        "extraction_method": method,
        "exit_code": proc.returncode,
        # Without parseable output, keep the start of stderr for diagnosis.
        "raw": raw_data if raw_data is not None else {"stderr": (proc.stderr or "")[:2000]},
    }
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def _error_result(rule_id, msg):
    """Build the result record for a rule whose workflow could not be run.

    The record has the same keys, in the same order, as a normal result:
    no value, zero confidence in the "low" band, method "fallback",
    exit code 2, and the message stored under raw["error"].
    """
    record = {"rule_id": rule_id, "value": None}
    record.update(
        confidence=0.0,
        confidence_band="low",
        extraction_method="fallback",
        exit_code=2,
    )
    record["raw"] = {"error": msg}
    return record
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def _parse_last_json_line(text):
    """Return the last line of *text* that parses as a JSON object.

    Lines are scanned from the bottom up. Blank lines, lines that do not
    start with "{", and lines that fail to parse are skipped. Only objects
    are accepted, because callers read named fields from the result; a JSON
    array line is ignored. Returns None when *text* is empty or no line
    qualifies.
    """
    if not text:
        return None
    # Split on "\n" only: str.splitlines() would also break on characters
    # such as U+2028, which may appear unescaped inside a JSON string.
    for line in reversed(text.split("\n")):
        candidate = line.strip()
        if not candidate.startswith("{"):
            continue
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            return parsed
    return None
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def _extract_value(raw):
    """Return the extracted value from a parsed workflow output.

    Checks the keys "extracted_value", "value" and "result" in that order
    and returns whatever is stored under the first one present. Returns None
    when *raw* is not a dict or contains none of those keys.
    """
    if isinstance(raw, dict):
        for field in ("extracted_value", "value", "result"):
            if field in raw:
                return raw[field]
    return None
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def _load_json(path, *, required):
    """Load and parse a UTF-8 JSON file.

    path: pathlib.Path of the file.
    required: when True a missing file is fatal; when False a missing file
        yields None.

    Returns the parsed JSON value. A file that cannot be read, is not valid
    UTF-8, or is not valid JSON is reported through _die(), which exits the
    process with status 2.
    """
    if not path.is_file():
        if required:
            _die(f"Required file missing: {path.name}")
        return None
    try:
        text = path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as e:
        # Report unreadable or mis-encoded files the same clean way as bad JSON.
        _die(f"Cannot read {path.name}: {e}")
    try:
        return json.loads(text)
    except json.JSONDecodeError as e:
        _die(f"Invalid JSON in {path.name}: {e}")
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def _load_calibration(path):
    """Load the optional confidence-calibration file.

    Returns the mapping under "historical_accuracy" when that key holds a
    non-empty value, otherwise the whole top-level object. Returns {} when
    the file is missing, empty, or its top level is not a JSON object.
    """
    data = _load_json(path, required=False)
    if not isinstance(data, dict):
        # Missing file (None) or an unexpected top-level type such as a list.
        return {}
    return data.get("historical_accuracy") or data
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def _check_env():
    """Check that the environment variables this runner requires are set.

    LLM_API_KEY must be non-empty and at least one of TIER1..TIER4 must be
    non-empty. Returns True when both hold; otherwise prints the missing
    items to stderr and returns False.
    """
    env = os.environ
    problems = [name for name in ("LLM_API_KEY",) if not env.get(name)]
    if not any(env.get(tier) for tier in ("TIER1", "TIER2", "TIER3", "TIER4")):
        problems.append("at least one of TIER1..TIER4")
    if not problems:
        return True
    print("Missing env vars: " + ", ".join(problems), file=sys.stderr)
    print("Workflows in this release call worker LLMs and need these set.", file=sys.stderr)
    return False
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def _die(msg):
    """Print *msg* to stderr with an "error: " prefix and exit with status 2."""
    message = f"error: {msg}"
    print(message, file=sys.stderr)
    raise SystemExit(2)
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
if __name__ == "__main__":
|
|
230
|
+
main()
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
#!/bin/sh
# Serve this release directory locally so dashboards open in a browser.
# Generated HTML files (e.g. result_*.html, dashboard.html) become reachable
# at http://localhost:<port>/...
#
# Usage:
#   ./serve.sh                 # default port 8080, loopback only
#   ./serve.sh 9000            # custom port
#   ./serve.sh 9000 0.0.0.0    # custom port, listen on all interfaces
#
# Stop with Ctrl-C.

PORT="${1:-8080}"
# Listen on the loopback interface by default: the banner below advertises
# localhost, and http.server would otherwise accept connections from any host.
BIND="${2:-127.0.0.1}"

# Prefer python3; many systems do not install a bare `python` command.
if command -v python3 >/dev/null 2>&1; then
  PYTHON=python3
elif command -v python >/dev/null 2>&1; then
  PYTHON=python
else
  echo "error: python3 (or python) not found on PATH" >&2
  exit 1
fi

cd "$(dirname "$0")" || exit 1
echo "Serving $(pwd) on http://localhost:${PORT}/"
exec "$PYTHON" -m http.server "$PORT" --bind "$BIND"
|
|
@@ -61,6 +61,17 @@ After the conversation:
|
|
|
61
61
|
5. Initialize version tracking (a `versions.json` manifest).
|
|
62
62
|
6. Log the bootstrap conversation summary for future reference.
|
|
63
63
|
|
|
64
|
+
## Scheduled Ingestion (Production)
|
|
65
|
+
|
|
66
|
+
Once a project is past bootstrap and into production, fresh documents often arrive on a regular cadence — daily regulator drops, hourly API pulls, batch uploads from upstream systems. Use the `schedule_fetch` tool to register ingestion jobs the OS scheduler runs while kc-beta is closed:
|
|
67
|
+
|
|
68
|
+
- Each job is a shell command (rsync, curl, custom script) that lands files in `$INPUT_DIR`.
|
|
69
|
+
- KC writes a wrapper script to `scripts/ingest/<job-id>.sh`; the user then adds the corresponding cron line to their own crontab via `crontab -e`.
|
|
70
|
+
- Newly-arrived files are auto-prefixed with `<job-id>_<UTC-timestamp>_` so origin and arrival time are visible in the filename.
|
|
71
|
+
- View status with `/schedule` or `schedule_fetch list`. The tail of `logs/ingest.log` shows the most recent runs.
|
|
72
|
+
|
|
73
|
+
Discuss the cadence with the developer user during bootstrap — knowing the production input rhythm shapes how skills and workflows should be written (batch vs streaming, idempotency requirements, etc.).
|
|
74
|
+
|
|
64
75
|
## When to Re-Bootstrap
|
|
65
76
|
|
|
66
77
|
Return to this skill when:
|
|
@@ -106,7 +106,19 @@ For production Input/ documents:
|
|
|
106
106
|
4. Review the selected results (LLM-as-Judge or manual review by the developer user).
|
|
107
107
|
5. Compute batch accuracy from reviewed results.
|
|
108
108
|
6. Log batch QC report.
|
|
109
|
-
7.
|
|
109
|
+
7. Move processed input docs to `input/archived/` via `archive_file` so the next session sees only fresh arrivals.
|
|
110
|
+
8. If accuracy is acceptable, finalize the batch. If not, trigger evolution loop.
|
|
111
|
+
|
|
112
|
+
Production input often arrives on a schedule (see `bootstrap-workspace` → "Scheduled Ingestion"). Files in `input/` are auto-prefixed with `<job-id>_<UTC-timestamp>_` by the ingestion wrapper, so each batch carries provenance in its filenames. When a batch fails QC, the prefixes let you trace which scheduled run produced the bad data.
|
|
113
|
+
|
|
114
|
+
## Two Dashboard Surfaces
|
|
115
|
+
|
|
116
|
+
There are two distinct dashboards in this system:
|
|
117
|
+
|
|
118
|
+
- **Developer dashboard** — `dashboard_render` tool, generated inside the workspace from `output/results/`, `logs/evolution/`, and `output/qc/`. For your audit and the developer user's day-to-day monitoring during BUILD and DISTILL.
|
|
119
|
+
- **End-user dashboard** — the `render_dashboard.py` script bundled inside a release (built via the `release` tool). For non-developer recipients of a packaged release. It renders results from a single `run.py` invocation; no workspace dependency.
|
|
120
|
+
|
|
121
|
+
When a release is built, point end users at the bundled dashboard, not the workspace one. The workspace dashboard remains your developer surface.
|
|
110
122
|
|
|
111
123
|
## Developer User Involvement
|
|
112
124
|
|
|
@@ -135,6 +135,14 @@ The coding agent's skill-based results are the ground truth. For each document i
|
|
|
135
135
|
|
|
136
136
|
Each iteration of a workflow is a new version file: `workflow_v1.py`, `workflow_v2.py`, etc. Track which version is active in `config.json`. See `version-control` skill for the full methodology.
|
|
137
137
|
|
|
138
|
+
## Releasing Workflows
|
|
139
|
+
|
|
140
|
+
Once workflows hit accuracy threshold, they can be packaged for end users via the `release` tool. Each release is a self-contained directory under `output/releases/<slug>/` with the pinned workflows, a Python runner, a confidence scorer, an HTML dashboard generator, and a `serve.sh` helper. The bundle has no kc-beta dependency — anyone with Python and a worker LLM API key can run `python run.py <doc>` and produce verification results.
|
|
141
|
+
|
|
142
|
+
What to include is your call: all rules in the catalog, or a curated subset via the `include` parameter. You can also bundle 1–3 representative samples as `fixtures/` if you want the recipient to be able to dry-run without their own data.
|
|
143
|
+
|
|
144
|
+
The `release` tool snapshots the workspace first (git tag `snap/release-<slug>`), so the bundle is regenerable from git even if `output/releases/` is later cleaned. Decide when to release — there's no automation, no forced cadence. Typical triggers: workflows reach SKILL/WORKFLOW_ACCURACY thresholds, a stakeholder needs a hand-off, a production cron should run pinned versions instead of latest. Discuss with the developer user.
|
|
145
|
+
|
|
138
146
|
## Cost Tracking
|
|
139
147
|
|
|
140
148
|
Track the cost of each workflow run:
|
|
@@ -96,6 +96,19 @@ Tags enable three capabilities that you cannot afford to lose:
|
|
|
96
96
|
|
|
97
97
|
Tag format: a simple string field on every intermediate output. Example values: `regex`, `python_calc`, `llm_tier2`, `manual_review`. Be consistent within a project. Define the tag vocabulary once at project setup and enforce it across all skills and workflows.
|
|
98
98
|
|
|
99
|
+
## Multi-agent coordination — keep it lock-free
|
|
100
|
+
|
|
101
|
+
When a task is large enough that you reach for `agent_tool` to spawn parallel sub-agents, partition by an independent unit (one rule per sub-agent, one document per sub-agent, etc.) so the sub-agents never need to coordinate through a shared mutable file.
|
|
102
|
+
|
|
103
|
+
Lesson from a peer-team failure: they tried equal-status agents claiming work via a shared coordination file with locks. Two predictable failures emerged. (1) Agents held locks too long or forgot to release them; even with locks working, twenty agents' throughput dropped to that of two or three because most time went to waiting. (2) Fragility — agents could fail while holding a lock, try to acquire a lock they already held, or update the coordination file without acquiring a lock at all.
|
|
104
|
+
|
|
105
|
+
KC's preferred patterns:
|
|
106
|
+
|
|
107
|
+
- **Single-dispatcher** — `TaskManager` hands tasks out one at a time to the conductor. No locks, no peer coordination. This is the default ralph-loop architecture.
|
|
108
|
+
- **Partition-by-unit** — when spawning sub-agents via `agent_tool`, give each one a non-overlapping slice (per-rule, per-document). Sub-agents write to their own `sub_agents/<taskId>/` for state, and to per-rule paths in `rule_skills/<id>/` or `workflows/<id>/` for shared artifacts. Block 11's git auto-commit serializes the shared writes; partition-by-rule keeps last-writer-wins from being a problem.
|
|
109
|
+
|
|
110
|
+
If two would-be sub-agents need to talk to each other to make progress, they should probably be one task (run sequentially) or a sequence (parent dispatches second after first finishes), not concurrent peers.
|
|
111
|
+
|
|
99
112
|
## Anti-Patterns
|
|
100
113
|
|
|
101
114
|
Five failure modes recur across projects. Learn to recognize them early.
|
|
@@ -7,6 +7,19 @@ description: Manage versioning of skills, workflows, prompts, and system configu
|
|
|
7
7
|
|
|
8
8
|
Version control here is about auditability and rollback, not collaboration. You need to know what changed, when, why, and be able to undo it if the change made things worse.
|
|
9
9
|
|
|
10
|
+
## Git Is the Source of Truth
|
|
11
|
+
|
|
12
|
+
The workspace is a git repository. Every workspace write to a tracked path (skills, workflows, rules, glossary, AGENT.md, tasks.json) is auto-committed by KC with a trace ID in the commit message. This means:
|
|
13
|
+
|
|
14
|
+
- `git log --oneline` is the timeline of every meaningful change in this session.
|
|
15
|
+
- `git diff HEAD~3 -- rule_skills/R001/` shows what changed in a skill across the last three meaningful writes.
|
|
16
|
+
- `git checkout HEAD~5 -- workflows/R001/` rolls back a workflow without touching anything else.
|
|
17
|
+
- The `snapshot` tool tags moments worth remembering (releases, "before risky operation"); restore with `git checkout snap/<label>`.
|
|
18
|
+
|
|
19
|
+
Use `sandbox_exec` with `cwd: "workspace"` to run git commands directly. Don't fight git — it's the audit trail.
|
|
20
|
+
|
|
21
|
+
The conventions below (per-version filename copies, `CHANGELOG.md`) are still useful for *human readability inside a single skill folder* — having `workflow_v1.py` and `workflow_v3.py` side-by-side lets the agent compare them without reading git history. But the system of record is git, not the deprecated `versions.json` manifest (which is no longer written for new workspaces).
|
|
22
|
+
|
|
10
23
|
## What to Version
|
|
11
24
|
|
|
12
25
|
Everything that affects verification results:
|
|
@@ -122,6 +122,17 @@ versions.json # 版本清单(工作空间根目录)
|
|
|
122
122
|
}
|
|
123
123
|
```
|
|
124
124
|
|
|
125
|
+
## 生产环境的定时摄取
|
|
126
|
+
|
|
127
|
+
项目进入生产后,新文档通常会按固定节奏到达 —— 监管机构每日发布、API 每小时拉取、上游系统批量上传。用 `schedule_fetch` 工具注册摄取任务,让 OS 调度器在 kc-beta 关闭时也能跑:
|
|
128
|
+
|
|
129
|
+
- 每个任务是一条 shell 命令(rsync、curl、自定义脚本),把文件落到 `$INPUT_DIR`。
|
|
130
|
+
- KC 在 `scripts/ingest/<job-id>.sh` 下生成一个 wrapper 脚本;用户通过 `crontab -e` 把调用该脚本的 cron 条目加进自己的 crontab。
|
|
131
|
+
- 新到达的文件会自动加上 `<job-id>_<UTC-时间戳>_` 前缀,文件名本身就告诉你来源和到达时间。
|
|
132
|
+
- 用 `/schedule` 或 `schedule_fetch list` 查看状态;`logs/ingest.log` 末尾几行展示最近的运行情况。
|
|
133
|
+
|
|
134
|
+
在初始化阶段就和开发者用户讨论这个节奏 —— 生产侧文档输入节奏直接决定 skill 和工作流的写法(批处理 vs 流式、幂等性要求等等)。
|
|
135
|
+
|
|
125
136
|
## 何时需要重新初始化
|
|
126
137
|
|
|
127
138
|
以下情况需要重新运行本技能:
|
|
@@ -167,8 +167,11 @@ IF 当前批次准确率 < WORKFLOW_ACCURACY:
|
|
|
167
167
|
5. 汇总评审结果
|
|
168
168
|
6. 判断是否需要触发演化循环
|
|
169
169
|
7. 生成质控报告
|
|
170
|
+
8. 处理完的输入文档通过 `archive_file` 移到 `input/archived/`,下次 session 只看到新到达的批次
|
|
170
171
|
```
|
|
171
172
|
|
|
173
|
+
生产环境的输入通常按节奏到达(见 `bootstrap-workspace` 的"生产环境的定时摄取"一节)。`input/` 中的文件由摄取 wrapper 自动加上 `<job-id>_<UTC-时间戳>_` 前缀,每个批次的文件名本身就带有溯源信息。批次质控不通过时,前缀能帮你定位是哪一次定时拉取出了问题。
|
|
174
|
+
|
|
172
175
|
### 输出结构
|
|
173
176
|
|
|
174
177
|
```
|
|
@@ -235,6 +238,15 @@ logs/qc/
|
|
|
235
238
|
}
|
|
236
239
|
```
|
|
237
240
|
|
|
241
|
+
## 两类仪表盘
|
|
242
|
+
|
|
243
|
+
系统中有两个独立的仪表盘:
|
|
244
|
+
|
|
245
|
+
- **开发者仪表盘** —— `dashboard_render` 工具,在工作区内基于 `output/results/`、`logs/evolution/`、`output/qc/` 生成。用于你自己审计、以及开发者用户在 BUILD/DISTILL 阶段的日常监控。
|
|
246
|
+
- **终端用户仪表盘** —— release 包内自带的 `render_dashboard.py` 脚本(由 `release` 工具产出)。面向非开发者收件人,从一次 `run.py` 调用的结果渲染,与工作区无关。
|
|
247
|
+
|
|
248
|
+
发布 release 后,把终端用户引导到 release 包内的仪表盘,而不是工作区的那个。工作区仪表盘是你自己的开发者视图。
|
|
249
|
+
|
|
238
250
|
## 开发者用户参与
|
|
239
251
|
|
|
240
252
|
质量监控不应该让开发者用户去读 JSON 文件。通过仪表盘技能生成可视化报告,开发者用户只需要关注:
|
|
@@ -135,6 +135,14 @@ The coding agent's skill-based results are the ground truth. For each document i
|
|
|
135
135
|
|
|
136
136
|
Each iteration of a workflow is a new version file: `workflow_v1.py`, `workflow_v2.py`, etc. Track which version is active in `config.json`. See `version-control` skill for the full methodology.
|
|
137
137
|
|
|
138
|
+
## Releasing Workflows
|
|
139
|
+
|
|
140
|
+
Once workflows hit their accuracy thresholds, they can be packaged for end users via the `release` tool. Each release is a self-contained directory under `output/releases/<slug>/` with the pinned workflows, a Python runner, a confidence scorer, an HTML dashboard generator, and a `serve.sh` helper. The bundle has no kc-beta dependency — anyone with Python and a worker LLM API key can run `python run.py <doc>` and produce verification results.
|
|
141
|
+
|
|
142
|
+
What to include is your call: all rules in the catalog, or a curated subset via the `include` parameter. Optionally bundle 1–3 representative samples as `fixtures/` so the recipient can dry-run without their own data.
|
|
143
|
+
|
|
144
|
+
The `release` tool snapshots the workspace first (git tag `snap/release-<slug>`), so the bundle is regenerable from git even if `output/releases/` is later cleaned. Decide when to release — there's no automation, no forced cadence. Typical triggers: workflows reach the `SKILL_ACCURACY` / `WORKFLOW_ACCURACY` thresholds, a stakeholder needs a hand-off, a production cron should run pinned versions instead of latest. Discuss with the developer user.
|
|
145
|
+
|
|
138
146
|
## Cost Tracking
|
|
139
147
|
|
|
140
148
|
Track the cost of each workflow run:
|