kc-beta 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/kc-beta.js +16 -0
- package/package.json +32 -0
- package/src/agent/confidence-scorer.js +120 -0
- package/src/agent/context.js +124 -0
- package/src/agent/corner-case-registry.js +119 -0
- package/src/agent/engine.js +224 -0
- package/src/agent/events.js +27 -0
- package/src/agent/history.js +101 -0
- package/src/agent/llm-client.js +131 -0
- package/src/agent/pipelines/base.js +14 -0
- package/src/agent/pipelines/distillation.js +113 -0
- package/src/agent/pipelines/extraction.js +92 -0
- package/src/agent/pipelines/index.js +23 -0
- package/src/agent/pipelines/initializer.js +163 -0
- package/src/agent/pipelines/production-qc.js +99 -0
- package/src/agent/pipelines/skill-authoring.js +83 -0
- package/src/agent/pipelines/skill-testing.js +111 -0
- package/src/agent/tools/agent-tool.js +100 -0
- package/src/agent/tools/base.js +35 -0
- package/src/agent/tools/dashboard-render.js +146 -0
- package/src/agent/tools/document-parse.js +184 -0
- package/src/agent/tools/document-search.js +111 -0
- package/src/agent/tools/evolution-cycle.js +150 -0
- package/src/agent/tools/qc-sample.js +94 -0
- package/src/agent/tools/registry.js +55 -0
- package/src/agent/tools/rule-catalog.js +113 -0
- package/src/agent/tools/sandbox-exec.js +106 -0
- package/src/agent/tools/tier-downgrade.js +114 -0
- package/src/agent/tools/worker-llm-call.js +109 -0
- package/src/agent/tools/workflow-run.js +138 -0
- package/src/agent/tools/workspace-file.js +122 -0
- package/src/agent/version-manager.js +130 -0
- package/src/agent/workspace.js +82 -0
- package/src/cli/components.js +164 -0
- package/src/cli/index.js +329 -0
- package/src/cli/init.js +80 -0
- package/src/cli/onboard.js +182 -0
- package/src/cli/terminal.js +143 -0
- package/src/config.js +93 -0
- package/template/.env.template +31 -0
- package/template/CLAUDE.md +137 -0
- package/template/Input/.gitkeep +0 -0
- package/template/Output/.gitkeep +0 -0
- package/template/Rules/.gitkeep +0 -0
- package/template/Samples/.gitkeep +0 -0
- package/template/skills/en/meta/compliance-judgment/SKILL.md +114 -0
- package/template/skills/en/meta/compliance-judgment/references/output-format.md +151 -0
- package/template/skills/en/meta/confidence-system/SKILL.md +117 -0
- package/template/skills/en/meta/corner-case-management/SKILL.md +111 -0
- package/template/skills/en/meta/cross-document-verification/SKILL.md +131 -0
- package/template/skills/en/meta/cross-document-verification/references/contradiction-taxonomy.md +73 -0
- package/template/skills/en/meta/data-sensibility/SKILL.md +115 -0
- package/template/skills/en/meta/document-parsing/SKILL.md +108 -0
- package/template/skills/en/meta/document-parsing/references/parser-catalog.md +40 -0
- package/template/skills/en/meta/entity-extraction/SKILL.md +129 -0
- package/template/skills/en/meta/tree-processing/SKILL.md +103 -0
- package/template/skills/en/meta-meta/bootstrap-workspace/SKILL.md +70 -0
- package/template/skills/en/meta-meta/dashboard-reporting/SKILL.md +106 -0
- package/template/skills/en/meta-meta/dashboard-reporting/scripts/generate_dashboard.py +178 -0
- package/template/skills/en/meta-meta/evolution-loop/SKILL.md +210 -0
- package/template/skills/en/meta-meta/evolution-loop/references/convergence-guide.md +62 -0
- package/template/skills/en/meta-meta/quality-control/SKILL.md +138 -0
- package/template/skills/en/meta-meta/quality-control/references/qa-layers.md +92 -0
- package/template/skills/en/meta-meta/quality-control/references/sampling-strategies.md +76 -0
- package/template/skills/en/meta-meta/rule-extraction/SKILL.md +100 -0
- package/template/skills/en/meta-meta/rule-extraction/references/chunking-strategies.md +80 -0
- package/template/skills/en/meta-meta/rule-graph/SKILL.md +118 -0
- package/template/skills/en/meta-meta/skill-authoring/SKILL.md +108 -0
- package/template/skills/en/meta-meta/skill-authoring/references/skill-format-spec.md +78 -0
- package/template/skills/en/meta-meta/skill-to-workflow/SKILL.md +150 -0
- package/template/skills/en/meta-meta/skill-to-workflow/references/worker-llm-catalog.md +50 -0
- package/template/skills/en/meta-meta/task-decomposition/SKILL.md +129 -0
- package/template/skills/en/meta-meta/task-decomposition/references/decision-matrix.md +81 -0
- package/template/skills/en/meta-meta/version-control/SKILL.md +152 -0
- package/template/skills/en/meta-meta/version-control/references/trace-id-spec.md +79 -0
- package/template/skills/en/skill-creator/LICENSE.txt +202 -0
- package/template/skills/en/skill-creator/SKILL.md +479 -0
- package/template/skills/en/skill-creator/agents/analyzer.md +274 -0
- package/template/skills/en/skill-creator/agents/comparator.md +202 -0
- package/template/skills/en/skill-creator/agents/grader.md +223 -0
- package/template/skills/en/skill-creator/assets/eval_review.html +146 -0
- package/template/skills/en/skill-creator/eval-viewer/generate_review.py +471 -0
- package/template/skills/en/skill-creator/eval-viewer/viewer.html +1325 -0
- package/template/skills/en/skill-creator/references/schemas.md +430 -0
- package/template/skills/en/skill-creator/scripts/__init__.py +0 -0
- package/template/skills/en/skill-creator/scripts/aggregate_benchmark.py +401 -0
- package/template/skills/en/skill-creator/scripts/generate_report.py +326 -0
- package/template/skills/en/skill-creator/scripts/improve_description.py +248 -0
- package/template/skills/en/skill-creator/scripts/package_skill.py +136 -0
- package/template/skills/en/skill-creator/scripts/quick_validate.py +103 -0
- package/template/skills/en/skill-creator/scripts/run_eval.py +310 -0
- package/template/skills/en/skill-creator/scripts/run_loop.py +332 -0
- package/template/skills/en/skill-creator/scripts/utils.py +47 -0
- package/template/skills/zh/meta/compliance-judgment/SKILL.md +303 -0
- package/template/skills/zh/meta/compliance-judgment/references/output-format.md +151 -0
- package/template/skills/zh/meta/confidence-system/SKILL.md +228 -0
- package/template/skills/zh/meta/corner-case-management/SKILL.md +235 -0
- package/template/skills/zh/meta/cross-document-verification/SKILL.md +241 -0
- package/template/skills/zh/meta/cross-document-verification/references/contradiction-taxonomy.md +73 -0
- package/template/skills/zh/meta/data-sensibility/SKILL.md +235 -0
- package/template/skills/zh/meta/document-parsing/SKILL.md +168 -0
- package/template/skills/zh/meta/document-parsing/references/parser-catalog.md +40 -0
- package/template/skills/zh/meta/entity-extraction/SKILL.md +276 -0
- package/template/skills/zh/meta/tree-processing/SKILL.md +233 -0
- package/template/skills/zh/meta-meta/bootstrap-workspace/SKILL.md +147 -0
- package/template/skills/zh/meta-meta/dashboard-reporting/SKILL.md +281 -0
- package/template/skills/zh/meta-meta/dashboard-reporting/scripts/generate_dashboard.py +178 -0
- package/template/skills/zh/meta-meta/evolution-loop/SKILL.md +302 -0
- package/template/skills/zh/meta-meta/evolution-loop/references/convergence-guide.md +62 -0
- package/template/skills/zh/meta-meta/quality-control/SKILL.md +269 -0
- package/template/skills/zh/meta-meta/quality-control/references/qa-layers.md +92 -0
- package/template/skills/zh/meta-meta/quality-control/references/sampling-strategies.md +76 -0
- package/template/skills/zh/meta-meta/rule-extraction/SKILL.md +208 -0
- package/template/skills/zh/meta-meta/rule-extraction/references/chunking-strategies.md +80 -0
- package/template/skills/zh/meta-meta/rule-graph/SKILL.md +203 -0
- package/template/skills/zh/meta-meta/skill-authoring/SKILL.md +235 -0
- package/template/skills/zh/meta-meta/skill-authoring/references/skill-format-spec.md +78 -0
- package/template/skills/zh/meta-meta/skill-to-workflow/SKILL.md +275 -0
- package/template/skills/zh/meta-meta/skill-to-workflow/references/worker-llm-catalog.md +50 -0
- package/template/skills/zh/meta-meta/task-decomposition/SKILL.md +224 -0
- package/template/skills/zh/meta-meta/task-decomposition/references/decision-matrix.md +81 -0
- package/template/skills/zh/meta-meta/version-control/SKILL.md +284 -0
- package/template/skills/zh/meta-meta/version-control/references/trace-id-spec.md +79 -0
- package/template/skills/zh/skill-creator/LICENSE.txt +202 -0
- package/template/skills/zh/skill-creator/SKILL.md +479 -0
- package/template/skills/zh/skill-creator/agents/analyzer.md +274 -0
- package/template/skills/zh/skill-creator/agents/comparator.md +202 -0
- package/template/skills/zh/skill-creator/agents/grader.md +223 -0
- package/template/skills/zh/skill-creator/assets/eval_review.html +146 -0
- package/template/skills/zh/skill-creator/eval-viewer/generate_review.py +471 -0
- package/template/skills/zh/skill-creator/eval-viewer/viewer.html +1325 -0
- package/template/skills/zh/skill-creator/references/schemas.md +430 -0
- package/template/skills/zh/skill-creator/scripts/__init__.py +0 -0
- package/template/skills/zh/skill-creator/scripts/aggregate_benchmark.py +401 -0
- package/template/skills/zh/skill-creator/scripts/generate_report.py +326 -0
- package/template/skills/zh/skill-creator/scripts/improve_description.py +248 -0
- package/template/skills/zh/skill-creator/scripts/package_skill.py +136 -0
- package/template/skills/zh/skill-creator/scripts/quick_validate.py +103 -0
- package/template/skills/zh/skill-creator/scripts/run_eval.py +310 -0
- package/template/skills/zh/skill-creator/scripts/run_loop.py +332 -0
- package/template/skills/zh/skill-creator/scripts/utils.py +47 -0
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
import fs from "node:fs";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
import { BaseTool, ToolResult } from "./base.js";
|
|
4
|
+
|
|
5
|
+
const REQUIRED_FIELDS = new Set(["id", "source_ref", "description"]);
const RECOMMENDED_FIELDS = new Set(["falsifiability_statement", "test_case_stub", "applicable_sections"]);

/**
 * CRUD on the rule registry with schema enforcement.
 *
 * Rules are stored as a JSON array in `<workspace.cwd>/rules/catalog.json`.
 * Required fields (id, source_ref, description) are enforced on create AND on
 * the merged result of an update. Every failure is reported as an error
 * ToolResult rather than thrown.
 */
export class RuleCatalogTool extends BaseTool {
  /**
   * @param {{cwd: string}} workspace - object whose `cwd` is the workspace root directory
   */
  constructor(workspace) {
    super();
    this._workspace = workspace;
    this._catalogPath = path.join(workspace.cwd, "rules", "catalog.json");
  }

  get name() { return "rule_catalog"; }

  get description() {
    return (
      "CRUD on the rule registry. Operations: create, read, update, delete, list. " +
      "Enforces required fields (id, source_ref, description) on create/update. " +
      "Persists to rules/catalog.json."
    );
  }

  get inputSchema() {
    return {
      type: "object",
      properties: {
        operation: { type: "string", enum: ["create", "read", "update", "delete", "list"], description: "Operation to perform" },
        rule_id: { type: "string", description: "Rule ID (for read/update/delete)" },
        data: { type: "object", description: "Rule data (for create/update). Must include: id, source_ref, description" },
      },
      required: ["operation"],
    };
  }

  /**
   * Dispatch one catalog operation.
   *
   * @param {{operation: string, rule_id?: string, data?: object}} input
   * @returns {Promise<ToolResult>} result message, flagged as an error on any failure
   */
  async execute(input) {
    const op = input?.operation || "";
    const ruleId = input?.rule_id || "";
    const rawData = input?.data;
    const data = this._isRecord(rawData) ? rawData : {};
    // For read/update/delete the id may come from rule_id or from data.id.
    const targetId = ruleId || data.id || "";

    const writesData = op === "create" || op === "update";
    if (writesData && rawData !== undefined && rawData !== null && !this._isRecord(rawData)) {
      return new ToolResult("data must be a JSON object", true);
    }

    try {
      switch (op) {
        case "list": return this._list();
        case "read": return this._read(targetId);
        case "create": return this._create(data);
        case "update": return this._update(targetId, data);
        case "delete": return this._delete(targetId);
        default: return new ToolResult(`Unknown operation: ${op}`, true);
      }
    } catch (err) {
      // Corrupt catalog or filesystem failure: report it instead of throwing.
      return new ToolResult(`rule_catalog ${op} failed: ${err.message}`, true);
    }
  }

  /** True for non-null, non-array objects. */
  _isRecord(value) {
    return value !== null && typeof value === "object" && !Array.isArray(value);
  }

  /**
   * List the required fields that are absent, null, or blank strings.
   *
   * @param {object} rule
   * @returns {string[]}
   */
  _missingRequired(rule) {
    return [...REQUIRED_FIELDS].filter((field) => {
      const value = Object.hasOwn(rule, field) ? rule[field] : undefined;
      if (value === undefined || value === null) return true;
      return typeof value === "string" && value.trim() === "";
    });
  }

  /**
   * Read the catalog from disk.
   *
   * @returns {object[]} the stored rules; empty when the file is missing or blank
   * @throws {Error} when the file exists but is not a JSON array. Treating that
   *   as "empty" would let the next write silently destroy the existing content.
   */
  _load() {
    let raw;
    try {
      raw = fs.readFileSync(this._catalogPath, "utf-8");
    } catch (err) {
      if (err.code === "ENOENT") return [];
      throw err;
    }
    if (raw.trim() === "") return [];

    let parsed;
    try {
      parsed = JSON.parse(raw);
    } catch (err) {
      throw new Error(`rules/catalog.json is not valid JSON (${err.message}); fix or remove it first`, { cause: err });
    }
    if (!Array.isArray(parsed)) {
      throw new Error("rules/catalog.json must contain a JSON array of rules; fix or remove it first");
    }
    return parsed;
  }

  /**
   * Persist the catalog. Writes a sibling temp file and renames it over the
   * target so an interrupted write does not leave a truncated catalog.
   *
   * @param {object[]} rules
   */
  _save(rules) {
    fs.mkdirSync(path.dirname(this._catalogPath), { recursive: true });
    const tmpPath = `${this._catalogPath}.${process.pid}.tmp`;
    fs.writeFileSync(tmpPath, JSON.stringify(rules, null, 2), "utf-8");
    fs.renameSync(tmpPath, this._catalogPath);
  }

  /** Summarize every rule as "- id: first 80 chars of description". */
  _list() {
    const rules = this._load();
    if (rules.length === 0) return new ToolResult("Catalog is empty. Use create to add rules.");
    const summary = rules.map(
      (r) => `- ${r?.id || "?"}: ${String(r?.description || "(no description)").slice(0, 80)}`,
    );
    return new ToolResult(`${rules.length} rule(s):\n${summary.join("\n")}`);
  }

  /** Return one rule as pretty-printed JSON. */
  _read(ruleId) {
    if (!ruleId) return new ToolResult("rule_id required for read", true);
    const rule = this._load().find((r) => r?.id === ruleId);
    if (!rule) return new ToolResult(`Rule not found: ${ruleId}`, true);
    return new ToolResult(JSON.stringify(rule, null, 2));
  }

  /** Append a new rule; rejects missing required fields and duplicate ids. */
  _create(data) {
    const missing = this._missingRequired(data);
    if (missing.length > 0) return new ToolResult(`Missing required fields: ${missing.join(", ")}`, true);
    // Lookups compare against the string rule_id, so a non-string id could never be found again.
    if (typeof data.id !== "string") return new ToolResult("Rule id must be a string", true);

    const rules = this._load();
    if (rules.some((r) => r?.id === data.id)) return new ToolResult(`Rule already exists: ${data.id}. Use update.`, true);

    const warnings = [...RECOMMENDED_FIELDS].filter((f) => !Object.hasOwn(data, f));
    rules.push(data);
    this._save(rules);

    let msg = `Created rule: ${data.id}`;
    if (warnings.length > 0) msg += `\nMissing recommended fields: ${warnings.join(", ")}`;
    return new ToolResult(msg);
  }

  /**
   * Merge `data` into an existing rule. The id is pinned to `ruleId`, and the
   * merged rule must still satisfy the required-field contract.
   */
  _update(ruleId, data) {
    if (!ruleId) return new ToolResult("rule_id required for update", true);
    const rules = this._load();
    const idx = rules.findIndex((r) => r?.id === ruleId);
    if (idx < 0) return new ToolResult(`Rule not found: ${ruleId}`, true);

    // Spread defines own data properties, so a "__proto__" key in the input
    // cannot swap the stored rule's prototype the way Object.assign would.
    const merged = { ...rules[idx], ...data, id: ruleId };
    const missing = this._missingRequired(merged);
    if (missing.length > 0) {
      return new ToolResult(`Update would leave required fields empty: ${missing.join(", ")}`, true);
    }

    rules[idx] = merged;
    this._save(rules);
    return new ToolResult(`Updated rule: ${ruleId}`);
  }

  /** Remove a rule by id. */
  _delete(ruleId) {
    if (!ruleId) return new ToolResult("rule_id required for delete", true);
    const rules = this._load();
    const remaining = rules.filter((r) => r?.id !== ruleId);
    if (remaining.length === rules.length) return new ToolResult(`Rule not found: ${ruleId}`, true);
    this._save(remaining);
    return new ToolResult(`Deleted rule: ${ruleId}`);
  }
}
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
import { spawn } from "node:child_process";
|
|
2
|
+
import { BaseTool, ToolResult } from "./base.js";
|
|
3
|
+
|
|
4
|
+
const MAX_OUTPUT = 10_000;

/**
 * Execute shell commands in the workspace directory.
 *
 * The command is passed to `sh -c`, so pipes, redirects and `&&` all work.
 * Output (stdout + stderr interleaved in arrival order) is capped at 10K chars.
 * NOTE(review): nothing in this class isolates the command beyond setting its
 * working directory; confirm whether sandboxing is applied elsewhere.
 */
export class SandboxExecTool extends BaseTool {
  /**
   * @param {import('../workspace.js').Workspace} workspace
   * @param {number} [timeout=30] - seconds before the command is aborted
   */
  constructor(workspace, timeout = 30) {
    super();
    this._workspace = workspace;
    this._timeout = timeout;
  }

  get name() { return "sandbox_exec"; }

  get description() {
    return (
      "Execute a shell command in the workspace directory. " +
      "Use for running scripts, installing packages, listing files, etc. " +
      "Pipes, redirects, and chained commands (&&) are supported."
    );
  }

  get inputSchema() {
    return {
      type: "object",
      properties: {
        command: {
          type: "string",
          description: "The shell command to execute (e.g. 'python script.py', 'ls -la')",
        },
      },
      required: ["command"],
    };
  }

  /**
   * Run the command and report its output.
   *
   * @param {{command: string}} input
   * @returns {Promise<ToolResult>} combined output; flagged as an error on a
   *   non-zero exit code, a timeout, or a spawn failure
   */
  async execute(input) {
    // A non-string command (number, object, ...) is treated as missing instead
    // of crashing on `.trim()`.
    const command = typeof input?.command === "string" ? input.command : "";
    if (!command.trim()) {
      return new ToolResult("No command provided", true);
    }

    try {
      const { output, code } = await this._run(command);
      let result = output.length > MAX_OUTPUT
        ? `${output.slice(0, MAX_OUTPUT)}\n[truncated]`
        : output;
      if (code !== 0) {
        result += `\n[exit code: ${code}]`;
      }
      return new ToolResult(result, code !== 0);
    } catch (err) {
      if (err.message === "timeout") {
        return new ToolResult(`Command timed out after ${this._timeout}s`, true);
      }
      return new ToolResult(`Execution error: ${err.message}`, true);
    }
  }

  /**
   * Spawn `sh -c <command>` in the workspace directory and collect its output.
   *
   * @param {string} command
   * @returns {Promise<{output: string, code: number}>} rejects with
   *   Error("timeout") when the time limit is reached
   */
  _run(command) {
    return new Promise((resolve, reject) => {
      const controller = new AbortController();
      const proc = spawn("sh", ["-c", command], {
        cwd: this._workspace.cwd,
        stdio: ["ignore", "pipe", "pipe"],
        signal: controller.signal,
      });

      // Decode at the stream level so a multi-byte UTF-8 character split across
      // two chunks is not turned into replacement characters.
      proc.stdout.setEncoding("utf8");
      proc.stderr.setEncoding("utf8");

      let output = "";
      const collect = (chunk) => {
        // Keep draining the pipes (so the child never blocks on a full pipe)
        // but stop buffering once the cap is exceeded; execute() only ever
        // reports the first MAX_OUTPUT characters.
        if (output.length <= MAX_OUTPUT) output += chunk;
      };
      proc.stdout.on("data", collect);
      proc.stderr.on("data", collect);

      const timer = setTimeout(() => {
        controller.abort();
        // Grandchildren may still hold the pipe write ends; release our read
        // ends so they do not linger after the timeout.
        proc.stdout.destroy();
        proc.stderr.destroy();
        reject(new Error("timeout"));
      }, this._timeout * 1000);

      proc.on("close", (code) => {
        clearTimeout(timer);
        // code is null when the process was killed by a signal; report failure.
        resolve({ output, code: code ?? 1 });
      });

      proc.on("error", (err) => {
        clearTimeout(timer);
        reject(err.name === "AbortError" ? new Error("timeout") : err);
      });
    });
  }
}
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
import fs from "node:fs";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
import { BaseTool, ToolResult } from "./base.js";
|
|
4
|
+
|
|
5
|
+
/**
 * Test a workflow step at a lower worker LLM tier.
 *
 * Runs the same prompt over the same test inputs at the current (baseline)
 * tier and at the target tier, scores both against the expected outputs, and
 * recommends "downgrade" when the target tier meets the accuracy threshold
 * without losing more than the allowed accuracy drop.
 * Used during distillation to find the minimum viable tier.
 */
export class TierDowngradeTool extends BaseTool {
  /**
   * @param {{cwd: string}} workspace - workspace root; its `.env` may define WORKFLOW_ACCURACY
   * @param {{execute: Function}} workerLlm - tool used to call the worker LLM;
   *   its result must expose `isError` and `content`
   * @param {object} [options]
   * @param {number} [options.maxAccuracyDrop=0.05] - largest tolerated (baseline - target) accuracy gap
   * @param {number} [options.defaultThreshold=0.9] - threshold used when `.env` has no valid WORKFLOW_ACCURACY
   */
  constructor(workspace, workerLlm, { maxAccuracyDrop = 0.05, defaultThreshold = 0.9 } = {}) {
    super();
    this._workspace = workspace;
    this._worker = workerLlm;
    this._maxAccuracyDrop = maxAccuracyDrop;
    this._defaultThreshold = defaultThreshold;
  }

  get name() { return "tier_downgrade"; }

  get description() {
    return (
      "Test a workflow step at a lower worker LLM tier. Compares accuracy " +
      "against baseline. Use during distillation to find the cheapest tier " +
      "that meets accuracy threshold."
    );
  }

  get inputSchema() {
    return {
      type: "object",
      properties: {
        rule_id: { type: "string", description: "Rule ID being tested" },
        prompt: { type: "string", description: "The extraction/judgment prompt to test" },
        test_inputs: { type: "array", items: { type: "string" }, description: "List of document text chunks to test" },
        expected_outputs: { type: "array", items: { type: "string" }, description: "Expected correct outputs" },
        current_tier: { type: "string", description: "Current tier (baseline)" },
        target_tier: { type: "string", description: "Lower tier to test" },
      },
      required: ["rule_id", "prompt", "test_inputs", "expected_outputs", "current_tier", "target_tier"],
    };
  }

  /**
   * Compare the two tiers and return a JSON report.
   *
   * @param {object} input - see inputSchema
   * @returns {Promise<ToolResult>} JSON report with both accuracies, their
   *   delta, the threshold and a "downgrade" / "keep_current" recommendation
   */
  async execute(input) {
    const ruleId = input?.rule_id || "";
    const prompt = input?.prompt || "";
    const testInputs = Array.isArray(input?.test_inputs) ? input.test_inputs : [];
    const expected = Array.isArray(input?.expected_outputs) ? input.expected_outputs : [];
    const currentTier = input?.current_tier || "tier1";
    const targetTier = input?.target_tier || "tier2";

    if (!testInputs.length || !expected.length || testInputs.length !== expected.length) {
      return new ToolResult("test_inputs and expected_outputs must be non-empty and same length", true);
    }

    // Stop before spending target-tier calls when the baseline run already failed.
    const currentResults = await this._runTier(currentTier, prompt, testInputs);
    if (!currentResults) return new ToolResult(`Failed to run at ${currentTier}`, true);
    const targetResults = await this._runTier(targetTier, prompt, testInputs);
    if (!targetResults) return new ToolResult(`Failed to run at ${targetTier}`, true);

    const currentAcc = this._accuracy(currentResults, expected);
    const targetAcc = this._accuracy(targetResults, expected);
    const delta = currentAcc - targetAcc;
    const threshold = this._readThreshold();

    // Accuracies are ratios k/n, so differences carry binary rounding error
    // (20/20 - 19/20 evaluates to 0.05000000000000004). Compare with a small
    // tolerance so a value exactly on the limit is accepted.
    const EPSILON = 1e-9;
    const meetsThreshold = targetAcc >= threshold - EPSILON;
    const withinDrop = delta <= this._maxAccuracyDrop + EPSILON;
    const recommend = meetsThreshold && withinDrop ? "downgrade" : "keep_current";

    const round3 = (x) => Math.round(x * 1000) / 1000;
    const report = {
      rule_id: ruleId, current_tier: currentTier, target_tier: targetTier,
      current_accuracy: round3(currentAcc),
      target_accuracy: round3(targetAcc),
      accuracy_delta: round3(delta),
      threshold, recommendation: recommend, test_count: testInputs.length,
    };
    return new ToolResult(JSON.stringify(report, null, 2));
  }

  /**
   * Read WORKFLOW_ACCURACY from the workspace `.env`.
   *
   * @returns {number} the last valid value in [0, 1], or the default threshold
   *   when the file or key is missing or the value is not a usable number
   */
  _readThreshold() {
    const envPath = path.join(this._workspace.cwd, ".env");
    let text;
    try {
      text = fs.readFileSync(envPath, "utf-8");
    } catch (err) {
      if (err.code === "ENOENT") return this._defaultThreshold;
      throw err;
    }

    let threshold = this._defaultThreshold;
    for (const rawLine of text.split(/\r?\n/)) {
      const line = rawLine.trim();
      if (!line.startsWith("WORKFLOW_ACCURACY=")) continue;
      const value = line.slice("WORKFLOW_ACCURACY=".length).trim().replace(/^["']|["']$/g, "");
      const parsed = Number.parseFloat(value);
      // parseFloat returns NaN instead of throwing; a NaN threshold would make
      // every comparison false and serialize as null in the report.
      if (Number.isFinite(parsed) && parsed >= 0 && parsed <= 1) threshold = parsed;
    }
    return threshold;
  }

  /**
   * Run the prompt against every input at one tier.
   *
   * @param {string} tier
   * @param {string} prompt
   * @param {string[]} inputs
   * @returns {Promise<Array|null>} one output per input, or null as soon as a
   *   worker call reports an error
   */
  async _runTier(tier, prompt, inputs) {
    const results = [];
    for (const text of inputs) {
      const fullPrompt = `${prompt}\n\nDocument text:\n${text}`;
      const result = await this._worker.execute({ tier, prompt: fullPrompt, max_tokens: 2048 });
      if (result.isError) return null;

      // The worker normally returns a JSON envelope with a `response` field;
      // anything that is not a JSON object is used verbatim.
      let answer = result.content;
      try {
        const parsed = JSON.parse(result.content);
        if (parsed !== null && typeof parsed === "object") answer = parsed.response ?? "";
      } catch {
        // Plain-text content: keep it as the answer.
      }
      results.push(answer);
    }
    return results;
  }

  /**
   * Fraction of cases whose output contains the expected text
   * (case-insensitive substring match). A blank expected value never counts
   * as a match.
   *
   * @param {Array} outputs
   * @param {Array} expected
   * @returns {number} value in [0, 1]; 0 when `expected` is empty
   */
  _accuracy(outputs, expected) {
    if (!expected.length) return 0;
    const toText = (value) => {
      if (typeof value === "string") return value;
      if (value === undefined || value === null) return "";
      return typeof value === "object" ? JSON.stringify(value) : String(value);
    };

    let matches = 0;
    for (let i = 0; i < expected.length; i++) {
      const want = toText(expected[i]).trim().toLowerCase();
      const got = toText(outputs[i]).toLowerCase();
      if (want !== "" && got !== "" && got.includes(want)) matches++;
    }
    return matches / expected.length;
  }
}
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
import fs from "node:fs";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
import { BaseTool, ToolResult } from "./base.js";
|
|
4
|
+
|
|
5
|
+
/**
 * Call a worker LLM at a specified tier for verification tasks.
 *
 * Reads the tier-to-model mapping (TIER1..TIER4, comma-separated model names)
 * from the workspace `.env` and posts a chat-completions request to the
 * configured API base URL, trying each model of the tier in order until one
 * succeeds.
 */
export class WorkerLLMCallTool extends BaseTool {
  /**
   * @param {{cwd: string}} workspace - workspace root containing the `.env`
   * @param {object} [options]
   * @param {string} [options.apiKey] - bearer token for the API
   * @param {string} [options.baseUrl] - API base URL; trailing slashes are stripped
   */
  constructor(workspace, { apiKey, baseUrl } = {}) {
    super();
    this._workspace = workspace;
    this._apiKey = apiKey || "";
    this._baseUrl = (baseUrl || "https://api.siliconflow.cn/v1").replace(/\/+$/, "");
    this._tierModels = {};
    this._loadTiers();
  }

  /**
   * Rebuild `this._tierModels` from the workspace `.env`.
   *
   * The map is rebuilt from scratch on every call so a tier removed from the
   * file (or a deleted file) does not leave stale models behind.
   */
  _loadTiers() {
    const tierKeys = ["TIER1", "TIER2", "TIER3", "TIER4"];
    const tiers = {};

    let text = "";
    try {
      text = fs.readFileSync(path.join(this._workspace.cwd, ".env"), "utf-8");
    } catch (err) {
      if (err.code !== "ENOENT") throw err;
    }

    for (const rawLine of text.split(/\r?\n/)) {
      const line = rawLine.trim();
      const eq = line.indexOf("=");
      if (eq < 0) continue;
      const key = line.slice(0, eq).trim();
      if (!tierKeys.includes(key)) continue;
      // Take everything after the FIRST "=" so model names containing "=" survive,
      // and drop one pair of surrounding quotes.
      const value = line.slice(eq + 1).trim().replace(/^(["'])(.*)\1$/, "$2");
      tiers[key.toLowerCase()] = value.split(",").map((m) => m.trim()).filter(Boolean);
    }

    this._tierModels = tiers;
  }

  get name() { return "worker_llm_call"; }

  get description() {
    return (
      "Call a worker LLM at a specified tier (tier1-tier4) for extraction, " +
      "judgment, or other verification tasks. Tier1 is most capable/expensive, " +
      "tier4 is cheapest. Returns response with model used and token counts."
    );
  }

  get inputSchema() {
    return {
      type: "object",
      properties: {
        tier: { type: "string", enum: ["tier1", "tier2", "tier3", "tier4"], description: "Worker LLM tier to use" },
        prompt: { type: "string", description: "The user/task prompt to send" },
        system_prompt: { type: "string", description: "Optional system prompt for context" },
        max_tokens: { type: "integer", description: "Maximum tokens in response (default 4096)" },
      },
      required: ["tier", "prompt"],
    };
  }

  /**
   * Send the prompt to the first model of the tier that answers successfully.
   *
   * @param {{tier?: string, prompt: string, system_prompt?: string, max_tokens?: number}} input
   * @returns {Promise<ToolResult>} JSON with response, model_used, tier,
   *   tokens_in and tokens_out; an error result when no model succeeds
   */
  async execute(input) {
    const tier = String(input?.tier || "tier2").toLowerCase();
    const prompt = input?.prompt || "";
    const systemPrompt = input?.system_prompt;
    const maxTokens = Number.isInteger(input?.max_tokens) && input.max_tokens > 0 ? input.max_tokens : 4096;

    if (!prompt) return new ToolResult("No prompt provided", true);
    if (!this._apiKey) return new ToolResult("Worker LLM API key not configured", true);

    // Re-read on every call so edits to .env take effect without a restart.
    this._loadTiers();
    // Own-property check: a tier such as "constructor" must not resolve to an
    // inherited Object.prototype member.
    const models = Object.hasOwn(this._tierModels, tier) ? this._tierModels[tier] : [];
    if (models.length === 0) {
      return new ToolResult(`No models configured for ${tier}. Check .env TIER1-TIER4 settings.`, true);
    }

    const messages = [];
    if (systemPrompt) messages.push({ role: "system", content: systemPrompt });
    messages.push({ role: "user", content: prompt });

    let lastError = "";
    for (const model of models) {
      try {
        const resp = await fetch(`${this._baseUrl}/chat/completions`, {
          method: "POST",
          headers: {
            "Authorization": `Bearer ${this._apiKey}`,
            "Content-Type": "application/json",
          },
          body: JSON.stringify({ model, messages, max_tokens: maxTokens }),
          signal: AbortSignal.timeout(120000),
        });

        if (!resp.ok) {
          // Keep a short excerpt of the error body; the status code alone
          // rarely explains a rejected request.
          let detail = "";
          try {
            detail = (await resp.text()).trim().slice(0, 200);
          } catch {
            // Body unreadable: the status code is reported on its own.
          }
          lastError = `${model}: HTTP ${resp.status}${detail ? ` - ${detail}` : ""}`;
          continue;
        }

        const data = await resp.json();
        const content = data?.choices?.[0]?.message?.content;
        if (typeof content !== "string") {
          lastError = `${model}: response contained no message content`;
          continue;
        }

        const usage = data.usage || {};
        const result = {
          response: content,
          model_used: model,
          tier,
          tokens_in: usage.prompt_tokens || 0,
          tokens_out: usage.completion_tokens || 0,
        };
        return new ToolResult(JSON.stringify(result, null, 2));
      } catch (e) {
        // Network failure, timeout, or invalid JSON: fall through to the next model.
        lastError = `${model}: ${e.message}`;
      }
    }

    return new ToolResult(`All models for ${tier} failed. Last error: ${lastError}`, true);
  }
}
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
import fs from "node:fs";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
import { spawn } from "node:child_process";
|
|
4
|
+
import { BaseTool, ToolResult } from "./base.js";
|
|
5
|
+
|
|
6
|
+
/**
 * Execute a distilled workflow script against a document.
 *
 * Locates the Python workflow script for a rule, runs it as a subprocess with
 * the document path as its only argument, parses the JSON object printed on
 * the last output line, attaches a confidence score and trace ID, and saves
 * the structured result to output/results/.
 */
export class WorkflowRunTool extends BaseTool {
  /**
   * @param {{cwd: string, resolvePath: Function}} workspace - workspace root and path resolver
   * @param {{generateTraceId: Function}} versionManager - supplies trace IDs
   * @param {{score: Function, getBand: Function}} confidenceScorer - scores extracted values
   * @param {object} [options]
   * @param {number} [options.timeout=120] - seconds before the workflow is aborted
   */
  constructor(workspace, versionManager, confidenceScorer, { timeout = 120 } = {}) {
    super();
    this._workspace = workspace;
    this._versionMgr = versionManager;
    this._confidence = confidenceScorer;
    this._timeout = timeout;
  }

  get name() { return "workflow_run"; }

  get description() {
    return (
      "Execute a distilled workflow against a document. Runs the workflow " +
      "script, attaches confidence scores and trace IDs automatically. " +
      "Results saved to output/results/."
    );
  }

  get inputSchema() {
    return {
      type: "object",
      properties: {
        rule_id: { type: "string", description: "Rule ID whose workflow to execute" },
        document_path: { type: "string", description: "Relative path to document in workspace" },
        workflow_version: { type: "integer", description: "Workflow version to run (default: latest)" },
      },
      required: ["rule_id", "document_path"],
    };
  }

  /**
   * Run the workflow for `rule_id` against `document_path`.
   *
   * @param {{rule_id: string, document_path: string, workflow_version?: number}} input
   * @returns {Promise<ToolResult>} the enriched result as JSON, or an error result
   */
  async execute(input) {
    const ruleId = String(input?.rule_id ?? "");
    const docPath = String(input?.document_path ?? "");
    const wfVersion = input?.workflow_version;

    if (!ruleId || !docPath) return new ToolResult("rule_id and document_path required", true);

    // rule_id becomes a directory name under workflows/ and part of the result
    // file name, so it must be a single path segment.
    const isSingleSegment = ruleId === path.basename(ruleId) && !ruleId.includes("\\") &&
      ruleId !== "." && ruleId !== "..";
    if (!isSingleSegment) return new ToolResult(`Invalid rule_id: ${ruleId}`, true);

    // Find workflow script: prefer workflows/<rule_id>/, else the shared workflows/ directory.
    const wfDir = path.join(this._workspace.cwd, "workflows", ruleId);
    const wfScript = this._findWorkflow(
      fs.existsSync(wfDir) ? wfDir : path.join(this._workspace.cwd, "workflows"),
      ruleId, wfVersion,
    );
    if (!wfScript) return new ToolResult(`No workflow found for ${ruleId} in workflows/`, true);

    let docResolved;
    try { docResolved = this._workspace.resolvePath(docPath); }
    catch (e) { return new ToolResult(e.message, true); }
    if (!fs.existsSync(docResolved)) return new ToolResult(`Document not found: ${docPath}`, true);

    // Run the workflow with an argument vector instead of a shell string, so
    // spaces or shell metacharacters in either path cannot alter the command.
    let run;
    try {
      run = await this._capture("python", [wfScript, docResolved]);
    } catch (e) {
      return new ToolResult(e.message, true);
    }

    let resultData = this._parseResult(run.stdout, run.output);
    if (!resultData) {
      if (run.code !== 0) {
        // A crashed workflow with no JSON result is a failure, not a result to score.
        return new ToolResult(`Workflow exited with code ${run.code}:\n${run.output.slice(0, 5000)}`, true);
      }
      resultData = { raw_output: run.output.slice(0, 5000) };
    }

    // Attach confidence score. `??` keeps legitimate falsy values such as 0 or false.
    const extractedValue = String(resultData.extracted_value ?? resultData.value ?? "");
    const method = resultData.extraction_method || "llm";
    const confidence = this._confidence.score({
      ruleId, extractedValue, extractionMethod: method, documentName: path.basename(docResolved),
    });
    resultData.confidence = confidence;
    resultData.confidence_band = this._confidence.getBand(confidence);

    // Attach trace ID
    resultData.trace_id = this._versionMgr.generateTraceId(ruleId, "workflow_result");
    resultData.rule_id = ruleId;
    resultData.document = docPath;

    // Save result
    const resultsDir = path.join(this._workspace.cwd, "output", "results");
    fs.mkdirSync(resultsDir, { recursive: true });
    const resultFile = path.join(resultsDir, `${ruleId}_${path.parse(docResolved).name}.json`);
    fs.writeFileSync(resultFile, JSON.stringify(resultData, null, 2), "utf-8");

    return new ToolResult(JSON.stringify(resultData, null, 2));
  }

  /**
   * Extract the JSON object printed on the last line of the workflow output.
   * Tries stdout first (so trailing stderr noise cannot hide the result), then
   * the combined stream.
   *
   * @param {string} stdout
   * @param {string} output - stdout and stderr combined
   * @returns {object|null} the parsed object, or null when no JSON object is found
   */
  _parseResult(stdout, output) {
    for (const text of [stdout, output]) {
      const lines = text.trim().split("\n");
      try {
        const parsed = JSON.parse(lines[lines.length - 1]);
        // Only a plain object can carry the confidence / trace fields.
        if (parsed !== null && typeof parsed === "object" && !Array.isArray(parsed)) return parsed;
      } catch {
        // Not JSON on this stream's last line; try the next candidate.
      }
    }
    return null;
  }

  /**
   * Pick the workflow script to run from `wfDir`.
   *
   * Order of preference: the requested `workflow_v<version>.py`; the highest
   * numbered `workflow_v<N>.py`; any `.py` file with "workflow" in its name;
   * any `.py` file. A requested version that does not exist falls back to the
   * rest of this list.
   * NOTE(review): `ruleId` is not used for matching, so when `wfDir` is the
   * shared workflows/ directory any script there may be selected — confirm
   * this is intended.
   *
   * @param {string} wfDir
   * @param {string} ruleId
   * @param {number} [version]
   * @returns {string|null} absolute script path, or null when none is found
   */
  _findWorkflow(wfDir, ruleId, version) {
    if (!fs.existsSync(wfDir) || !fs.statSync(wfDir).isDirectory()) return null;

    // Accept only plain digits so the version cannot inject path segments.
    if (version !== undefined && version !== null && /^\d+$/.test(String(version))) {
      const target = path.join(wfDir, `workflow_v${version}.py`);
      if (fs.existsSync(target)) return target;
    }

    const entries = fs.readdirSync(wfDir).sort();

    // Compare version numbers numerically: a plain string sort ranks
    // "workflow_v10.py" before "workflow_v2.py".
    let latest = null;
    for (const file of entries) {
      const match = /^workflow_v(\d+)\.py$/.exec(file);
      if (!match) continue;
      const number = Number(match[1]);
      if (latest === null || number >= latest.number) latest = { file, number };
    }
    if (latest) return path.join(wfDir, latest.file);

    const any = entries.find((f) => f.endsWith(".py") && f.toLowerCase().includes("workflow"));
    if (any) return path.join(wfDir, any);

    const pyFile = entries.find((f) => f.endsWith(".py"));
    return pyFile ? path.join(wfDir, pyFile) : null;
  }

  /**
   * Run an executable directly (no shell) in the workspace directory.
   *
   * @param {string} file - executable name or path
   * @param {string[]} args - argument vector
   * @returns {Promise<{stdout: string, output: string, code: number}>} stdout
   *   alone, stdout+stderr combined in arrival order, and the exit code
   *   (1 when the process was killed by a signal); rejects on timeout or spawn failure
   */
  _capture(file, args) {
    return new Promise((resolve, reject) => {
      const controller = new AbortController();
      const proc = spawn(file, args, {
        cwd: this._workspace.cwd,
        stdio: ["ignore", "pipe", "pipe"],
        signal: controller.signal,
      });
      // Stream-level decoding keeps multi-byte characters intact across chunks.
      proc.stdout.setEncoding("utf8");
      proc.stderr.setEncoding("utf8");

      let stdout = "";
      let output = "";
      proc.stdout.on("data", (chunk) => { stdout += chunk; output += chunk; });
      proc.stderr.on("data", (chunk) => { output += chunk; });

      const timer = setTimeout(() => {
        controller.abort();
        reject(new Error(`Workflow timed out after ${this._timeout}s`));
      }, this._timeout * 1000);
      proc.on("close", (code) => { clearTimeout(timer); resolve({ stdout, output, code: code ?? 1 }); });
      proc.on("error", (err) => { clearTimeout(timer); reject(err); });
    });
  }

  /**
   * Run a shell command string via `sh -c` and resolve with its combined output.
   * Retained for callers of the previous helper; `execute` no longer uses it
   * because string-built commands are unsafe with arbitrary paths.
   *
   * @param {string} command
   * @returns {Promise<string>}
   */
  _exec(command) {
    return this._capture("sh", ["-c", command]).then((run) => run.output);
  }
}
|