@kweaver-ai/kweaver-sdk 0.8.1 → 0.8.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +40 -52
- package/README.zh.md +41 -46
- package/dist/agent-providers/index.d.ts +7 -0
- package/dist/agent-providers/index.js +5 -0
- package/dist/agent-providers/prompt-template.d.ts +62 -0
- package/dist/agent-providers/prompt-template.js +105 -0
- package/dist/agent-providers/prompts/rubric-judge-v1.prompt.md +51 -0
- package/dist/agent-providers/prompts/within-trace-synthesizer-v1.prompt.md +60 -0
- package/dist/agent-providers/providers/claude-code-subprocess.d.ts +74 -0
- package/dist/agent-providers/providers/claude-code-subprocess.js +259 -0
- package/dist/agent-providers/providers/stub.d.ts +47 -0
- package/dist/agent-providers/providers/stub.js +77 -0
- package/dist/agent-providers/registry.d.ts +45 -0
- package/dist/agent-providers/registry.js +77 -0
- package/dist/agent-providers/types.d.ts +91 -0
- package/dist/agent-providers/types.js +25 -0
- package/dist/api/agent-chat.js +8 -6
- package/dist/api/context-loader.d.ts +1 -0
- package/dist/api/resources.d.ts +94 -0
- package/dist/api/resources.js +166 -0
- package/dist/api/semantic-search.d.ts +5 -0
- package/dist/api/semantic-search.js +5 -0
- package/dist/api/skills.d.ts +75 -2
- package/dist/api/skills.js +108 -12
- package/dist/api/trace.d.ts +5 -0
- package/dist/api/trace.js +4 -0
- package/dist/cli.js +109 -15
- package/dist/client.d.ts +3 -3
- package/dist/client.js +5 -5
- package/dist/commands/agent/mode.d.ts +6 -0
- package/dist/commands/agent/mode.js +75 -0
- package/dist/commands/agent-members.js +27 -11
- package/dist/commands/agent.js +469 -286
- package/dist/commands/auth.js +184 -71
- package/dist/commands/bkn-metric.js +37 -16
- package/dist/commands/bkn-ops.js +164 -86
- package/dist/commands/bkn-query.js +99 -31
- package/dist/commands/bkn-schema.d.ts +3 -3
- package/dist/commands/bkn-schema.js +127 -86
- package/dist/commands/bkn.js +153 -114
- package/dist/commands/call.js +23 -13
- package/dist/commands/config.js +22 -12
- package/dist/commands/context-loader.js +625 -49
- package/dist/commands/dataflow.js +14 -6
- package/dist/commands/ds.js +52 -30
- package/dist/commands/explore.js +18 -15
- package/dist/commands/model.js +53 -42
- package/dist/commands/resource.d.ts +1 -0
- package/dist/commands/{dataview.js → resource.js} +62 -84
- package/dist/commands/skill.d.ts +21 -1
- package/dist/commands/skill.js +567 -43
- package/dist/commands/token.js +11 -0
- package/dist/commands/tool.js +46 -29
- package/dist/commands/toolbox.js +31 -15
- package/dist/commands/trace.d.ts +26 -1
- package/dist/commands/trace.js +515 -15
- package/dist/commands/vega.js +466 -250
- package/dist/help/format.d.ts +65 -0
- package/dist/help/format.js +141 -0
- package/dist/index.d.ts +5 -5
- package/dist/index.js +3 -3
- package/dist/resources/bkn.d.ts +5 -0
- package/dist/resources/bkn.js +5 -0
- package/dist/resources/{dataviews.d.ts → resources.d.ts} +10 -11
- package/dist/resources/{dataviews.js → resources.js} +12 -13
- package/dist/resources/skills.d.ts +17 -1
- package/dist/resources/skills.js +32 -1
- package/dist/trace-ai/diagnose/agent-binding.d.ts +67 -0
- package/dist/trace-ai/diagnose/agent-binding.js +257 -0
- package/dist/trace-ai/diagnose/builtin-rules/tool-retry-intent-mismatch.yaml +68 -0
- package/dist/trace-ai/diagnose/index.d.ts +32 -0
- package/dist/trace-ai/diagnose/index.js +246 -0
- package/dist/trace-ai/diagnose/output-schema-converter.d.ts +24 -0
- package/dist/trace-ai/diagnose/output-schema-converter.js +81 -0
- package/dist/trace-ai/diagnose/query-extractor.d.ts +14 -0
- package/dist/trace-ai/diagnose/query-extractor.js +45 -0
- package/dist/trace-ai/diagnose/report-assembler.d.ts +31 -0
- package/dist/{trace-core → trace-ai}/diagnose/report-assembler.js +19 -9
- package/dist/trace-ai/diagnose/report-markdown.d.ts +18 -0
- package/dist/trace-ai/diagnose/report-markdown.js +192 -0
- package/dist/{trace-core → trace-ai}/diagnose/rule-loader.js +42 -8
- package/dist/{trace-core → trace-ai}/diagnose/schemas.d.ts +77 -2
- package/dist/trace-ai/diagnose/schemas.js +154 -0
- package/dist/trace-ai/diagnose/signal-probe.d.ts +17 -0
- package/dist/trace-ai/diagnose/signal-probe.js +39 -0
- package/dist/trace-ai/diagnose/synthesizer-agent.d.ts +40 -0
- package/dist/trace-ai/diagnose/synthesizer-agent.js +158 -0
- package/dist/{trace-core → trace-ai}/diagnose/trace-shaper.js +1 -0
- package/dist/{trace-core → trace-ai}/diagnose/types.d.ts +55 -6
- package/dist/trace-ai/eval-set/assertion-evaluator.d.ts +29 -0
- package/dist/trace-ai/eval-set/assertion-evaluator.js +100 -0
- package/dist/trace-ai/eval-set/builder.d.ts +36 -0
- package/dist/trace-ai/eval-set/builder.js +126 -0
- package/dist/trace-ai/eval-set/index.d.ts +15 -0
- package/dist/trace-ai/eval-set/index.js +10 -0
- package/dist/trace-ai/eval-set/output-writer.d.ts +27 -0
- package/dist/trace-ai/eval-set/output-writer.js +126 -0
- package/dist/trace-ai/eval-set/query-picker.d.ts +37 -0
- package/dist/trace-ai/eval-set/query-picker.js +147 -0
- package/dist/trace-ai/eval-set/redactor.d.ts +42 -0
- package/dist/trace-ai/eval-set/redactor.js +133 -0
- package/dist/trace-ai/eval-set/rubric-templates/answer-match-reference.prompt.md +19 -0
- package/dist/trace-ai/eval-set/schemas.d.ts +136 -0
- package/dist/trace-ai/eval-set/schemas.js +130 -0
- package/dist/trace-ai/eval-set/semantic-match-provider.d.ts +33 -0
- package/dist/trace-ai/eval-set/semantic-match-provider.js +51 -0
- package/dist/trace-ai/eval-set/test-runner.d.ts +34 -0
- package/dist/trace-ai/eval-set/test-runner.js +153 -0
- package/dist/trace-ai/eval-set/types.d.ts +46 -0
- package/dist/trace-ai/eval-set/types.js +8 -0
- package/dist/trace-ai/exp/bundle-writer.d.ts +10 -0
- package/dist/trace-ai/exp/bundle-writer.js +54 -0
- package/dist/trace-ai/exp/claude-binary.d.ts +5 -0
- package/dist/trace-ai/exp/claude-binary.js +30 -0
- package/dist/trace-ai/exp/coordinator.d.ts +45 -0
- package/dist/trace-ai/exp/coordinator.js +203 -0
- package/dist/trace-ai/exp/eval-runner.d.ts +14 -0
- package/dist/trace-ai/exp/eval-runner.js +47 -0
- package/dist/trace-ai/exp/exp-store/abort-signal.d.ts +3 -0
- package/dist/trace-ai/exp/exp-store/abort-signal.js +27 -0
- package/dist/trace-ai/exp/exp-store/candidate-lineage-yaml.d.ts +4 -0
- package/dist/trace-ai/exp/exp-store/candidate-lineage-yaml.js +37 -0
- package/dist/trace-ai/exp/exp-store/events-jsonl.d.ts +17 -0
- package/dist/trace-ai/exp/exp-store/events-jsonl.js +60 -0
- package/dist/trace-ai/exp/exp-store/exp-registry.d.ts +6 -0
- package/dist/trace-ai/exp/exp-store/exp-registry.js +41 -0
- package/dist/trace-ai/exp/exp-store/index.d.ts +46 -0
- package/dist/trace-ai/exp/exp-store/index.js +59 -0
- package/dist/trace-ai/exp/exp-store/lock.d.ts +3 -0
- package/dist/trace-ai/exp/exp-store/lock.js +73 -0
- package/dist/trace-ai/exp/exp-store/mission-md.d.ts +3 -0
- package/dist/trace-ai/exp/exp-store/mission-md.js +37 -0
- package/dist/trace-ai/exp/exp-store/readme-template.d.ts +5 -0
- package/dist/trace-ai/exp/exp-store/readme-template.js +25 -0
- package/dist/trace-ai/exp/exp-store/round-yaml.d.ts +3 -0
- package/dist/trace-ai/exp/exp-store/round-yaml.js +33 -0
- package/dist/trace-ai/exp/index.d.ts +8 -0
- package/dist/trace-ai/exp/index.js +238 -0
- package/dist/trace-ai/exp/info.d.ts +35 -0
- package/dist/trace-ai/exp/info.js +120 -0
- package/dist/trace-ai/exp/patch/agent-config.d.ts +1 -0
- package/dist/trace-ai/exp/patch/agent-config.js +26 -0
- package/dist/trace-ai/exp/patch/index.d.ts +2 -0
- package/dist/trace-ai/exp/patch/index.js +13 -0
- package/dist/trace-ai/exp/patch/skill.d.ts +1 -0
- package/dist/trace-ai/exp/patch/skill.js +24 -0
- package/dist/trace-ai/exp/providers/synthesizer-client.d.ts +14 -0
- package/dist/trace-ai/exp/providers/synthesizer-client.js +39 -0
- package/dist/trace-ai/exp/providers/triage-client.d.ts +19 -0
- package/dist/trace-ai/exp/providers/triage-client.js +51 -0
- package/dist/trace-ai/exp/schemas.d.ts +147 -0
- package/dist/trace-ai/exp/schemas.js +50 -0
- package/dist/trace-ai/exp/scoring.d.ts +2 -0
- package/dist/trace-ai/exp/scoring.js +46 -0
- package/dist/trace-ai/scan/aggregator.d.ts +20 -0
- package/dist/trace-ai/scan/aggregator.js +26 -0
- package/dist/trace-ai/scan/artifacts/paths.d.ts +12 -0
- package/dist/trace-ai/scan/artifacts/paths.js +18 -0
- package/dist/trace-ai/scan/artifacts/writer.d.ts +67 -0
- package/dist/trace-ai/scan/artifacts/writer.js +96 -0
- package/dist/trace-ai/scan/batched-rubric.d.ts +55 -0
- package/dist/trace-ai/scan/batched-rubric.js +159 -0
- package/dist/trace-ai/scan/cross-trace-synthesizer.d.ts +24 -0
- package/dist/trace-ai/scan/cross-trace-synthesizer.js +93 -0
- package/dist/trace-ai/scan/index.d.ts +31 -0
- package/dist/trace-ai/scan/index.js +390 -0
- package/dist/trace-ai/scan/prompts/builtin/cross-trace-synthesizer-v1.prompt.md +44 -0
- package/dist/trace-ai/scan/prompts/builtin/rubric-judge-batch-v1.prompt.md +44 -0
- package/dist/trace-ai/scan/runner.d.ts +25 -0
- package/dist/trace-ai/scan/runner.js +42 -0
- package/dist/trace-ai/scan/sampler.d.ts +18 -0
- package/dist/trace-ai/scan/sampler.js +81 -0
- package/dist/trace-ai/scan/scan-summary-markdown.d.ts +2 -0
- package/dist/trace-ai/scan/scan-summary-markdown.js +71 -0
- package/dist/trace-ai/scan/scan-summary-schema.d.ts +73 -0
- package/dist/trace-ai/scan/scan-summary-schema.js +61 -0
- package/dist/trace-ai/scan/single-agent-validator.d.ts +23 -0
- package/dist/trace-ai/scan/single-agent-validator.js +42 -0
- package/dist/trace-ai/scan/traces-list-parser.d.ts +15 -0
- package/dist/trace-ai/scan/traces-list-parser.js +46 -0
- package/package.json +2 -2
- package/dist/api/dataviews.d.ts +0 -117
- package/dist/api/dataviews.js +0 -265
- package/dist/commands/dataview.d.ts +0 -8
- package/dist/trace-core/diagnose/index.d.ts +0 -9
- package/dist/trace-core/diagnose/index.js +0 -104
- package/dist/trace-core/diagnose/report-assembler.d.ts +0 -12
- package/dist/trace-core/diagnose/schemas.js +0 -94
- package/dist/trace-core/diagnose/signal-probe.d.ts +0 -5
- package/dist/trace-core/diagnose/signal-probe.js +0 -21
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/excessive-tool-calls-per-turn.d.ts +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/excessive-tool-calls-per-turn.js +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/excessive-tool-calls-per-turn.yaml +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/llm-response-truncated-no-continue.d.ts +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/llm-response-truncated-no-continue.js +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/llm-response-truncated-no-continue.yaml +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/register.d.ts +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/register.js +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/retrieval-empty-no-fallback.d.ts +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/retrieval-empty-no-fallback.js +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/retrieval-empty-no-fallback.yaml +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/tool-error-swallowed.d.ts +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/tool-error-swallowed.js +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/tool-error-swallowed.yaml +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/tool-loop-no-state-change.d.ts +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/tool-loop-no-state-change.js +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/tool-loop-no-state-change.yaml +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/predicate-registry.d.ts +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/predicate-registry.js +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/rule-loader.d.ts +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/synthesizer-template.d.ts +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/synthesizer-template.js +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/trace-shaper.d.ts +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/types.js +0 -0
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* M5 eval-set redactor — PII pattern matching + replacement.
|
|
3
|
+
*
|
|
4
|
+
* Three rule sources, in priority order (chain):
|
|
5
|
+
* 1. --redaction-rules=<path> (CLI flag, highest)
|
|
6
|
+
* 2. <repo>/redaction-rules/*.yaml (repo-local)
|
|
7
|
+
* 3. BUILTIN_RULES (5 low-fidelity defaults)
|
|
8
|
+
*
|
|
9
|
+
* Builtin rules cover common Chinese-context PII: phone / email / id_card /
|
|
10
|
+
* bank_card / ip. Organizations write more rules in <repo>/redaction-rules/
|
|
11
|
+
* for their business-specific patterns.
|
|
12
|
+
*
|
|
13
|
+
* Rule yaml format:
|
|
14
|
+
* rules:
|
|
15
|
+
* - name: <id>
|
|
16
|
+
* pattern: <regex source string>
|
|
17
|
+
* replace: <replacement template; supports {hash6} placeholder>
|
|
18
|
+
*
|
|
19
|
+
* Malformed regex causes loadRules to throw RedactorError (no silent fallback).
|
|
20
|
+
*/
|
|
21
|
+
import { readFile, readdir, stat } from "node:fs/promises";
|
|
22
|
+
import path from "node:path";
|
|
23
|
+
import { createHash } from "node:crypto";
|
|
24
|
+
import yaml from "js-yaml";
|
|
25
|
+
/**
 * Error type thrown by this module when a rule file cannot be read or
 * parsed, or when one of its rules cannot be compiled.
 */
export class RedactorError extends Error {
    /** Path handed to the constructor by the code that raised the error. */
    path;
    constructor(message, path) {
        super(message);
        // Same assignment order as before: path first, then name.
        Object.assign(this, { path, name: "RedactorError" });
    }
}
|
|
33
|
+
/**
 * 5 builtin low-fidelity PII patterns. Tuned for Chinese-context defaults;
 * organizations override with their own rules in <repo>/redaction-rules/.
 *
 * Rules are applied in array order, so an earlier rule must not consume part
 * of a value that a later rule is meant to redact as a whole. The phone
 * pattern is therefore fenced with digit lookarounds: without them it fires
 * inside 18-digit ID numbers and 15-19 digit card numbers, after which the
 * `id_card` / `bank_card` rules no longer match and the remaining digits leak.
 *
 * Every `replace` template may contain the `{hash6}` placeholder.
 */
export const BUILTIN_RULES = [
    {
        name: "phone",
        // 11 digits starting with 1[3-9], not embedded in a longer digit run.
        pattern: /(?<!\d)1[3-9]\d{9}(?!\d)/g,
        replace: "<phone:{hash6}>",
    },
    {
        name: "email",
        pattern: /[\w.+-]+@[\w.-]+\.\w+/g,
        replace: "<email:{hash6}>",
    },
    {
        name: "id_card",
        // 17 digits followed by a digit or an X/x check character.
        pattern: /\b\d{17}[\dXx]\b/g,
        replace: "<id_card:{hash6}>",
    },
    {
        name: "bank_card",
        pattern: /\b\d{15,19}\b/g, // bank card numbers are 15-19 digits long
        replace: "<bank_card:{hash6}>",
    },
    {
        name: "ip",
        // Dotted quad; octet ranges are not validated (low-fidelity by design).
        pattern: /\b(?:\d{1,3}\.){3}\d{1,3}\b/g,
        replace: "<ip:{hash6}>",
    },
];
|
|
64
|
+
/**
 * Compiles one raw rule entry (as parsed from a rule yaml file) into a rule.
 *
 * @param entry   Raw entry; `pattern` must be a non-empty regex source string
 *                and `replace` must be a string (it may be empty).
 * @param srcPath Path of the rule file; used in messages and stored on the error.
 * @returns `{ name, pattern, replace }` where `pattern` is a RegExp built with the "g" flag.
 * @throws RedactorError if the entry is not an object, `pattern` / `replace`
 *         have the wrong shape, or the regex source does not compile.
 */
function compileRule(entry, srcPath) {
    if (entry === null || typeof entry !== "object") {
        throw new RedactorError(`rule entry at ${srcPath} must be a mapping with 'name', 'pattern' and 'replace'`, srcPath);
    }
    // `new RegExp(undefined, "g")` silently yields /(?:)/g, which matches the
    // empty string at every position; refuse incomplete entries instead.
    if (typeof entry.pattern !== "string" || entry.pattern === "") {
        throw new RedactorError(`rule '${entry.name}' at ${srcPath} must have a non-empty string 'pattern'`, srcPath);
    }
    // The replacement template is used with string methods when rules are applied.
    if (typeof entry.replace !== "string") {
        throw new RedactorError(`rule '${entry.name}' at ${srcPath} must have a string 'replace'`, srcPath);
    }
    let pattern;
    try {
        pattern = new RegExp(entry.pattern, "g");
    }
    catch (e) {
        throw new RedactorError(`invalid regex in rule '${entry.name}' at ${srcPath}: ${e.message}`, srcPath);
    }
    return { name: entry.name, pattern, replace: entry.replace };
}
|
|
74
|
+
/**
 * Reads one rule yaml file and compiles every entry under its top-level `rules` list.
 *
 * @param filePath Path of the yaml file to load.
 * @returns The compiled rules, in file order.
 * @throws RedactorError if the file cannot be read, is not valid yaml, or
 *         has no top-level `rules` array. Errors from compiling an entry propagate.
 */
async function readRulesFile(filePath) {
    let text;
    try {
        text = await readFile(filePath, "utf8");
    }
    catch (readErr) {
        throw new RedactorError(`failed to read rule file ${filePath}: ${readErr.message}`, filePath);
    }
    let doc;
    try {
        doc = yaml.load(text);
    }
    catch (parseErr) {
        throw new RedactorError(`failed to parse yaml ${filePath}: ${parseErr.message}`, filePath);
    }
    const entries = doc ? doc.rules : undefined;
    if (!Array.isArray(entries)) {
        throw new RedactorError(`rule file ${filePath} must have top-level 'rules: []'`, filePath);
    }
    const compiled = [];
    for (const entry of entries) {
        compiled.push(compileRule(entry, filePath));
    }
    return compiled;
}
|
|
95
|
+
/**
 * Resolves the redaction rule set from the first source that applies:
 *   1. `opts.cliFlag`  - a single rule file                    -> source "cli-flag"
 *   2. `opts.repoDir`  - every *.yaml / *.yml in the directory -> source "repo"
 *   3. otherwise       - BUILTIN_RULES                         -> source "builtin"
 *
 * Sources are not merged. A `repoDir` that is missing, is not a directory,
 * or contains no yaml files falls through to the builtin rules.
 *
 * @param opts `{ cliFlag?, repoDir? }`
 * @returns `{ rules, source }`
 * @throws RedactorError (from the rule-file reader) for unreadable or malformed rule files.
 */
export async function loadRules(opts) {
    if (opts.cliFlag) {
        const rules = await readRulesFile(opts.cliFlag);
        return { rules, source: "cli-flag" };
    }
    if (opts.repoDir) {
        let stats;
        try {
            stats = await stat(opts.repoDir);
        }
        catch {
            // A directory that cannot be stat'ed is treated as "no repo rules".
            stats = null;
        }
        if (stats && stats.isDirectory()) {
            const entries = await readdir(opts.repoDir);
            const yamlFiles = entries
                .filter((e) => e.endsWith(".yaml") || e.endsWith(".yml"))
                // Directory listing order is platform-dependent, and rules are
                // applied in sequence, so load the files in a fixed order.
                .sort()
                .map((e) => path.join(opts.repoDir, e));
            if (yamlFiles.length > 0) {
                const allRules = [];
                for (const f of yamlFiles) {
                    allRules.push(...(await readRulesFile(f)));
                }
                return { rules: allRules, source: "repo" };
            }
        }
    }
    return { rules: BUILTIN_RULES, source: "builtin" };
}
|
|
124
|
+
/**
 * First six hex characters of the SHA-256 digest of `input`.
 * The digest is unsalted and truncated, so equal inputs always map to the same tag.
 */
function hash6(input) {
    return createHash("sha256").update(input).digest("hex").slice(0, 6);
}
/**
 * Applies each rule to `text`, in array order, and returns the redacted text.
 *
 * Every match of `rule.pattern` is replaced by `rule.replace`, with each
 * `{hash6}` placeholder in the template substituted by the hash of the matched text.
 *
 * @param text  Input text.
 * @param rules Rules of the form `{ name, pattern: RegExp, replace: string }`.
 * @returns The text after all rules have been applied.
 */
export function applyRules(text, rules) {
    let out = text;
    for (const rule of rules) {
        // A non-global pattern would redact only the first hit and leak the
        // rest, so upgrade it to a global one.
        const pattern = rule.pattern.global
            ? rule.pattern
            : new RegExp(rule.pattern.source, `${rule.pattern.flags}g`);
        // split/join substitutes every placeholder, not just the first one.
        out = out.replace(pattern, (match) => rule.replace.split("{hash6}").join(hash6(match)));
    }
    return out;
}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
You are evaluating whether a candidate answer is semantically equivalent to a reference answer for a knowledge-graph Q&A system.
|
|
2
|
+
|
|
3
|
+
Question: {{question}}
|
|
4
|
+
|
|
5
|
+
Reference answer (ground truth):
|
|
6
|
+
{{reference_answer}}
|
|
7
|
+
|
|
8
|
+
Candidate answer (from agent):
|
|
9
|
+
{{candidate_answer}}
|
|
10
|
+
|
|
11
|
+
Judge whether the candidate answer is semantically correct relative to the reference.
|
|
12
|
+
A candidate passes if it conveys the same key facts, even if phrased differently.
|
|
13
|
+
A candidate fails if it omits critical facts, states incorrect facts, or hallucinates information not in the reference.
|
|
14
|
+
Partial answers that cover most key facts but miss minor details should pass.
|
|
15
|
+
|
|
16
|
+
{{language_instruction}}
|
|
17
|
+
|
|
18
|
+
Respond with valid JSON matching this schema exactly:
|
|
19
|
+
{{output_schema}}
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* M5 eval-set zod schemas (PR-A, MVP-B scope).
|
|
3
|
+
*
|
|
4
|
+
* 4 schemas in this file:
|
|
5
|
+
* - EvalSetIndexSchema: trace-eval-set-index/v1 (eval-set dir's index.yaml)
|
|
6
|
+
* - EvalSetShardSchema: trace-eval-set/v1 (final shard yaml file)
|
|
7
|
+
* - EvalSetInputSchema: trace-eval-set-input/v1 (--queries simplified input)
|
|
8
|
+
* - TestReportSchema: trace-test-report/v1 (test report; PR-A defines schema only;
|
|
9
|
+
* PR-B consumer)
|
|
10
|
+
*
|
|
11
|
+
* EvalSetShardSchema and EvalSetInputSchema share the same refinement:
|
|
12
|
+
* "for each case, at least one of {reference, non-empty assertions} must be present."
|
|
13
|
+
*
|
|
14
|
+
* The D5 builtin rubric `answer-match-reference` output schema is NOT here —
|
|
15
|
+
* it belongs to the rubric template definition (per spec doc §4.1).
|
|
16
|
+
*/
|
|
17
|
+
import { z } from "zod";
|
|
18
|
+
/**
 * Declared type of the `trace-eval-set-index/v1` schema (per the file header:
 * the eval-set directory's index.yaml). An index has a `schema_version`
 * literal, an `eval_set_id` string, and a `shards` array whose entries carry
 * a `path` and an optional `role` of "seed" | "regression" | "holdout".
 */
export declare const EvalSetIndexSchema: z.ZodObject<{
|
|
19
|
+
schema_version: z.ZodLiteral<"trace-eval-set-index/v1">;
|
|
20
|
+
eval_set_id: z.ZodString;
|
|
21
|
+
shards: z.ZodArray<z.ZodObject<{
|
|
22
|
+
path: z.ZodString;
|
|
23
|
+
role: z.ZodOptional<z.ZodEnum<{
|
|
24
|
+
seed: "seed";
|
|
25
|
+
regression: "regression";
|
|
26
|
+
holdout: "holdout";
|
|
27
|
+
}>>;
|
|
28
|
+
}, z.core.$strip>>;
|
|
29
|
+
}, z.core.$strip>;
|
|
30
|
+
/**
 * Declared type of the `trace-eval-set/v1` schema (per the file header: the
 * final shard yaml file). A shard has a `schema_version` literal and a
 * `cases` array; each case carries `query_id`, `input.user_message`, an
 * optional `reference.answer`, optional `assertions` (objects with a `type`
 * enum, declared with `z.core.$loose`), and optional `tags`.
 */
export declare const EvalSetShardSchema: z.ZodObject<{
|
|
31
|
+
schema_version: z.ZodLiteral<"trace-eval-set/v1">;
|
|
32
|
+
cases: z.ZodArray<z.ZodObject<{
|
|
33
|
+
query_id: z.ZodString;
|
|
34
|
+
input: z.ZodObject<{
|
|
35
|
+
user_message: z.ZodString;
|
|
36
|
+
}, z.core.$strip>;
|
|
37
|
+
reference: z.ZodOptional<z.ZodObject<{
|
|
38
|
+
answer: z.ZodString;
|
|
39
|
+
}, z.core.$strip>>;
|
|
40
|
+
assertions: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
41
|
+
type: z.ZodEnum<{
|
|
42
|
+
contains: "contains";
|
|
43
|
+
not_contains: "not_contains";
|
|
44
|
+
regex: "regex";
|
|
45
|
+
tool_call_count: "tool_call_count";
|
|
46
|
+
tool_call_order: "tool_call_order";
|
|
47
|
+
semantic_match: "semantic_match";
|
|
48
|
+
latency_ms: "latency_ms";
|
|
49
|
+
}>;
|
|
50
|
+
}, z.core.$loose>>>;
|
|
51
|
+
tags: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
52
|
+
}, z.core.$strip>>;
|
|
53
|
+
}, z.core.$strip>;
|
|
54
|
+
/**
 * Declared type of the `trace-eval-set-input/v1` schema (per the file header:
 * the simplified `--queries` input). Same case fields as the shard schema,
 * except that `query_id` is optional here.
 */
export declare const EvalSetInputSchema: z.ZodObject<{
|
|
55
|
+
schema_version: z.ZodLiteral<"trace-eval-set-input/v1">;
|
|
56
|
+
cases: z.ZodArray<z.ZodObject<{
|
|
57
|
+
input: z.ZodObject<{
|
|
58
|
+
user_message: z.ZodString;
|
|
59
|
+
}, z.core.$strip>;
|
|
60
|
+
query_id: z.ZodOptional<z.ZodString>;
|
|
61
|
+
tags: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
62
|
+
reference: z.ZodOptional<z.ZodObject<{
|
|
63
|
+
answer: z.ZodString;
|
|
64
|
+
}, z.core.$strip>>;
|
|
65
|
+
assertions: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
66
|
+
type: z.ZodEnum<{
|
|
67
|
+
contains: "contains";
|
|
68
|
+
not_contains: "not_contains";
|
|
69
|
+
regex: "regex";
|
|
70
|
+
tool_call_count: "tool_call_count";
|
|
71
|
+
tool_call_order: "tool_call_order";
|
|
72
|
+
semantic_match: "semantic_match";
|
|
73
|
+
latency_ms: "latency_ms";
|
|
74
|
+
}>;
|
|
75
|
+
}, z.core.$loose>>>;
|
|
76
|
+
}, z.core.$strip>>;
|
|
77
|
+
}, z.core.$strip>;
|
|
78
|
+
/**
 * Declared type of the `trace-test-report/v1` schema (per the file header:
 * the test report). A report has three sections besides `schema_version`:
 *   - `meta`: eval-set dir / id, candidate agent id (+ optional version),
 *     cli version, `ran_at`, and `duration_ms`;
 *   - `summary`: total / pass / fail / error / skip counts plus pass / fail
 *     counts keyed by a string in `by_assertion_type`;
 *   - `cases`: one entry per query with a status, a nullable conversation id,
 *     per-assertion results, and optional failure / error details.
 */
export declare const TestReportSchema: z.ZodObject<{
|
|
79
|
+
schema_version: z.ZodLiteral<"trace-test-report/v1">;
|
|
80
|
+
meta: z.ZodObject<{
|
|
81
|
+
eval_set_dir: z.ZodString;
|
|
82
|
+
eval_set_id: z.ZodString;
|
|
83
|
+
candidate: z.ZodObject<{
|
|
84
|
+
agent_id: z.ZodString;
|
|
85
|
+
agent_version: z.ZodOptional<z.ZodString>;
|
|
86
|
+
}, z.core.$strip>;
|
|
87
|
+
cli_version: z.ZodString;
|
|
88
|
+
ran_at: z.ZodString;
|
|
89
|
+
duration_ms: z.ZodNumber;
|
|
90
|
+
}, z.core.$strip>;
|
|
91
|
+
summary: z.ZodObject<{
|
|
92
|
+
total: z.ZodNumber;
|
|
93
|
+
pass: z.ZodNumber;
|
|
94
|
+
fail: z.ZodNumber;
|
|
95
|
+
error: z.ZodNumber;
|
|
96
|
+
skip: z.ZodNumber;
|
|
97
|
+
by_assertion_type: z.ZodRecord<z.ZodString, z.ZodObject<{
|
|
98
|
+
pass: z.ZodNumber;
|
|
99
|
+
fail: z.ZodNumber;
|
|
100
|
+
}, z.core.$strip>>;
|
|
101
|
+
}, z.core.$strip>;
|
|
102
|
+
cases: z.ZodArray<z.ZodObject<{
|
|
103
|
+
query_id: z.ZodString;
|
|
104
|
+
status: z.ZodEnum<{
|
|
105
|
+
error: "error";
|
|
106
|
+
pass: "pass";
|
|
107
|
+
fail: "fail";
|
|
108
|
+
skip: "skip";
|
|
109
|
+
}>;
|
|
110
|
+
conversation_id: z.ZodNullable<z.ZodString>;
|
|
111
|
+
trace_id: z.ZodOptional<z.ZodNullable<z.ZodString>>;
|
|
112
|
+
duration_ms: z.ZodOptional<z.ZodNumber>;
|
|
113
|
+
assertion_results: z.ZodArray<z.ZodObject<{
|
|
114
|
+
assertion: z.ZodObject<{
|
|
115
|
+
type: z.ZodEnum<{
|
|
116
|
+
contains: "contains";
|
|
117
|
+
not_contains: "not_contains";
|
|
118
|
+
regex: "regex";
|
|
119
|
+
tool_call_count: "tool_call_count";
|
|
120
|
+
tool_call_order: "tool_call_order";
|
|
121
|
+
semantic_match: "semantic_match";
|
|
122
|
+
latency_ms: "latency_ms";
|
|
123
|
+
}>;
|
|
124
|
+
}, z.core.$loose>;
|
|
125
|
+
verdict: z.ZodEnum<{
|
|
126
|
+
pass: "pass";
|
|
127
|
+
fail: "fail";
|
|
128
|
+
skip: "skip";
|
|
129
|
+
}>;
|
|
130
|
+
actual: z.ZodOptional<z.ZodUnknown>;
|
|
131
|
+
}, z.core.$strip>>;
|
|
132
|
+
failure_reason: z.ZodOptional<z.ZodString>;
|
|
133
|
+
error_code: z.ZodOptional<z.ZodString>;
|
|
134
|
+
error_message: z.ZodOptional<z.ZodString>;
|
|
135
|
+
}, z.core.$strip>>;
|
|
136
|
+
}, z.core.$strip>;
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* M5 eval-set zod schemas (PR-A, MVP-B scope).
|
|
3
|
+
*
|
|
4
|
+
* 4 schemas in this file:
|
|
5
|
+
* - EvalSetIndexSchema: trace-eval-set-index/v1 (eval-set dir's index.yaml)
|
|
6
|
+
* - EvalSetShardSchema: trace-eval-set/v1 (final shard yaml file)
|
|
7
|
+
* - EvalSetInputSchema: trace-eval-set-input/v1 (--queries simplified input)
|
|
8
|
+
* - TestReportSchema: trace-test-report/v1 (test report; PR-A defines schema only;
|
|
9
|
+
* PR-B consumer)
|
|
10
|
+
*
|
|
11
|
+
* EvalSetShardSchema and EvalSetInputSchema share the same refinement:
|
|
12
|
+
* "for each case, at least one of {reference, non-empty assertions} must be present."
|
|
13
|
+
*
|
|
14
|
+
* The D5 builtin rubric `answer-match-reference` output schema is NOT here —
|
|
15
|
+
* it belongs to the rubric template definition (per spec doc §4.1).
|
|
16
|
+
*/
|
|
17
|
+
import { z } from "zod";
|
|
18
|
+
// A case's `input`: an object holding a non-empty `user_message` string.
const InputSchema = z.object({ user_message: z.string().min(1) });
|
|
21
|
+
// A case's `reference`: an object holding a non-empty `answer` string.
const ReferenceSchema = z.object({ answer: z.string().min(1) });
|
|
24
|
+
// One assertion. Only `type` is validated here; passthrough() keeps any other
// key so type-specific fields (value, pattern, tool, op, n, ...) survive parsing.
const AssertionSchema = z
    .object({
        type: z.enum([
            "contains",
            "not_contains",
            "regex",
            "tool_call_count",
            "tool_call_order",
            "semantic_match",
            "latency_ms",
        ]),
    })
    .passthrough();
|
|
35
|
+
// ── trace-eval-set-index/v1 ──────────────────────────────────────────────
|
|
36
|
+
// Reference to one shard file from the eval-set index.
const ShardRefSchema = z.object({
    path: z
        .string()
        .min(1)
        .refine(
        // The path must stay inside the eval-set directory. Besides '..' and
        // a leading '/', also reject Windows-style roots, which the original
        // check let through: a leading backslash (including UNC paths) and a
        // drive prefix such as "C:".
        (p) => !p.includes("..") && !/^(?:[\\\/]|[A-Za-z]:)/.test(p), {
        message: "shard path must be a relative path within the eval-set directory (no '..' / '/')",
    }),
    // Optional role of the shard within the eval set.
    role: z.enum(["seed", "regression", "holdout"]).optional(),
});
|
|
45
|
+
/**
 * `trace-eval-set-index/v1`: a non-empty `eval_set_id` plus at least one shard reference.
 */
export const EvalSetIndexSchema = z.object({
    schema_version: z.literal("trace-eval-set-index/v1"),
    eval_set_id: z.string().min(1),
    shards: ShardRefSchema.array().min(1),
});
|
|
50
|
+
// ── trace-eval-set/v1 ────────────────────────────────────────────────────
|
|
51
|
+
/**
 * Shared per-case refinement: a case needs a `reference` (neither undefined
 * nor null) or a non-empty `assertions` array. When both are missing, one
 * custom issue is reported through `ctx.addIssue`.
 *
 * @param data Parsed case object.
 * @param ctx  Refinement context; only `addIssue` is used.
 */
const refineCase = (data, ctx) => {
    const hasReference = data.reference != null;
    const hasAssertions = Array.isArray(data.assertions) && data.assertions.length > 0;
    if (hasReference || hasAssertions) {
        return;
    }
    ctx.addIssue({
        // Literal issue code: same value as the deprecated `z.ZodIssueCode.custom`.
        code: "custom",
        message: "each case must have either a 'reference' object or a non-empty 'assertions' array; both empty is not allowed (evaluator has no pass/fail signal)",
    });
};
|
|
61
|
+
// One case of a final shard: `query_id` and `input` are required; `reference`,
// `assertions` and `tags` are optional, subject to the refineCase check.
const FinalCaseSchema = z
    .object({
        query_id: z.string().min(1),
        input: InputSchema,
        reference: ReferenceSchema.optional(),
        assertions: AssertionSchema.array().optional(),
        tags: z.string().array().optional(),
    })
    .superRefine(refineCase);
|
|
70
|
+
/**
 * `trace-eval-set/v1`: a shard document holding at least one final case.
 */
export const EvalSetShardSchema = z.object({
    schema_version: z.literal("trace-eval-set/v1"),
    cases: FinalCaseSchema.array().min(1),
});
|
|
74
|
+
// ── trace-eval-set-input/v1 (D1: same refinement as final) ───────────────
|
|
75
|
+
// One case of the simplified input: only `input` is required; `query_id`,
// `tags`, `reference` and `assertions` are optional, subject to the refineCase check.
const InputCaseSchema = z
    .object({
        input: InputSchema,
        query_id: z.string().min(1).optional(),
        tags: z.string().array().optional(),
        reference: ReferenceSchema.optional(),
        assertions: AssertionSchema.array().optional(),
    })
    .superRefine(refineCase);
|
|
84
|
+
/**
 * `trace-eval-set-input/v1`: an input document holding at least one input case.
 */
export const EvalSetInputSchema = z.object({
    schema_version: z.literal("trace-eval-set-input/v1"),
    cases: InputCaseSchema.array().min(1),
});
|
|
88
|
+
// ── trace-test-report/v1 (PR-A defines; PR-B writes) ─────────────────────
|
|
89
|
+
// Result of one assertion: the assertion itself, a pass / fail / skip verdict,
// and an optional `actual` value of any type.
const AssertionResultSchema = z.object({
    assertion: AssertionSchema,
    verdict: z.enum(["pass", "fail", "skip"]),
    actual: z.optional(z.unknown()),
});
|
|
94
|
+
// Per-case entry of a test report. `conversation_id` must be present but may
// be null; `trace_id` may be absent or null; durations must be non-negative.
const CaseResultSchema = z.object({
    query_id: z.string().min(1),
    status: z.enum(["pass", "fail", "error", "skip"]),
    conversation_id: z.nullable(z.string()),
    trace_id: z.optional(z.nullable(z.string())),
    duration_ms: z.optional(z.number().nonnegative()),
    assertion_results: AssertionResultSchema.array(),
    failure_reason: z.optional(z.string()),
    error_code: z.optional(z.string()),
    error_message: z.optional(z.string()),
});
|
|
105
|
+
// `meta` section of a test report: what was run, against which candidate agent, and when.
const testReportMetaSchema = z.object({
    eval_set_dir: z.string().min(1),
    eval_set_id: z.string().min(1),
    candidate: z.object({
        agent_id: z.string().min(1),
        agent_version: z.string().optional(),
    }),
    cli_version: z.string().min(1),
    ran_at: z.string().min(1),
    duration_ms: z.number().nonnegative(),
});
// Pass / fail counters stored per key of `by_assertion_type`.
const testReportTallySchema = z.object({
    pass: z.number().int().nonnegative(),
    fail: z.number().int().nonnegative(),
});
// `summary` section: non-negative integer counters.
const testReportSummarySchema = z.object({
    total: z.number().int().nonnegative(),
    pass: z.number().int().nonnegative(),
    fail: z.number().int().nonnegative(),
    error: z.number().int().nonnegative(),
    skip: z.number().int().nonnegative(),
    by_assertion_type: z.record(z.string(), testReportTallySchema),
});
/**
 * `trace-test-report/v1`: `meta`, `summary` and the per-case results.
 */
export const TestReportSchema = z.object({
    schema_version: z.literal("trace-test-report/v1"),
    meta: testReportMetaSchema,
    summary: testReportSummarySchema,
    cases: CaseResultSchema.array(),
});
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Builtin `semantic_match` judge for the eval-set test runner (M5 D5).
|
|
3
|
+
*
|
|
4
|
+
* Wraps an `AgentProvider` + the `builtin:answer-match-reference` prompt
|
|
5
|
+
* template + a small zod output schema into the `SemanticMatchProvider`
|
|
6
|
+
* surface the assertion-evaluator already speaks. The runner stays
|
|
7
|
+
* provider-agnostic; only this file knows how to render the rubric
|
|
8
|
+
* prompt and validate the LLM's reply.
|
|
9
|
+
*
|
|
10
|
+
* Output schema is intentionally local to this rubric (spec §4.1) —
|
|
11
|
+
* not in `schemas.ts`, which only carries the eval-set / report shapes.
|
|
12
|
+
*/
|
|
13
|
+
import { z } from "zod";
|
|
14
|
+
import type { AgentProvider } from "../../agent-providers/types.js";
|
|
15
|
+
import { type PromptTemplateRegistry, type AgentOutputLang } from "../../agent-providers/prompt-template.js";
|
|
16
|
+
import type { SemanticMatchProvider } from "./assertion-evaluator.js";
|
|
17
|
+
/** Reference string of the builtin answer-match rubric prompt template. */
export declare const ANSWER_MATCH_REFERENCE_REF = "builtin:answer-match-reference";
|
|
18
|
+
/**
 * Declared type of the rubric's output schema: a `verdict` of
 * "pass" | "fail" and a `reasoning` string.
 */
export declare const AnswerMatchOutputSchema: z.ZodObject<{
|
|
19
|
+
verdict: z.ZodEnum<{
|
|
20
|
+
pass: "pass";
|
|
21
|
+
fail: "fail";
|
|
22
|
+
}>;
|
|
23
|
+
reasoning: z.ZodString;
|
|
24
|
+
}, z.core.$strip>;
|
|
25
|
+
/**
 * Options accepted by `createBuiltinSemanticMatchProvider`: the agent
 * provider, the prompt-template registry, and two optional settings
 * (`lang`, `timeoutMs`).
 */
export interface CreateSemanticMatchProviderOpts {
|
|
26
|
+
provider: AgentProvider;
|
|
27
|
+
promptRegistry: PromptTemplateRegistry;
|
|
28
|
+
/** Output locale for the rubric's reasoning text. Default 'en'. */
|
|
29
|
+
lang?: AgentOutputLang;
|
|
30
|
+
/** Per-invoke timeout override. */
|
|
31
|
+
timeoutMs?: number;
|
|
32
|
+
}
|
|
33
|
+
/** Builds the builtin `semantic_match` judge as a `SemanticMatchProvider` from the given options. */
export declare function createBuiltinSemanticMatchProvider(opts: CreateSemanticMatchProviderOpts): SemanticMatchProvider;
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Builtin `semantic_match` judge for the eval-set test runner (M5 D5).
|
|
3
|
+
*
|
|
4
|
+
* Wraps an `AgentProvider` + the `builtin:answer-match-reference` prompt
|
|
5
|
+
* template + a small zod output schema into the `SemanticMatchProvider`
|
|
6
|
+
* surface the assertion-evaluator already speaks. The runner stays
|
|
7
|
+
* provider-agnostic; only this file knows how to render the rubric
|
|
8
|
+
* prompt and validate the LLM's reply.
|
|
9
|
+
*
|
|
10
|
+
* Output schema is intentionally local to this rubric (spec §4.1) —
|
|
11
|
+
* not in `schemas.ts`, which only carries the eval-set / report shapes.
|
|
12
|
+
*/
|
|
13
|
+
import { z } from "zod";
|
|
14
|
+
import { render as renderPrompt, languageInstructionFor, } from "../../agent-providers/prompt-template.js";
|
|
15
|
+
/** Key under which the builtin answer-match rubric template is looked up in the prompt registry. */
export const ANSWER_MATCH_REFERENCE_REF = "builtin:answer-match-reference";
|
|
16
|
+
/**
 * Schema of the judge's reply, passed to the provider as `outputSchema`:
 * a "pass" / "fail" verdict plus a reasoning string.
 */
export const AnswerMatchOutputSchema = z.object({ verdict: z.enum(["pass", "fail"]), reasoning: z.string() });
|
|
20
|
+
// Plain-object description of the reply the LLM is asked to produce. It is
// passed as the `output_schema` template variable and is written out by hand
// rather than derived from the Zod schema at runtime.
const OUTPUT_SCHEMA_DOC = {
    type: "object",
    required: ["verdict", "reasoning"],
    properties: { verdict: { type: "string", enum: ["pass", "fail"] }, reasoning: { type: "string" } },
};
|
|
31
|
+
/**
 * Builds the builtin `semantic_match` judge.
 *
 * Options are read once, at creation time: `provider`, `promptRegistry`,
 * `lang` (falls back to "en" when undefined) and `timeoutMs`.
 *
 * @returns An object with `judgeSemanticMatch(question, candidateAnswer,
 *   referenceAnswer)`, which fetches the `builtin:answer-match-reference`
 *   template from the registry, renders it, invokes the provider with
 *   `AnswerMatchOutputSchema`, and resolves to `{ verdict, reasoning }`
 *   taken from the response's `output`.
 */
export function createBuiltinSemanticMatchProvider(opts) {
    const { provider: agent, promptRegistry: registry, lang: outputLang = "en", timeoutMs: invokeTimeoutMs, } = opts;
    const judgeSemanticMatch = async (question, candidateAnswer, referenceAnswer) => {
        const template = registry.get(ANSWER_MATCH_REFERENCE_REF);
        const variables = {
            question,
            candidate_answer: candidateAnswer,
            reference_answer: referenceAnswer,
            language_instruction: languageInstructionFor(outputLang),
            output_schema: OUTPUT_SCHEMA_DOC,
        };
        const { output } = await agent.invoke({
            prompt: renderPrompt(template, variables),
            outputSchema: AnswerMatchOutputSchema,
            timeoutMs: invokeTimeoutMs,
        });
        return { verdict: output.verdict, reasoning: output.reasoning };
    };
    return { judgeSemanticMatch };
}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
import type { TraceSpan } from "../../api/conversations.js";
|
|
2
|
+
import type { SemanticMatchProvider } from "./assertion-evaluator.js";
|
|
3
|
+
/**
 * Dependencies supplied to the runner through `RunOpts.deps`:
 *   - `fetchAgent`: takes an agent id and optional version, resolves to `{ id, key, version }`.
 *   - `sendChat`: takes that agent info, a query and an optional conversation
 *     id, resolves to the reply `text` and an optional `conversationId`.
 *   - `fetchTrace`: takes a conversation id, resolves to its `spans`.
 *   - `semanticMatchProvider`: optional judge.
 *     NOTE(review): presumably used for `semantic_match` assertions; confirm against the runner implementation.
 */
export interface RunnerDeps {
|
|
4
|
+
fetchAgent: (agentId: string, version?: string) => Promise<{
|
|
5
|
+
id: string;
|
|
6
|
+
key: string;
|
|
7
|
+
version: string;
|
|
8
|
+
}>;
|
|
9
|
+
sendChat: (opts: {
|
|
10
|
+
agentInfo: {
|
|
11
|
+
id: string;
|
|
12
|
+
key: string;
|
|
13
|
+
version: string;
|
|
14
|
+
};
|
|
15
|
+
query: string;
|
|
16
|
+
conversationId?: string;
|
|
17
|
+
}) => Promise<{
|
|
18
|
+
text: string;
|
|
19
|
+
conversationId?: string;
|
|
20
|
+
}>;
|
|
21
|
+
fetchTrace: (conversationId: string) => Promise<{
|
|
22
|
+
spans: TraceSpan[];
|
|
23
|
+
}>;
|
|
24
|
+
semanticMatchProvider?: SemanticMatchProvider;
|
|
25
|
+
}
|
|
26
|
+
/**
 * Options for `run`: the eval-set directory, the candidate agent id (and
 * optional version), an output directory, an optional `maxParallel`, and
 * the injected `deps`.
 * NOTE(review): `maxParallel` presumably caps concurrent cases; its default is not visible here.
 */
export interface RunOpts {
|
|
27
|
+
evalSetDir: string;
|
|
28
|
+
candidateAgentId: string;
|
|
29
|
+
candidateAgentVersion?: string;
|
|
30
|
+
outDir: string;
|
|
31
|
+
maxParallel?: number;
|
|
32
|
+
deps: RunnerDeps;
|
|
33
|
+
}
|
|
34
|
+
/**
 * Entry point of the eval-set test runner; resolves with no value.
 * NOTE(review): presumably runs the eval set in `opts.evalSetDir` against the
 * candidate agent and writes results under `opts.outDir`; confirm against the implementation.
 */
export declare function run(opts: RunOpts): Promise<void>;
|