@kweaver-ai/kweaver-sdk 0.7.4 → 0.8.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +39 -5
- package/README.zh.md +37 -5
- package/dist/agent-providers/index.d.ts +7 -0
- package/dist/agent-providers/index.js +5 -0
- package/dist/agent-providers/prompt-template.d.ts +62 -0
- package/dist/agent-providers/prompt-template.js +105 -0
- package/dist/agent-providers/prompts/rubric-judge-v1.prompt.md +51 -0
- package/dist/agent-providers/prompts/within-trace-synthesizer-v1.prompt.md +60 -0
- package/dist/agent-providers/providers/claude-code-subprocess.d.ts +74 -0
- package/dist/agent-providers/providers/claude-code-subprocess.js +259 -0
- package/dist/agent-providers/providers/stub.d.ts +47 -0
- package/dist/agent-providers/providers/stub.js +77 -0
- package/dist/agent-providers/registry.d.ts +45 -0
- package/dist/agent-providers/registry.js +77 -0
- package/dist/agent-providers/types.d.ts +91 -0
- package/dist/agent-providers/types.js +25 -0
- package/dist/api/agent-chat.js +8 -6
- package/dist/api/agent-observability.d.ts +51 -0
- package/dist/api/agent-observability.js +108 -0
- package/dist/api/context-loader.d.ts +1 -0
- package/dist/api/conversations.d.ts +4 -8
- package/dist/api/conversations.js +16 -58
- package/dist/api/datasources.d.ts +2 -20
- package/dist/api/datasources.js +7 -123
- package/dist/api/semantic-search.d.ts +5 -0
- package/dist/api/semantic-search.js +5 -0
- package/dist/api/skills.d.ts +75 -2
- package/dist/api/skills.js +108 -12
- package/dist/api/trace.d.ts +49 -0
- package/dist/api/trace.js +85 -0
- package/dist/api/vega.d.ts +53 -0
- package/dist/api/vega.js +144 -0
- package/dist/cli.js +12 -5
- package/dist/commands/agent/mode.d.ts +6 -0
- package/dist/commands/agent/mode.js +75 -0
- package/dist/commands/agent.js +101 -29
- package/dist/commands/bkn-ops.js +12 -6
- package/dist/commands/bkn-utils.d.ts +9 -0
- package/dist/commands/bkn-utils.js +17 -0
- package/dist/commands/context-loader.js +608 -38
- package/dist/commands/ds.js +7 -2
- package/dist/commands/skill.d.ts +21 -1
- package/dist/commands/skill.js +389 -1
- package/dist/commands/trace.d.ts +39 -0
- package/dist/commands/trace.js +668 -0
- package/dist/index.d.ts +2 -2
- package/dist/index.js +1 -1
- package/dist/resources/bkn.d.ts +5 -0
- package/dist/resources/bkn.js +5 -0
- package/dist/resources/datasources.js +2 -1
- package/dist/resources/skills.d.ts +17 -1
- package/dist/resources/skills.js +32 -1
- package/dist/trace-ai/diagnose/agent-binding.d.ts +67 -0
- package/dist/trace-ai/diagnose/agent-binding.js +257 -0
- package/dist/trace-ai/diagnose/builtin-rules/excessive-tool-calls-per-turn.d.ts +2 -0
- package/dist/trace-ai/diagnose/builtin-rules/excessive-tool-calls-per-turn.js +15 -0
- package/dist/trace-ai/diagnose/builtin-rules/excessive-tool-calls-per-turn.yaml +16 -0
- package/dist/trace-ai/diagnose/builtin-rules/llm-response-truncated-no-continue.d.ts +2 -0
- package/dist/trace-ai/diagnose/builtin-rules/llm-response-truncated-no-continue.js +44 -0
- package/dist/trace-ai/diagnose/builtin-rules/llm-response-truncated-no-continue.yaml +15 -0
- package/dist/trace-ai/diagnose/builtin-rules/register.d.ts +1 -0
- package/dist/trace-ai/diagnose/builtin-rules/register.js +11 -0
- package/dist/trace-ai/diagnose/builtin-rules/retrieval-empty-no-fallback.d.ts +2 -0
- package/dist/trace-ai/diagnose/builtin-rules/retrieval-empty-no-fallback.js +29 -0
- package/dist/trace-ai/diagnose/builtin-rules/retrieval-empty-no-fallback.yaml +15 -0
- package/dist/trace-ai/diagnose/builtin-rules/tool-error-swallowed.d.ts +2 -0
- package/dist/trace-ai/diagnose/builtin-rules/tool-error-swallowed.js +45 -0
- package/dist/trace-ai/diagnose/builtin-rules/tool-error-swallowed.yaml +15 -0
- package/dist/trace-ai/diagnose/builtin-rules/tool-loop-no-state-change.d.ts +2 -0
- package/dist/trace-ai/diagnose/builtin-rules/tool-loop-no-state-change.js +38 -0
- package/dist/trace-ai/diagnose/builtin-rules/tool-loop-no-state-change.yaml +16 -0
- package/dist/trace-ai/diagnose/builtin-rules/tool-retry-intent-mismatch.yaml +68 -0
- package/dist/trace-ai/diagnose/index.d.ts +32 -0
- package/dist/trace-ai/diagnose/index.js +246 -0
- package/dist/trace-ai/diagnose/output-schema-converter.d.ts +24 -0
- package/dist/trace-ai/diagnose/output-schema-converter.js +81 -0
- package/dist/trace-ai/diagnose/predicate-registry.d.ts +7 -0
- package/dist/trace-ai/diagnose/predicate-registry.js +30 -0
- package/dist/trace-ai/diagnose/query-extractor.d.ts +14 -0
- package/dist/trace-ai/diagnose/query-extractor.js +45 -0
- package/dist/trace-ai/diagnose/report-assembler.d.ts +31 -0
- package/dist/trace-ai/diagnose/report-assembler.js +100 -0
- package/dist/trace-ai/diagnose/report-markdown.d.ts +18 -0
- package/dist/trace-ai/diagnose/report-markdown.js +192 -0
- package/dist/trace-ai/diagnose/rule-loader.d.ts +11 -0
- package/dist/trace-ai/diagnose/rule-loader.js +120 -0
- package/dist/trace-ai/diagnose/schemas.d.ts +184 -0
- package/dist/trace-ai/diagnose/schemas.js +154 -0
- package/dist/trace-ai/diagnose/signal-probe.d.ts +17 -0
- package/dist/trace-ai/diagnose/signal-probe.js +39 -0
- package/dist/trace-ai/diagnose/synthesizer-agent.d.ts +40 -0
- package/dist/trace-ai/diagnose/synthesizer-agent.js +158 -0
- package/dist/trace-ai/diagnose/synthesizer-template.d.ts +2 -0
- package/dist/trace-ai/diagnose/synthesizer-template.js +49 -0
- package/dist/trace-ai/diagnose/trace-shaper.d.ts +3 -0
- package/dist/trace-ai/diagnose/trace-shaper.js +73 -0
- package/dist/trace-ai/diagnose/types.d.ts +173 -0
- package/dist/trace-ai/diagnose/types.js +1 -0
- package/dist/trace-ai/eval-set/assertion-evaluator.d.ts +29 -0
- package/dist/trace-ai/eval-set/assertion-evaluator.js +100 -0
- package/dist/trace-ai/eval-set/builder.d.ts +36 -0
- package/dist/trace-ai/eval-set/builder.js +126 -0
- package/dist/trace-ai/eval-set/index.d.ts +15 -0
- package/dist/trace-ai/eval-set/index.js +10 -0
- package/dist/trace-ai/eval-set/output-writer.d.ts +27 -0
- package/dist/trace-ai/eval-set/output-writer.js +126 -0
- package/dist/trace-ai/eval-set/query-picker.d.ts +37 -0
- package/dist/trace-ai/eval-set/query-picker.js +147 -0
- package/dist/trace-ai/eval-set/redactor.d.ts +42 -0
- package/dist/trace-ai/eval-set/redactor.js +133 -0
- package/dist/trace-ai/eval-set/rubric-templates/answer-match-reference.prompt.md +19 -0
- package/dist/trace-ai/eval-set/schemas.d.ts +136 -0
- package/dist/trace-ai/eval-set/schemas.js +130 -0
- package/dist/trace-ai/eval-set/semantic-match-provider.d.ts +33 -0
- package/dist/trace-ai/eval-set/semantic-match-provider.js +51 -0
- package/dist/trace-ai/eval-set/test-runner.d.ts +34 -0
- package/dist/trace-ai/eval-set/test-runner.js +153 -0
- package/dist/trace-ai/eval-set/types.d.ts +46 -0
- package/dist/trace-ai/eval-set/types.js +8 -0
- package/dist/trace-ai/exp/bundle-writer.d.ts +10 -0
- package/dist/trace-ai/exp/bundle-writer.js +54 -0
- package/dist/trace-ai/exp/claude-binary.d.ts +5 -0
- package/dist/trace-ai/exp/claude-binary.js +30 -0
- package/dist/trace-ai/exp/coordinator.d.ts +45 -0
- package/dist/trace-ai/exp/coordinator.js +203 -0
- package/dist/trace-ai/exp/eval-runner.d.ts +14 -0
- package/dist/trace-ai/exp/eval-runner.js +47 -0
- package/dist/trace-ai/exp/exp-store/abort-signal.d.ts +3 -0
- package/dist/trace-ai/exp/exp-store/abort-signal.js +27 -0
- package/dist/trace-ai/exp/exp-store/candidate-lineage-yaml.d.ts +4 -0
- package/dist/trace-ai/exp/exp-store/candidate-lineage-yaml.js +37 -0
- package/dist/trace-ai/exp/exp-store/events-jsonl.d.ts +17 -0
- package/dist/trace-ai/exp/exp-store/events-jsonl.js +60 -0
- package/dist/trace-ai/exp/exp-store/exp-registry.d.ts +6 -0
- package/dist/trace-ai/exp/exp-store/exp-registry.js +41 -0
- package/dist/trace-ai/exp/exp-store/index.d.ts +46 -0
- package/dist/trace-ai/exp/exp-store/index.js +59 -0
- package/dist/trace-ai/exp/exp-store/lock.d.ts +3 -0
- package/dist/trace-ai/exp/exp-store/lock.js +73 -0
- package/dist/trace-ai/exp/exp-store/mission-md.d.ts +3 -0
- package/dist/trace-ai/exp/exp-store/mission-md.js +37 -0
- package/dist/trace-ai/exp/exp-store/readme-template.d.ts +5 -0
- package/dist/trace-ai/exp/exp-store/readme-template.js +25 -0
- package/dist/trace-ai/exp/exp-store/round-yaml.d.ts +3 -0
- package/dist/trace-ai/exp/exp-store/round-yaml.js +33 -0
- package/dist/trace-ai/exp/index.d.ts +8 -0
- package/dist/trace-ai/exp/index.js +238 -0
- package/dist/trace-ai/exp/info.d.ts +35 -0
- package/dist/trace-ai/exp/info.js +120 -0
- package/dist/trace-ai/exp/patch/agent-config.d.ts +1 -0
- package/dist/trace-ai/exp/patch/agent-config.js +26 -0
- package/dist/trace-ai/exp/patch/index.d.ts +2 -0
- package/dist/trace-ai/exp/patch/index.js +13 -0
- package/dist/trace-ai/exp/patch/skill.d.ts +1 -0
- package/dist/trace-ai/exp/patch/skill.js +24 -0
- package/dist/trace-ai/exp/providers/synthesizer-client.d.ts +14 -0
- package/dist/trace-ai/exp/providers/synthesizer-client.js +39 -0
- package/dist/trace-ai/exp/providers/triage-client.d.ts +19 -0
- package/dist/trace-ai/exp/providers/triage-client.js +51 -0
- package/dist/trace-ai/exp/schemas.d.ts +147 -0
- package/dist/trace-ai/exp/schemas.js +50 -0
- package/dist/trace-ai/exp/scoring.d.ts +2 -0
- package/dist/trace-ai/exp/scoring.js +46 -0
- package/dist/trace-ai/scan/aggregator.d.ts +20 -0
- package/dist/trace-ai/scan/aggregator.js +26 -0
- package/dist/trace-ai/scan/artifacts/paths.d.ts +12 -0
- package/dist/trace-ai/scan/artifacts/paths.js +18 -0
- package/dist/trace-ai/scan/artifacts/writer.d.ts +67 -0
- package/dist/trace-ai/scan/artifacts/writer.js +96 -0
- package/dist/trace-ai/scan/batched-rubric.d.ts +55 -0
- package/dist/trace-ai/scan/batched-rubric.js +159 -0
- package/dist/trace-ai/scan/cross-trace-synthesizer.d.ts +24 -0
- package/dist/trace-ai/scan/cross-trace-synthesizer.js +93 -0
- package/dist/trace-ai/scan/index.d.ts +31 -0
- package/dist/trace-ai/scan/index.js +390 -0
- package/dist/trace-ai/scan/prompts/builtin/cross-trace-synthesizer-v1.prompt.md +44 -0
- package/dist/trace-ai/scan/prompts/builtin/rubric-judge-batch-v1.prompt.md +44 -0
- package/dist/trace-ai/scan/runner.d.ts +25 -0
- package/dist/trace-ai/scan/runner.js +42 -0
- package/dist/trace-ai/scan/sampler.d.ts +18 -0
- package/dist/trace-ai/scan/sampler.js +81 -0
- package/dist/trace-ai/scan/scan-summary-markdown.d.ts +2 -0
- package/dist/trace-ai/scan/scan-summary-markdown.js +71 -0
- package/dist/trace-ai/scan/scan-summary-schema.d.ts +73 -0
- package/dist/trace-ai/scan/scan-summary-schema.js +61 -0
- package/dist/trace-ai/scan/single-agent-validator.d.ts +23 -0
- package/dist/trace-ai/scan/single-agent-validator.js +42 -0
- package/dist/trace-ai/scan/traces-list-parser.d.ts +15 -0
- package/dist/trace-ai/scan/traces-list-parser.js +46 -0
- package/package.json +14 -4
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* M5 eval-set output writer — handles directory layout, index upsert, shard
|
|
3
|
+
* merge, on-conflict resolution (fail / skip / overwrite), and .bak preservation.
|
|
4
|
+
*
|
|
5
|
+
* MVP layout: always one shard named `cases.yaml`. Users can manually split
|
|
6
|
+
* into multi-shard later (re-write `index.yaml` to reference more shards)
|
|
7
|
+
* and call `kweaver trace schema validate` to verify.
|
|
8
|
+
*/
|
|
9
|
+
import { readFile, writeFile, copyFile, mkdir } from "node:fs/promises";
|
|
10
|
+
import path from "node:path";
|
|
11
|
+
import yaml from "js-yaml";
|
|
12
|
+
import { EvalSetIndexSchema, EvalSetShardSchema } from "./schemas.js";
|
|
13
|
+
/**
 * Error type raised by the eval-set output writer.
 *
 * `conflictIds` holds whatever the throw site passed as the second
 * constructor argument: the offending `query_id` values for duplicate /
 * conflict failures, `undefined` when the throw site passes nothing
 * (e.g. schema-validation failures).
 */
export class WriterError extends Error {
    conflictIds;
    name = "WriterError";
    constructor(message, conflictIds) {
        super(message);
        this.conflictIds = conflictIds;
    }
}
|
|
21
|
+
// File name of the single shard the writer maintains inside the output directory.
const SHARD_NAME = "cases.yaml";
// File name of the index document written next to the shard.
const INDEX_NAME = "index.yaml";
|
|
23
|
+
/**
 * Probe whether `p` can be read.
 *
 * Resolves to `true` when `readFile(p)` succeeds and to `false` when it
 * rejects for any reason; the cause of the failure is not inspected.
 */
async function fileExists(p) {
    return readFile(p).then(() => true, () => false);
}
|
|
32
|
+
/**
 * Load the cases stored in an existing shard file.
 *
 * @param shardPath path of the shard yaml file.
 * @returns the validated `cases` array, or `[]` when no file exists at `shardPath`.
 * @throws {WriterError} when the file exists but cannot be read, is not
 *   parseable yaml, or does not satisfy `EvalSetShardSchema`.
 */
async function readShardCases(shardPath) {
    let raw;
    try {
        raw = await readFile(shardPath, "utf8");
    }
    catch (e) {
        // Only "no such file" means "no existing cases". Any other read failure
        // (permissions, path is a directory, ...) must surface: treating it as an
        // empty shard would let the caller overwrite data it never saw.
        if (e && e.code === "ENOENT")
            return [];
        throw new WriterError(`failed to read existing shard at ${shardPath}: ${e?.message ?? e}`);
    }
    let parsed;
    try {
        parsed = yaml.load(raw);
    }
    catch (e) {
        throw new WriterError(`existing shard at ${shardPath} is not valid yaml: ${e?.message ?? e}`);
    }
    const r = EvalSetShardSchema.safeParse(parsed);
    if (!r.success) {
        throw new WriterError(`existing shard at ${shardPath} fails schema validation: ${r.error.issues[0].message}`);
    }
    return r.data.cases;
}
|
|
43
|
+
/**
 * Merge `newCases` into the eval-set stored under `outDir` and (re)write the
 * shard (`cases.yaml`) and the index (`index.yaml`).
 *
 * @param opts.outDir     output directory; created recursively if missing.
 * @param opts.evalSetId  `eval_set_id` used only when a new index is created.
 * @param opts.newCases   incoming cases; each is identified by `query_id`.
 * @param opts.onConflict what to do when an incoming `query_id` already exists
 *   in the shard: `"skip"` keeps the existing case, `"overwrite"` replaces it
 *   (copying the old shard to `cases.yaml.bak` first), anything else —
 *   including `"fail"` and an omitted value — rejects the whole write.
 * @returns `{ cases_written, cases_skipped, conflicts, shard_paths }`.
 * @throws {WriterError} on duplicate ids inside the batch, on conflicts under
 *   the fail strategy, or when an existing/merged document is invalid.
 */
export async function writeEvalSet(opts) {
    const { outDir, evalSetId, newCases, onConflict } = opts;
    // intra-batch duplicate detection (each offending id is reported once)
    const seenInBatch = new Set();
    const dupInBatch = new Set();
    for (const c of newCases) {
        if (seenInBatch.has(c.query_id))
            dupInBatch.add(c.query_id);
        seenInBatch.add(c.query_id);
    }
    if (dupInBatch.size > 0) {
        const dupIds = [...dupInBatch];
        throw new WriterError(`intra-batch duplicate query_id(s): ${dupIds.join(", ")}`, dupIds);
    }
    await mkdir(outDir, { recursive: true });
    const shardPath = path.join(outDir, SHARD_NAME);
    const existingCases = await readShardCases(shardPath);
    const existingIds = new Set(existingCases.map((c) => c.query_id));
    const incomingByConflict = newCases.filter((c) => existingIds.has(c.query_id));
    const incomingFresh = newCases.filter((c) => !existingIds.has(c.query_id));
    const conflictIds = incomingByConflict.map((c) => c.query_id);
    // Only the two explicit opt-ins may proceed past a conflict. An unknown or
    // missing strategy is treated as "fail" so conflicting cases are never
    // dropped silently.
    const mayProceedOnConflict = onConflict === "skip" || onConflict === "overwrite";
    if (conflictIds.length > 0 && !mayProceedOnConflict) {
        throw new WriterError(`query_id conflict(s): ${conflictIds.join(", ")}`, conflictIds);
    }
    let mergedCases;
    let casesWritten;
    let casesSkipped;
    if (onConflict === "overwrite") {
        // Preserve the previous shard before replacing any of its cases.
        if (conflictIds.length > 0 && (await fileExists(shardPath))) {
            await copyFile(shardPath, shardPath + ".bak");
        }
        const overwriteIds = new Set(conflictIds);
        const kept = existingCases.filter((c) => !overwriteIds.has(c.query_id));
        mergedCases = [...kept, ...incomingFresh, ...incomingByConflict];
        casesWritten = incomingFresh.length + incomingByConflict.length;
        casesSkipped = 0;
    }
    else {
        // "skip": conflicting cases are left out and counted as skipped.
        // "fail": there are no conflicts here (the check above would have thrown),
        // so the skipped count is 0.
        mergedCases = [...existingCases, ...incomingFresh];
        casesWritten = incomingFresh.length;
        casesSkipped = incomingByConflict.length;
    }
    const shardDoc = {
        schema_version: "trace-eval-set/v1",
        cases: mergedCases,
    };
    // Validate before touching the disk so an invalid merge never replaces a valid shard.
    const shardCheck = EvalSetShardSchema.safeParse(shardDoc);
    if (!shardCheck.success) {
        throw new WriterError(`merged shard fails schema validation: ${shardCheck.error.issues[0].message}`);
    }
    await writeFile(shardPath, yaml.dump(shardDoc, { lineWidth: 120, noRefs: true }), "utf8");
    const indexPath = path.join(outDir, INDEX_NAME);
    let indexDoc;
    if (await fileExists(indexPath)) {
        const raw = await readFile(indexPath, "utf8");
        let parsed;
        try {
            parsed = yaml.load(raw);
        }
        catch (e) {
            throw new WriterError(`existing index.yaml is not valid yaml: ${e?.message ?? e}`);
        }
        const r = EvalSetIndexSchema.safeParse(parsed);
        if (!r.success) {
            throw new WriterError(`existing index.yaml fails schema validation: ${r.error.issues[0].message}`);
        }
        indexDoc = r.data;
        // Upsert: register the shard only if the index does not list it yet.
        if (!indexDoc.shards.some((s) => s.path === SHARD_NAME)) {
            indexDoc.shards.push({ path: SHARD_NAME });
        }
    }
    else {
        indexDoc = {
            schema_version: "trace-eval-set-index/v1",
            eval_set_id: evalSetId,
            shards: [{ path: SHARD_NAME }],
        };
    }
    await writeFile(indexPath, yaml.dump(indexDoc, { lineWidth: 120, noRefs: true }), "utf8");
    return {
        cases_written: casesWritten,
        cases_skipped: casesSkipped,
        conflicts: conflictIds,
        shard_paths: [shardPath],
    };
}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* M5 eval-set query picker — two lift functions:
|
|
3
|
+
* - liftFromQueriesFile(path) reads `trace-eval-set-input/v1` simplified yaml
|
|
4
|
+
* - liftFromDiagnosis(dir) reads M4 diagnose report yamls (added in Task 4)
|
|
5
|
+
*
|
|
6
|
+
 * liftFromQueriesFile resolves to EvalCase[]; liftFromDiagnosis resolves to an object whose
 * `cases` field is EvalCase[]. Neither auto-fills query_id — that happens in builder.ts.
|
|
7
|
+
*/
|
|
8
|
+
import type { EvalCase } from "./types.js";
|
|
9
|
+
/**
 * Error thrown by the query-picker lift functions.
 */
export declare class QueryPickerError extends Error {
    /** File or directory path the failure relates to, when one was supplied. */
    readonly path?: string | undefined;
    constructor(message: string, path?: string | undefined);
}
|
|
13
|
+
/**
 * Read `filePath` as yaml, validate it as `trace-eval-set-input/v1`, and map
 * its `cases` to EvalCases. A case without `query_id` is lifted with
 * `query_id: ""`.
 *
 * Rejects with QueryPickerError when the file cannot be read, is not valid
 * yaml, or fails schema validation.
 */
export declare function liftFromQueriesFile(filePath: string): Promise<EvalCase[]>;
|
|
14
|
+
/** Result of `liftFromDiagnosis`. */
export interface LiftFromDiagnosisResult {
    /** Cases lifted from the diagnose reports. */
    cases: EvalCase[];
    /** Number of findings that were skipped instead of lifted. */
    skipped_findings_count: number;
    /** One human-readable line per skipped finding (report file name, rule id, reason). */
    skipped_findings_summary: string[];
}
|
|
19
|
+
/**
 * Read all *.yaml / *.yml files in `dirPath`, validate each as `trace-diagnose-report/v1`,
 * and extract `findings[*].verify_with.suggested_eval_case` as EvalCases.
 *
 * Skips findings where:
 *   - `suggested_eval_case.query` is null (M4 has no user query → can't construct EvalCase.input)
 *   - `suggested_eval_case.assertions` is empty (refinement would fail; no reference either)
 *
 * Lifts:
 *   - `EvalCase.input.user_message = suggested_eval_case.query`
 *   - `EvalCase.query_id = suggested_eval_case.query_id ?? ""` (empty → builder.ensureQueryId fills)
 *   - `EvalCase.assertions` = M4 string templates wrapped as placeholder `contains` assertions
 *     with `_note` flagging "convert to structured manually"
 *   - `EvalCase.reference = undefined` (M4 doesn't emit reference)
 *
 * Findings that resolve to the same `query_id` key are collapsed into a single
 * case whose assertions are concatenated.
 *
 * Files that fail to schema-validate cause a fail-fast error (all *.yaml in dir
 * must be diagnose reports — picker doesn't filter by content).
 *
 * Rejects with QueryPickerError when the directory or a report file cannot be
 * read, or when a report is not valid yaml / fails schema validation.
 */
export declare function liftFromDiagnosis(dirPath: string): Promise<LiftFromDiagnosisResult>;
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* M5 eval-set query picker — two lift functions:
|
|
3
|
+
* - liftFromQueriesFile(path) reads `trace-eval-set-input/v1` simplified yaml
|
|
4
|
+
* - liftFromDiagnosis(dir) reads M4 diagnose report yamls (added in Task 4)
|
|
5
|
+
*
|
|
6
|
+
 * liftFromQueriesFile resolves to EvalCase[]; liftFromDiagnosis resolves to an object whose
 * `cases` field is EvalCase[]. Neither auto-fills query_id — that happens in builder.ts.
|
|
7
|
+
*/
|
|
8
|
+
import { readFile, readdir } from "node:fs/promises";
|
|
9
|
+
import path from "node:path";
|
|
10
|
+
import yaml from "js-yaml";
|
|
11
|
+
import { EvalSetInputSchema } from "./schemas.js";
|
|
12
|
+
import { ReportSchema } from "../diagnose/schemas.js";
|
|
13
|
+
/**
 * Error type raised by the query-picker lift functions.
 *
 * `path` holds the second constructor argument — the file or directory the
 * failure relates to — or `undefined` when none was given.
 */
export class QueryPickerError extends Error {
    path;
    name = "QueryPickerError";
    constructor(message, path) {
        super(message);
        this.path = path;
    }
}
|
|
21
|
+
/**
 * Lift EvalCases from a `trace-eval-set-input/v1` yaml file.
 *
 * Reads `filePath`, parses it as yaml, validates it with `EvalSetInputSchema`
 * and copies `query_id`, `input`, `reference`, `assertions` and `tags` of each
 * case. Every failure is reported as a QueryPickerError carrying `filePath`.
 */
export async function liftFromQueriesFile(filePath) {
    const fail = (message) => new QueryPickerError(message, filePath);
    const raw = await readFile(filePath, "utf8").catch((e) => {
        const err = e;
        // A missing file gets its own, shorter message.
        throw err.code === "ENOENT"
            ? fail(`file not found: ${filePath}`)
            : fail(`failed to read ${filePath}: ${err.message}`);
    });
    let doc;
    try {
        doc = yaml.load(raw);
    }
    catch (e) {
        throw fail(`failed to parse yaml ${filePath}: ${e.message}`);
    }
    const checked = EvalSetInputSchema.safeParse(doc);
    if (!checked.success) {
        // Only the first schema issue is reported.
        const issue = checked.error.issues[0];
        const where = issue.path.join(".");
        throw fail(`schema validation failed for ${filePath} at '${where}': ${issue.message}`);
    }
    const lifted = [];
    for (const c of checked.data.cases) {
        lifted.push({
            // empty → builder.ensureQueryId fills it; undefined would break downstream
            query_id: c.query_id ?? "",
            input: c.input,
            reference: c.reference,
            assertions: c.assertions,
            tags: c.tags,
        });
    }
    return lifted;
}
|
|
54
|
+
/**
 * Read all *.yaml / *.yml files in `dirPath`, validate each as `trace-diagnose-report/v1`,
 * and extract `findings[*].verify_with.suggested_eval_case` as EvalCases.
 *
 * Files are processed in sorted name order so the resulting case order does
 * not depend on the directory listing order of the platform.
 *
 * Skips findings where:
 *   - `suggested_eval_case.query` is null (M4 has no user query → can't construct EvalCase.input)
 *   - `suggested_eval_case.assertions` is empty (refinement would fail; no reference either)
 *
 * Lifts:
 *   - `EvalCase.input.user_message = suggested_eval_case.query`
 *   - `EvalCase.query_id = suggested_eval_case.query_id ?? ""` (empty → builder.ensureQueryId fills)
 *   - `EvalCase.assertions` = M4 string templates wrapped as placeholder `contains` assertions
 *     with `_note` flagging "convert to structured manually"
 *   - `EvalCase.reference = undefined` (M4 doesn't emit reference)
 *
 * Merging: findings with the same non-empty `query_id` collapse into one case
 * with concatenated assertions. Findings WITHOUT a `query_id` are merged only
 * when their query text is identical, so unrelated queries never end up in the
 * same case.
 *
 * Files that fail to schema-validate cause a fail-fast error (all *.yaml in dir
 * must be diagnose reports — picker doesn't filter by content).
 *
 * @returns `{ cases, skipped_findings_count, skipped_findings_summary }`.
 * @throws {QueryPickerError} when the directory or a file cannot be read, or a
 *   file is not valid yaml / fails `ReportSchema`.
 */
export async function liftFromDiagnosis(dirPath) {
    let entries;
    try {
        entries = await readdir(dirPath);
    }
    catch (e) {
        const err = e;
        if (err.code === "ENOENT") {
            throw new QueryPickerError(`directory not found: ${dirPath}`, dirPath);
        }
        throw new QueryPickerError(`failed to read directory ${dirPath}: ${err.message}`, dirPath);
    }
    const yamlFiles = entries
        .filter((e) => e.endsWith(".yaml") || e.endsWith(".yml"))
        .sort()
        .map((e) => path.join(dirPath, e));
    // Accumulate by merge key so multiple findings for the same query collapse
    // into one case with merged assertions (avoids intra-batch dup error).
    const byMergeKey = new Map();
    let skipped = 0;
    const skippedSummary = [];
    for (const file of yamlFiles) {
        let raw;
        try {
            raw = await readFile(file, "utf8");
        }
        catch (e) {
            throw new QueryPickerError(`failed to read ${file}: ${e.message}`, file);
        }
        let parsed;
        try {
            parsed = yaml.load(raw);
        }
        catch (e) {
            throw new QueryPickerError(`failed to parse yaml ${file}: ${e.message}`, file);
        }
        const result = ReportSchema.safeParse(parsed);
        if (!result.success) {
            const firstIssue = result.error.issues[0];
            const where = firstIssue.path.join(".");
            throw new QueryPickerError(`schema validation failed for ${file} at '${where}': ${firstIssue.message}`, file);
        }
        for (const finding of result.data.findings) {
            const sec = finding.verify_with.suggested_eval_case;
            if (sec.query === null) {
                skipped += 1;
                skippedSummary.push(`${path.basename(file)}: rule=${finding.rule_id} (query=null; upgrade M4 trace to populate input.messages)`);
                continue;
            }
            if (sec.assertions.length === 0) {
                skipped += 1;
                skippedSummary.push(`${path.basename(file)}: rule=${finding.rule_id} (empty assertions; refinement would fail)`);
                continue;
            }
            const placeholderAssertions = sec.assertions.map((t) => ({
                type: "contains",
                value: t,
                _note: "auto-lifted from M4 assertion template; convert to structured assertion manually",
            }));
            const queryId = sec.query_id ?? "";
            // A shared "" key would merge every id-less finding into the first one
            // and lose the other queries. Key id-less findings by their query text
            // instead; the prefixes keep ids and query texts from colliding.
            const mergeKey = queryId !== "" ? `id:${queryId}` : `query:${sec.query}`;
            const existing = byMergeKey.get(mergeKey);
            if (existing) {
                existing.assertions = [...(existing.assertions ?? []), ...placeholderAssertions];
            }
            else {
                byMergeKey.set(mergeKey, {
                    query_id: queryId,
                    input: { user_message: sec.query },
                    reference: undefined,
                    assertions: placeholderAssertions,
                    tags: undefined,
                });
            }
        }
    }
    return { cases: [...byMergeKey.values()], skipped_findings_count: skipped, skipped_findings_summary: skippedSummary };
}
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* M5 eval-set redactor — PII pattern matching + replacement.
|
|
3
|
+
*
|
|
4
|
+
* Three rule sources, in priority order (chain):
|
|
5
|
+
* 1. --redaction-rules=<path> (CLI flag, highest)
|
|
6
|
+
* 2. <repo>/redaction-rules/*.yaml (repo-local)
|
|
7
|
+
* 3. BUILTIN_RULES (5 low-fidelity defaults)
|
|
8
|
+
*
|
|
9
|
+
* Builtin rules cover common Chinese-context PII: phone / email / id_card /
|
|
10
|
+
* bank_card / ip. Organizations write more rules in <repo>/redaction-rules/
|
|
11
|
+
* for their business-specific patterns.
|
|
12
|
+
*
|
|
13
|
+
* Rule yaml format:
|
|
14
|
+
* rules:
|
|
15
|
+
* - name: <id>
|
|
16
|
+
* pattern: <regex source string>
|
|
17
|
+
* replace: <replacement template; supports {hash6} placeholder>
|
|
18
|
+
*
|
|
19
|
+
* Malformed regex causes loadRules to throw RedactorError (no silent fallback).
|
|
20
|
+
*/
|
|
21
|
+
import type { RedactionRule } from "./types.js";
|
|
22
|
+
/**
 * Error thrown while loading or compiling redaction rules.
 */
export declare class RedactorError extends Error {
    /** Rule file the failure relates to, when one was supplied. */
    readonly path?: string | undefined;
    constructor(message: string, path?: string | undefined);
}
|
|
26
|
+
/**
 * 5 builtin low-fidelity PII patterns. Tuned for Chinese-context defaults;
 * organizations override with their own rules in <repo>/redaction-rules/.
 *
 * This is the rule set `loadRules` returns when neither a CLI rule file nor a
 * repo rule directory supplies rules.
 */
export declare const BUILTIN_RULES: RedactionRule[];
|
|
31
|
+
/** Inputs that tell `loadRules` where to look for redaction rules. */
export interface LoadRulesOpts {
    /** From `--redaction-rules=<path>`; highest priority */
    cliFlag: string | undefined;
    /** From `<repo>/redaction-rules/` (resolved by caller — usually `path.join(repoRoot, "redaction-rules")`) */
    repoDir: string | undefined;
}
|
|
37
|
+
/** Outcome of `loadRules`. */
export interface LoadRulesResult {
    /** The rules to apply. */
    rules: RedactionRule[];
    /** Which of the three sources supplied `rules`. */
    source: "cli-flag" | "repo" | "builtin";
}
|
|
41
|
+
/**
 * Resolve the redaction rules from the first source that applies:
 *   1. `opts.cliFlag` — a single rule file;
 *   2. `opts.repoDir` — every *.yaml / *.yml file in that directory, when it
 *      is a directory containing at least one such file;
 *   3. `BUILTIN_RULES`.
 *
 * Sources are not merged. Rejects with RedactorError when a selected rule file
 * cannot be read or parsed, lacks a top-level `rules` list, or contains an
 * invalid regex.
 */
export declare function loadRules(opts: LoadRulesOpts): Promise<LoadRulesResult>;
|
|
42
|
+
/**
 * Apply `rules` to `text` one after another, in array order, and return the
 * redacted text. Each match is replaced by the rule's `replace` template, in
 * which the `{hash6}` placeholder is substituted with the first 6 hex
 * characters of the SHA-256 digest of the matched text.
 */
export declare function applyRules(text: string, rules: RedactionRule[]): string;
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* M5 eval-set redactor — PII pattern matching + replacement.
|
|
3
|
+
*
|
|
4
|
+
* Three rule sources, in priority order (chain):
|
|
5
|
+
* 1. --redaction-rules=<path> (CLI flag, highest)
|
|
6
|
+
* 2. <repo>/redaction-rules/*.yaml (repo-local)
|
|
7
|
+
* 3. BUILTIN_RULES (5 low-fidelity defaults)
|
|
8
|
+
*
|
|
9
|
+
* Builtin rules cover common Chinese-context PII: phone / email / id_card /
|
|
10
|
+
* bank_card / ip. Organizations write more rules in <repo>/redaction-rules/
|
|
11
|
+
* for their business-specific patterns.
|
|
12
|
+
*
|
|
13
|
+
* Rule yaml format:
|
|
14
|
+
* rules:
|
|
15
|
+
* - name: <id>
|
|
16
|
+
* pattern: <regex source string>
|
|
17
|
+
* replace: <replacement template; supports {hash6} placeholder>
|
|
18
|
+
*
|
|
19
|
+
* Malformed regex causes loadRules to throw RedactorError (no silent fallback).
|
|
20
|
+
*/
|
|
21
|
+
import { readFile, readdir, stat } from "node:fs/promises";
|
|
22
|
+
import path from "node:path";
|
|
23
|
+
import { createHash } from "node:crypto";
|
|
24
|
+
import yaml from "js-yaml";
|
|
25
|
+
/**
 * Error type raised while loading or compiling redaction rules.
 *
 * `path` holds the second constructor argument — the rule file the failure
 * relates to — or `undefined` when none was given.
 */
export class RedactorError extends Error {
    path;
    name = "RedactorError";
    constructor(message, path) {
        super(message);
        this.path = path;
    }
}
|
|
33
|
+
/**
 * 5 builtin low-fidelity PII patterns. Tuned for Chinese-context defaults;
 * organizations override with their own rules in <repo>/redaction-rules/.
 *
 * Order matters, because rules are applied one after another to the running
 * text. Rules that match a longer span come first: the `phone` pattern has no
 * boundaries, so if it ran first it would match 11 digits INSIDE an ID-card or
 * bank-card number (or the local part of an e-mail address) and leave the rest
 * of the value exposed and mislabelled.
 */
export const BUILTIN_RULES = [
    {
        name: "email",
        pattern: /[\w.+-]+@[\w.-]+\.\w+/g,
        replace: "<email:{hash6}>",
    },
    {
        name: "id_card",
        pattern: /\b\d{17}[\dXx]\b/g,
        replace: "<id_card:{hash6}>",
    },
    {
        name: "bank_card",
        pattern: /\b\d{15,19}\b/g, // bank card numbers are 15-19 digits long
        replace: "<bank_card:{hash6}>",
    },
    {
        name: "phone",
        pattern: /1[3-9]\d{9}/g,
        replace: "<phone:{hash6}>",
    },
    {
        name: "ip",
        pattern: /\b(?:\d{1,3}\.){3}\d{1,3}\b/g,
        replace: "<ip:{hash6}>",
    },
];
|
|
64
|
+
/**
 * Turn one raw rule entry from a rule yaml file into a compiled rule.
 *
 * @param entry   raw entry; expected shape `{ name, pattern, replace }` where
 *   `pattern` is a regex source string and `replace` the replacement template.
 * @param srcPath rule file the entry came from (used in error messages).
 * @returns `{ name, pattern, replace }` with `pattern` compiled as a global RegExp.
 * @throws {RedactorError} when the entry is not an object, `pattern` is not a
 *   non-empty string, `replace` is not a string, or the regex does not compile.
 */
function compileRule(entry, srcPath) {
    if (entry === null || typeof entry !== "object") {
        throw new RedactorError(`invalid rule entry at ${srcPath}: expected a mapping with name / pattern / replace`, srcPath);
    }
    // `new RegExp(undefined)` and `new RegExp("")` both compile to a pattern that
    // matches the empty string at every position, so a forgotten `pattern` key
    // must be rejected here rather than silently accepted.
    if (typeof entry.pattern !== "string" || entry.pattern === "") {
        throw new RedactorError(`rule '${entry.name}' at ${srcPath} must have a non-empty string 'pattern'`, srcPath);
    }
    if (typeof entry.replace !== "string") {
        throw new RedactorError(`rule '${entry.name}' at ${srcPath} must have a string 'replace'`, srcPath);
    }
    let pattern;
    try {
        pattern = new RegExp(entry.pattern, "g");
    }
    catch (e) {
        throw new RedactorError(`invalid regex in rule '${entry.name}' at ${srcPath}: ${e.message}`, srcPath);
    }
    return { name: entry.name, pattern, replace: entry.replace };
}
|
|
74
|
+
/**
 * Load and compile every rule of one rule yaml file.
 *
 * The document must be a mapping with a top-level `rules` list; each entry is
 * handed to `compileRule`. Read, parse and shape failures are raised as
 * RedactorError carrying `filePath`.
 */
async function readRulesFile(filePath) {
    const text = await readFile(filePath, "utf8").catch((e) => {
        throw new RedactorError(`failed to read rule file ${filePath}: ${e.message}`, filePath);
    });
    let doc;
    try {
        doc = yaml.load(text);
    }
    catch (e) {
        throw new RedactorError(`failed to parse yaml ${filePath}: ${e.message}`, filePath);
    }
    const hasRuleList = Boolean(doc) && Array.isArray(doc.rules);
    if (!hasRuleList) {
        throw new RedactorError(`rule file ${filePath} must have top-level 'rules: []'`, filePath);
    }
    const compileEntry = (entry) => compileRule(entry, filePath);
    return doc.rules.map(compileEntry);
}
|
|
95
|
+
/**
 * Resolve the redaction rules from the first source that applies.
 *
 *   1. `opts.cliFlag` — a single rule file (`source: "cli-flag"`).
 *   2. `opts.repoDir` — every *.yaml / *.yml file of that directory, loaded in
 *      sorted file-name order, provided it is a directory holding at least one
 *      such file (`source: "repo"`).
 *   3. `BUILTIN_RULES` (`source: "builtin"`).
 *
 * Sources are not merged. Errors from reading or compiling a selected rule
 * file propagate as RedactorError.
 *
 * @returns `{ rules, source }`.
 */
export async function loadRules(opts) {
    if (opts.cliFlag) {
        const rules = await readRulesFile(opts.cliFlag);
        return { rules, source: "cli-flag" };
    }
    if (opts.repoDir) {
        let stats;
        try {
            stats = await stat(opts.repoDir);
        }
        catch {
            // An absent or unreadable repo dir simply means "no repo rules".
            stats = null;
        }
        if (stats && stats.isDirectory()) {
            const entries = await readdir(opts.repoDir);
            // Rules are applied sequentially, so their order affects the output.
            // Sorting makes that order independent of the directory listing order.
            const yamlFiles = entries
                .filter((e) => e.endsWith(".yaml") || e.endsWith(".yml"))
                .sort()
                .map((e) => path.join(opts.repoDir, e));
            if (yamlFiles.length > 0) {
                const allRules = [];
                for (const f of yamlFiles) {
                    allRules.push(...(await readRulesFile(f)));
                }
                return { rules: allRules, source: "repo" };
            }
        }
    }
    return { rules: BUILTIN_RULES, source: "builtin" };
}
|
|
124
|
+
/**
 * Short fingerprint of a value: the first six hex characters of its SHA-256 digest.
 *
 * @param {string} input  value to fingerprint
 * @returns {string} six lowercase hexadecimal characters
 */
function hash6(input) {
    const hasher = createHash("sha256");
    hasher.update(input);
    const hexDigest = hasher.digest("hex");
    return hexDigest.substring(0, 6);
}
|
|
127
|
+
/**
 * Apply redaction rules to a string, one rule after another.
 *
 * Each match of `rule.pattern` is replaced by `rule.replace`, in which every
 * `{hash6}` placeholder is substituted with `hash6(match)`. Rules run in array
 * order on the output of the previous rule.
 *
 * @param {string} text  input text
 * @param {Array<{pattern: RegExp, replace: string}>} rules  compiled rules
 * @returns {string} the text with all rules applied
 */
export function applyRules(text, rules) {
    let out = text;
    for (const rule of rules) {
        // split/join substitutes ALL placeholders; `String.prototype.replace` with a string
        // pattern would only touch the first one and leave later `{hash6}` literals behind.
        // The outer replacer is a function, so its return value is inserted verbatim
        // (no `$&` / `$1` expansion).
        out = out.replace(rule.pattern, (match) => rule.replace.split("{hash6}").join(hash6(match)));
    }
    return out;
}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
You are evaluating whether a candidate answer is semantically equivalent to a reference answer for a knowledge-graph Q&A system.
|
|
2
|
+
|
|
3
|
+
Question: {{question}}
|
|
4
|
+
|
|
5
|
+
Reference answer (ground truth):
|
|
6
|
+
{{reference_answer}}
|
|
7
|
+
|
|
8
|
+
Candidate answer (from agent):
|
|
9
|
+
{{candidate_answer}}
|
|
10
|
+
|
|
11
|
+
Judge whether the candidate answer is semantically correct relative to the reference.
|
|
12
|
+
A candidate passes if it conveys the same key facts, even if phrased differently.
|
|
13
|
+
A candidate fails if it omits critical facts, states incorrect facts, or hallucinates information not in the reference.
|
|
14
|
+
Partial answers that cover most key facts but miss minor details should pass.
|
|
15
|
+
|
|
16
|
+
{{language_instruction}}
|
|
17
|
+
|
|
18
|
+
Respond with valid JSON matching this schema exactly:
|
|
19
|
+
{{output_schema}}
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* M5 eval-set zod schemas (PR-A, MVP-B scope).
|
|
3
|
+
*
|
|
4
|
+
* 4 schemas in this file:
|
|
5
|
+
* - EvalSetIndexSchema: trace-eval-set-index/v1 (eval-set dir's index.yaml)
|
|
6
|
+
* - EvalSetShardSchema: trace-eval-set/v1 (final shard yaml file)
|
|
7
|
+
* - EvalSetInputSchema: trace-eval-set-input/v1 (--queries simplified input)
|
|
8
|
+
* - TestReportSchema: trace-test-report/v1 (test report; PR-A defines schema only;
|
|
9
|
+
* PR-B consumer)
|
|
10
|
+
*
|
|
11
|
+
* EvalSetShardSchema and EvalSetInputSchema share the same refinement:
|
|
12
|
+
* "for each case, at least one of {reference, non-empty assertions} must be present."
|
|
13
|
+
*
|
|
14
|
+
* The D5 builtin rubric `answer-match-reference` output schema is NOT here —
|
|
15
|
+
* it belongs to the rubric template definition (per spec doc §4.1).
|
|
16
|
+
*/
|
|
17
|
+
import { z } from "zod";
|
|
18
|
+
/**
 * Zod schema for an eval-set index document (`schema_version` literal
 * "trace-eval-set-index/v1").
 *
 * Declared shape: `eval_set_id` (string) and `shards`, an array of objects with a
 * `path` (string) and an optional `role` that is one of "seed", "regression" or
 * "holdout".
 */
export declare const EvalSetIndexSchema: z.ZodObject<{
|
|
19
|
+
schema_version: z.ZodLiteral<"trace-eval-set-index/v1">;
|
|
20
|
+
eval_set_id: z.ZodString;
|
|
21
|
+
shards: z.ZodArray<z.ZodObject<{
|
|
22
|
+
path: z.ZodString;
|
|
23
|
+
role: z.ZodOptional<z.ZodEnum<{
|
|
24
|
+
seed: "seed";
|
|
25
|
+
regression: "regression";
|
|
26
|
+
holdout: "holdout";
|
|
27
|
+
}>>;
|
|
28
|
+
}, z.core.$strip>>;
|
|
29
|
+
}, z.core.$strip>;
|
|
30
|
+
/**
 * Zod schema for a final eval-set shard (`schema_version` literal "trace-eval-set/v1").
 *
 * Each entry of `cases` requires `query_id` and `input.user_message`; `reference`
 * (`{ answer }`), `assertions` and `tags` are optional. An assertion object must
 * carry a `type` from the enum below and is declared `$loose` while every other
 * object here is `$strip`.
 * NOTE(review): presumably `$loose` lets assertion-specific keys pass through
 * validation — confirm against the zod documentation.
 * NOTE(review): the file header describes an "at least one of reference /
 * non-empty assertions" refinement per case; it is not visible in this declared type.
 */
export declare const EvalSetShardSchema: z.ZodObject<{
|
|
31
|
+
schema_version: z.ZodLiteral<"trace-eval-set/v1">;
|
|
32
|
+
cases: z.ZodArray<z.ZodObject<{
|
|
33
|
+
query_id: z.ZodString;
|
|
34
|
+
input: z.ZodObject<{
|
|
35
|
+
user_message: z.ZodString;
|
|
36
|
+
}, z.core.$strip>;
|
|
37
|
+
reference: z.ZodOptional<z.ZodObject<{
|
|
38
|
+
answer: z.ZodString;
|
|
39
|
+
}, z.core.$strip>>;
|
|
40
|
+
assertions: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
41
|
+
type: z.ZodEnum<{
|
|
42
|
+
contains: "contains";
|
|
43
|
+
not_contains: "not_contains";
|
|
44
|
+
regex: "regex";
|
|
45
|
+
tool_call_count: "tool_call_count";
|
|
46
|
+
tool_call_order: "tool_call_order";
|
|
47
|
+
semantic_match: "semantic_match";
|
|
48
|
+
latency_ms: "latency_ms";
|
|
49
|
+
}>;
|
|
50
|
+
}, z.core.$loose>>>;
|
|
51
|
+
tags: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
52
|
+
}, z.core.$strip>>;
|
|
53
|
+
}, z.core.$strip>;
|
|
54
|
+
/**
 * Zod schema for the simplified eval-set input (`schema_version` literal
 * "trace-eval-set-input/v1").
 *
 * Same case fields as the shard schema declared in this file, except that
 * `query_id` is optional here; only `input.user_message` is required per case.
 * Assertion objects must carry a `type` from the enum below and are declared `$loose`.
 * NOTE(review): the file header describes an "at least one of reference /
 * non-empty assertions" refinement per case; it is not visible in this declared type.
 */
export declare const EvalSetInputSchema: z.ZodObject<{
|
|
55
|
+
schema_version: z.ZodLiteral<"trace-eval-set-input/v1">;
|
|
56
|
+
cases: z.ZodArray<z.ZodObject<{
|
|
57
|
+
input: z.ZodObject<{
|
|
58
|
+
user_message: z.ZodString;
|
|
59
|
+
}, z.core.$strip>;
|
|
60
|
+
query_id: z.ZodOptional<z.ZodString>;
|
|
61
|
+
tags: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
62
|
+
reference: z.ZodOptional<z.ZodObject<{
|
|
63
|
+
answer: z.ZodString;
|
|
64
|
+
}, z.core.$strip>>;
|
|
65
|
+
assertions: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
66
|
+
type: z.ZodEnum<{
|
|
67
|
+
contains: "contains";
|
|
68
|
+
not_contains: "not_contains";
|
|
69
|
+
regex: "regex";
|
|
70
|
+
tool_call_count: "tool_call_count";
|
|
71
|
+
tool_call_order: "tool_call_order";
|
|
72
|
+
semantic_match: "semantic_match";
|
|
73
|
+
latency_ms: "latency_ms";
|
|
74
|
+
}>;
|
|
75
|
+
}, z.core.$loose>>>;
|
|
76
|
+
}, z.core.$strip>>;
|
|
77
|
+
}, z.core.$strip>;
|
|
78
|
+
/**
 * Zod schema for a test report (`schema_version` literal "trace-test-report/v1").
 *
 * Three sections are declared:
 *  - `meta`: eval-set location and id, the candidate agent (`agent_id`, optional
 *    `agent_version`), `cli_version`, `ran_at` (string) and `duration_ms` (number).
 *  - `summary`: numeric counters `total` / `pass` / `fail` / `error` / `skip`, plus
 *    `by_assertion_type`, a record from string keys to `{ pass, fail }` counters.
 *  - `cases`: one entry per query with a `status` of "error" | "pass" | "fail" |
 *    "skip", a nullable `conversation_id`, optional `trace_id` / `duration_ms`,
 *    the per-assertion results (the `$loose` assertion object, a `verdict` of
 *    "pass" | "fail" | "skip", and an optional untyped `actual`), and optional
 *    `failure_reason` / `error_code` / `error_message` strings.
 * NOTE(review): `ran_at` is only typed as a string; its format (e.g. ISO-8601) is
 * not established by this declaration — confirm with the producer.
 */
export declare const TestReportSchema: z.ZodObject<{
|
|
79
|
+
schema_version: z.ZodLiteral<"trace-test-report/v1">;
|
|
80
|
+
meta: z.ZodObject<{
|
|
81
|
+
eval_set_dir: z.ZodString;
|
|
82
|
+
eval_set_id: z.ZodString;
|
|
83
|
+
candidate: z.ZodObject<{
|
|
84
|
+
agent_id: z.ZodString;
|
|
85
|
+
agent_version: z.ZodOptional<z.ZodString>;
|
|
86
|
+
}, z.core.$strip>;
|
|
87
|
+
cli_version: z.ZodString;
|
|
88
|
+
ran_at: z.ZodString;
|
|
89
|
+
duration_ms: z.ZodNumber;
|
|
90
|
+
}, z.core.$strip>;
|
|
91
|
+
summary: z.ZodObject<{
|
|
92
|
+
total: z.ZodNumber;
|
|
93
|
+
pass: z.ZodNumber;
|
|
94
|
+
fail: z.ZodNumber;
|
|
95
|
+
error: z.ZodNumber;
|
|
96
|
+
skip: z.ZodNumber;
|
|
97
|
+
by_assertion_type: z.ZodRecord<z.ZodString, z.ZodObject<{
|
|
98
|
+
pass: z.ZodNumber;
|
|
99
|
+
fail: z.ZodNumber;
|
|
100
|
+
}, z.core.$strip>>;
|
|
101
|
+
}, z.core.$strip>;
|
|
102
|
+
cases: z.ZodArray<z.ZodObject<{
|
|
103
|
+
query_id: z.ZodString;
|
|
104
|
+
status: z.ZodEnum<{
|
|
105
|
+
error: "error";
|
|
106
|
+
pass: "pass";
|
|
107
|
+
fail: "fail";
|
|
108
|
+
skip: "skip";
|
|
109
|
+
}>;
|
|
110
|
+
conversation_id: z.ZodNullable<z.ZodString>;
|
|
111
|
+
trace_id: z.ZodOptional<z.ZodNullable<z.ZodString>>;
|
|
112
|
+
duration_ms: z.ZodOptional<z.ZodNumber>;
|
|
113
|
+
assertion_results: z.ZodArray<z.ZodObject<{
|
|
114
|
+
assertion: z.ZodObject<{
|
|
115
|
+
type: z.ZodEnum<{
|
|
116
|
+
contains: "contains";
|
|
117
|
+
not_contains: "not_contains";
|
|
118
|
+
regex: "regex";
|
|
119
|
+
tool_call_count: "tool_call_count";
|
|
120
|
+
tool_call_order: "tool_call_order";
|
|
121
|
+
semantic_match: "semantic_match";
|
|
122
|
+
latency_ms: "latency_ms";
|
|
123
|
+
}>;
|
|
124
|
+
}, z.core.$loose>;
|
|
125
|
+
verdict: z.ZodEnum<{
|
|
126
|
+
pass: "pass";
|
|
127
|
+
fail: "fail";
|
|
128
|
+
skip: "skip";
|
|
129
|
+
}>;
|
|
130
|
+
actual: z.ZodOptional<z.ZodUnknown>;
|
|
131
|
+
}, z.core.$strip>>;
|
|
132
|
+
failure_reason: z.ZodOptional<z.ZodString>;
|
|
133
|
+
error_code: z.ZodOptional<z.ZodString>;
|
|
134
|
+
error_message: z.ZodOptional<z.ZodString>;
|
|
135
|
+
}, z.core.$strip>>;
|
|
136
|
+
}, z.core.$strip>;
|