@kweaver-ai/kweaver-sdk 0.8.1 → 0.8.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +40 -52
- package/README.zh.md +41 -46
- package/dist/agent-providers/index.d.ts +7 -0
- package/dist/agent-providers/index.js +5 -0
- package/dist/agent-providers/prompt-template.d.ts +62 -0
- package/dist/agent-providers/prompt-template.js +105 -0
- package/dist/agent-providers/prompts/rubric-judge-v1.prompt.md +51 -0
- package/dist/agent-providers/prompts/within-trace-synthesizer-v1.prompt.md +60 -0
- package/dist/agent-providers/providers/claude-code-subprocess.d.ts +74 -0
- package/dist/agent-providers/providers/claude-code-subprocess.js +259 -0
- package/dist/agent-providers/providers/stub.d.ts +47 -0
- package/dist/agent-providers/providers/stub.js +77 -0
- package/dist/agent-providers/registry.d.ts +45 -0
- package/dist/agent-providers/registry.js +77 -0
- package/dist/agent-providers/types.d.ts +91 -0
- package/dist/agent-providers/types.js +25 -0
- package/dist/api/agent-chat.js +8 -6
- package/dist/api/context-loader.d.ts +1 -0
- package/dist/api/resources.d.ts +94 -0
- package/dist/api/resources.js +166 -0
- package/dist/api/semantic-search.d.ts +5 -0
- package/dist/api/semantic-search.js +5 -0
- package/dist/api/skills.d.ts +75 -2
- package/dist/api/skills.js +108 -12
- package/dist/api/trace.d.ts +5 -0
- package/dist/api/trace.js +4 -0
- package/dist/cli.js +109 -15
- package/dist/client.d.ts +3 -3
- package/dist/client.js +5 -5
- package/dist/commands/agent/mode.d.ts +6 -0
- package/dist/commands/agent/mode.js +75 -0
- package/dist/commands/agent-members.js +27 -11
- package/dist/commands/agent.js +469 -286
- package/dist/commands/auth.js +184 -71
- package/dist/commands/bkn-metric.js +37 -16
- package/dist/commands/bkn-ops.js +164 -86
- package/dist/commands/bkn-query.js +99 -31
- package/dist/commands/bkn-schema.d.ts +3 -3
- package/dist/commands/bkn-schema.js +127 -86
- package/dist/commands/bkn.js +153 -114
- package/dist/commands/call.js +23 -13
- package/dist/commands/config.js +22 -12
- package/dist/commands/context-loader.js +625 -49
- package/dist/commands/dataflow.js +14 -6
- package/dist/commands/ds.js +52 -30
- package/dist/commands/explore.js +18 -15
- package/dist/commands/model.js +53 -42
- package/dist/commands/resource.d.ts +1 -0
- package/dist/commands/{dataview.js → resource.js} +62 -84
- package/dist/commands/skill.d.ts +21 -1
- package/dist/commands/skill.js +567 -43
- package/dist/commands/token.js +11 -0
- package/dist/commands/tool.js +46 -29
- package/dist/commands/toolbox.js +31 -15
- package/dist/commands/trace.d.ts +26 -1
- package/dist/commands/trace.js +515 -15
- package/dist/commands/vega.js +466 -250
- package/dist/help/format.d.ts +65 -0
- package/dist/help/format.js +141 -0
- package/dist/index.d.ts +5 -5
- package/dist/index.js +3 -3
- package/dist/resources/bkn.d.ts +5 -0
- package/dist/resources/bkn.js +5 -0
- package/dist/resources/{dataviews.d.ts → resources.d.ts} +10 -11
- package/dist/resources/{dataviews.js → resources.js} +12 -13
- package/dist/resources/skills.d.ts +17 -1
- package/dist/resources/skills.js +32 -1
- package/dist/trace-ai/diagnose/agent-binding.d.ts +67 -0
- package/dist/trace-ai/diagnose/agent-binding.js +257 -0
- package/dist/trace-ai/diagnose/builtin-rules/tool-retry-intent-mismatch.yaml +68 -0
- package/dist/trace-ai/diagnose/index.d.ts +32 -0
- package/dist/trace-ai/diagnose/index.js +246 -0
- package/dist/trace-ai/diagnose/output-schema-converter.d.ts +24 -0
- package/dist/trace-ai/diagnose/output-schema-converter.js +81 -0
- package/dist/trace-ai/diagnose/query-extractor.d.ts +14 -0
- package/dist/trace-ai/diagnose/query-extractor.js +45 -0
- package/dist/trace-ai/diagnose/report-assembler.d.ts +31 -0
- package/dist/{trace-core → trace-ai}/diagnose/report-assembler.js +19 -9
- package/dist/trace-ai/diagnose/report-markdown.d.ts +18 -0
- package/dist/trace-ai/diagnose/report-markdown.js +192 -0
- package/dist/{trace-core → trace-ai}/diagnose/rule-loader.js +42 -8
- package/dist/{trace-core → trace-ai}/diagnose/schemas.d.ts +77 -2
- package/dist/trace-ai/diagnose/schemas.js +154 -0
- package/dist/trace-ai/diagnose/signal-probe.d.ts +17 -0
- package/dist/trace-ai/diagnose/signal-probe.js +39 -0
- package/dist/trace-ai/diagnose/synthesizer-agent.d.ts +40 -0
- package/dist/trace-ai/diagnose/synthesizer-agent.js +158 -0
- package/dist/{trace-core → trace-ai}/diagnose/trace-shaper.js +1 -0
- package/dist/{trace-core → trace-ai}/diagnose/types.d.ts +55 -6
- package/dist/trace-ai/eval-set/assertion-evaluator.d.ts +29 -0
- package/dist/trace-ai/eval-set/assertion-evaluator.js +100 -0
- package/dist/trace-ai/eval-set/builder.d.ts +36 -0
- package/dist/trace-ai/eval-set/builder.js +126 -0
- package/dist/trace-ai/eval-set/index.d.ts +15 -0
- package/dist/trace-ai/eval-set/index.js +10 -0
- package/dist/trace-ai/eval-set/output-writer.d.ts +27 -0
- package/dist/trace-ai/eval-set/output-writer.js +126 -0
- package/dist/trace-ai/eval-set/query-picker.d.ts +37 -0
- package/dist/trace-ai/eval-set/query-picker.js +147 -0
- package/dist/trace-ai/eval-set/redactor.d.ts +42 -0
- package/dist/trace-ai/eval-set/redactor.js +133 -0
- package/dist/trace-ai/eval-set/rubric-templates/answer-match-reference.prompt.md +19 -0
- package/dist/trace-ai/eval-set/schemas.d.ts +136 -0
- package/dist/trace-ai/eval-set/schemas.js +130 -0
- package/dist/trace-ai/eval-set/semantic-match-provider.d.ts +33 -0
- package/dist/trace-ai/eval-set/semantic-match-provider.js +51 -0
- package/dist/trace-ai/eval-set/test-runner.d.ts +34 -0
- package/dist/trace-ai/eval-set/test-runner.js +153 -0
- package/dist/trace-ai/eval-set/types.d.ts +46 -0
- package/dist/trace-ai/eval-set/types.js +8 -0
- package/dist/trace-ai/exp/bundle-writer.d.ts +10 -0
- package/dist/trace-ai/exp/bundle-writer.js +54 -0
- package/dist/trace-ai/exp/claude-binary.d.ts +5 -0
- package/dist/trace-ai/exp/claude-binary.js +30 -0
- package/dist/trace-ai/exp/coordinator.d.ts +45 -0
- package/dist/trace-ai/exp/coordinator.js +203 -0
- package/dist/trace-ai/exp/eval-runner.d.ts +14 -0
- package/dist/trace-ai/exp/eval-runner.js +47 -0
- package/dist/trace-ai/exp/exp-store/abort-signal.d.ts +3 -0
- package/dist/trace-ai/exp/exp-store/abort-signal.js +27 -0
- package/dist/trace-ai/exp/exp-store/candidate-lineage-yaml.d.ts +4 -0
- package/dist/trace-ai/exp/exp-store/candidate-lineage-yaml.js +37 -0
- package/dist/trace-ai/exp/exp-store/events-jsonl.d.ts +17 -0
- package/dist/trace-ai/exp/exp-store/events-jsonl.js +60 -0
- package/dist/trace-ai/exp/exp-store/exp-registry.d.ts +6 -0
- package/dist/trace-ai/exp/exp-store/exp-registry.js +41 -0
- package/dist/trace-ai/exp/exp-store/index.d.ts +46 -0
- package/dist/trace-ai/exp/exp-store/index.js +59 -0
- package/dist/trace-ai/exp/exp-store/lock.d.ts +3 -0
- package/dist/trace-ai/exp/exp-store/lock.js +73 -0
- package/dist/trace-ai/exp/exp-store/mission-md.d.ts +3 -0
- package/dist/trace-ai/exp/exp-store/mission-md.js +37 -0
- package/dist/trace-ai/exp/exp-store/readme-template.d.ts +5 -0
- package/dist/trace-ai/exp/exp-store/readme-template.js +25 -0
- package/dist/trace-ai/exp/exp-store/round-yaml.d.ts +3 -0
- package/dist/trace-ai/exp/exp-store/round-yaml.js +33 -0
- package/dist/trace-ai/exp/index.d.ts +8 -0
- package/dist/trace-ai/exp/index.js +238 -0
- package/dist/trace-ai/exp/info.d.ts +35 -0
- package/dist/trace-ai/exp/info.js +120 -0
- package/dist/trace-ai/exp/patch/agent-config.d.ts +1 -0
- package/dist/trace-ai/exp/patch/agent-config.js +26 -0
- package/dist/trace-ai/exp/patch/index.d.ts +2 -0
- package/dist/trace-ai/exp/patch/index.js +13 -0
- package/dist/trace-ai/exp/patch/skill.d.ts +1 -0
- package/dist/trace-ai/exp/patch/skill.js +24 -0
- package/dist/trace-ai/exp/providers/synthesizer-client.d.ts +14 -0
- package/dist/trace-ai/exp/providers/synthesizer-client.js +39 -0
- package/dist/trace-ai/exp/providers/triage-client.d.ts +19 -0
- package/dist/trace-ai/exp/providers/triage-client.js +51 -0
- package/dist/trace-ai/exp/schemas.d.ts +147 -0
- package/dist/trace-ai/exp/schemas.js +50 -0
- package/dist/trace-ai/exp/scoring.d.ts +2 -0
- package/dist/trace-ai/exp/scoring.js +46 -0
- package/dist/trace-ai/scan/aggregator.d.ts +20 -0
- package/dist/trace-ai/scan/aggregator.js +26 -0
- package/dist/trace-ai/scan/artifacts/paths.d.ts +12 -0
- package/dist/trace-ai/scan/artifacts/paths.js +18 -0
- package/dist/trace-ai/scan/artifacts/writer.d.ts +67 -0
- package/dist/trace-ai/scan/artifacts/writer.js +96 -0
- package/dist/trace-ai/scan/batched-rubric.d.ts +55 -0
- package/dist/trace-ai/scan/batched-rubric.js +159 -0
- package/dist/trace-ai/scan/cross-trace-synthesizer.d.ts +24 -0
- package/dist/trace-ai/scan/cross-trace-synthesizer.js +93 -0
- package/dist/trace-ai/scan/index.d.ts +31 -0
- package/dist/trace-ai/scan/index.js +390 -0
- package/dist/trace-ai/scan/prompts/builtin/cross-trace-synthesizer-v1.prompt.md +44 -0
- package/dist/trace-ai/scan/prompts/builtin/rubric-judge-batch-v1.prompt.md +44 -0
- package/dist/trace-ai/scan/runner.d.ts +25 -0
- package/dist/trace-ai/scan/runner.js +42 -0
- package/dist/trace-ai/scan/sampler.d.ts +18 -0
- package/dist/trace-ai/scan/sampler.js +81 -0
- package/dist/trace-ai/scan/scan-summary-markdown.d.ts +2 -0
- package/dist/trace-ai/scan/scan-summary-markdown.js +71 -0
- package/dist/trace-ai/scan/scan-summary-schema.d.ts +73 -0
- package/dist/trace-ai/scan/scan-summary-schema.js +61 -0
- package/dist/trace-ai/scan/single-agent-validator.d.ts +23 -0
- package/dist/trace-ai/scan/single-agent-validator.js +42 -0
- package/dist/trace-ai/scan/traces-list-parser.d.ts +15 -0
- package/dist/trace-ai/scan/traces-list-parser.js +46 -0
- package/package.json +2 -2
- package/dist/api/dataviews.d.ts +0 -117
- package/dist/api/dataviews.js +0 -265
- package/dist/commands/dataview.d.ts +0 -8
- package/dist/trace-core/diagnose/index.d.ts +0 -9
- package/dist/trace-core/diagnose/index.js +0 -104
- package/dist/trace-core/diagnose/report-assembler.d.ts +0 -12
- package/dist/trace-core/diagnose/schemas.js +0 -94
- package/dist/trace-core/diagnose/signal-probe.d.ts +0 -5
- package/dist/trace-core/diagnose/signal-probe.js +0 -21
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/excessive-tool-calls-per-turn.d.ts +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/excessive-tool-calls-per-turn.js +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/excessive-tool-calls-per-turn.yaml +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/llm-response-truncated-no-continue.d.ts +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/llm-response-truncated-no-continue.js +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/llm-response-truncated-no-continue.yaml +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/register.d.ts +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/register.js +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/retrieval-empty-no-fallback.d.ts +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/retrieval-empty-no-fallback.js +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/retrieval-empty-no-fallback.yaml +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/tool-error-swallowed.d.ts +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/tool-error-swallowed.js +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/tool-error-swallowed.yaml +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/tool-loop-no-state-change.d.ts +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/tool-loop-no-state-change.js +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/tool-loop-no-state-change.yaml +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/predicate-registry.d.ts +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/predicate-registry.js +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/rule-loader.d.ts +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/synthesizer-template.d.ts +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/synthesizer-template.js +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/trace-shaper.d.ts +0 -0
- /package/dist/{trace-core → trace-ai}/diagnose/types.js +0 -0
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
import { z } from "zod";
|
|
2
|
+
/**
 * Classification block carried by every diagnosis rule: one `signals_axis`
 * value and one `ms_class` value, each restricted to a closed set of strings.
 */
const TaxonomySchema = (() => {
    const signalAxes = ["interaction", "execution", "environment"];
    const msClasses = [
        "retry_loop",
        "tool_misuse",
        "context_loss",
        "goal_drift",
        "cascading_error",
        "silent_quality_degradation",
    ];
    return z.object({
        signals_axis: z.enum(signalAxes),
        ms_class: z.enum(msClasses),
    });
})();
|
|
13
|
+
/**
 * Fix suggestion declared by a rule: the `target` of the fix and a
 * `change_template` string. Both must be non-empty.
 */
const SuggestedFixSchema = (() => {
    const nonEmptyText = () => z.string().min(1);
    return z.object({
        target: nonEmptyText(),
        change_template: nonEmptyText(),
    });
})();
|
|
17
|
+
/** Verification hints for a rule; an omitted `assertion_templates` list parses as `[]`. */
const VerifyWithSchema = (() => {
    const templateList = z.array(z.string());
    return z.object({
        assertion_templates: templateList.default([]),
    });
})();
|
|
20
|
+
/**
 * One rubric input: a `kind` label plus a `source` descriptor string.
 * `diagnose/agent-binding.ts` resolves the descriptor against the in-memory
 * TraceTree. The supported source prefixes are:
 *
 *   - `extract_from_root_attr:<dot.path>`  → root span attribute by name
 *   - `filter_by_kind:[kind1,kind2,...]`   → ordered span subset by kind
 *   - `literal:<json>`                     → constant blob (debug / fixtures)
 *
 * Rule authors declare **which slice of the trace** the agent needs; the
 * binding performs the extraction, which keeps rule YAML declarative.
 * This schema only checks that both strings are non-empty; the prefix
 * itself is not validated here.
 */
const RubricInputSchema = (() => {
    const requiredString = () => z.string().min(1);
    return z.object({
        kind: requiredString(),
        source: requiredString(),
    });
})();
|
|
35
|
+
/**
 * Loose, JSON-Schema-like shape accepted for a rubric's `output_schema`.
 * It is turned into a zod schema at load time (see
 * `output-schema-converter.ts`). Staying permissive here lets authors paste
 * literal JSON Schema while only the needed subset is interpreted: an
 * object with `required[]` and `properties{type,enum,items}`.
 */
const RubricOutputSchemaSchema = (() => {
    // Each property descriptor is an open string-keyed bag; its contents are not checked here.
    const propertyDescriptor = z.record(z.string(), z.unknown());
    return z.object({
        type: z.literal("object"),
        required: z.array(z.string()).default([]),
        properties: z.record(z.string(), propertyDescriptor),
    });
})();
|
|
47
|
+
/**
 * Names the agent provider and the prompt template a rubric uses.
 * `prompt_template_ref` must have the form `builtin:<name>`, where the name
 * is made of letters, digits, `_` or `-`.
 */
const AgentBindingSchema = (() => {
    const builtinTemplateRef = /^builtin:[a-zA-Z0-9_-]+$/;
    return z.object({
        provider: z.string().min(1),
        prompt_template_ref: z.string().regex(builtinTemplateRef),
    });
})();
|
|
51
|
+
/**
 * Rubric (agent-judged) half of a rule: the question for the judge, the
 * trace slices supplied as inputs, the expected output shape, and the agent
 * binding that runs it.
 */
const RubricSchema = (() => {
    const inputList = z.array(RubricInputSchema);
    const ruleIdList = z.array(z.string());
    return z.object({
        judge_question: z.string().min(1),
        inputs: inputList.default([]),
        output_schema: RubricOutputSchemaSchema,
        agent_binding: AgentBindingSchema,
        /**
         * Optional symbolic rule_ids that gate this rubric in batch mode.
         * Empty/missing → the rubric runs on all traces (PR-B fallback).
         * Single-trace mode ignores this field and always runs the rubric.
         */
        gates_on: ruleIdList.optional(),
    });
})();
|
|
63
|
+
/**
 * Convergence contract between Stage-1 (symbolic) and Stage-2 (rubric):
 * every rubric verdict MUST emit `first_violating_step_id`, so that
 * cross-finding links can correlate rubric findings with the spans cited
 * by symbolic rules.
 *
 * The check runs when the YAML is loaded rather than at runtime, so authors
 * see the violation in `trace diagnose rules validate <path>`.
 */
const FIRST_VIOLATING_STEP_ID = "first_violating_step_id";
/**
 * Schema for one `diagnosis-rule/v1` document. Two cross-field checks follow
 * the object shape:
 *   1. exactly one of `predicate` / `rubric` is present;
 *   2. a rubric's `output_schema.required` lists `first_violating_step_id`.
 */
export const RuleSchema = z
    .object({
        schema_version: z.literal("diagnosis-rule/v1"),
        id: z.string().regex(/^[a-z][a-z0-9_]*$/),
        severity: z.enum(["low", "medium", "high"]),
        symptom: z.string().min(1),
        taxonomy: TaxonomySchema,
        suggested_fix: SuggestedFixSchema,
        verify_with: VerifyWithSchema,
        predicate: z.string().regex(/^builtin:[a-z][a-z0-9_]*$/).optional(),
        rubric: RubricSchema.optional(),
        params: z.record(z.string(), z.unknown()).default({}),
    })
    .refine(
        // XOR: count how many of the two judgment sources are set.
        (rule) => [rule.predicate, rule.rubric].filter(Boolean).length === 1,
        { message: "exactly one of `predicate` or `rubric` must be present" },
    )
    .refine(
        // Predicate-only rules pass trivially; rubric rules must honour the contract.
        (rule) => (rule.rubric
            ? rule.rubric.output_schema.required.includes(FIRST_VIOLATING_STEP_ID)
            : true),
        {
            message: `rubric.output_schema.required must include '${FIRST_VIOLATING_STEP_ID}' (Stage-1↔Stage-2 convergence contract)`,
            path: ["rubric", "output_schema", "required"],
        },
    );
|
|
90
|
+
/**
 * Serialized (snake_case) shape of a single finding inside a diagnose
 * report: which rule produced it, how it was judged, the supporting
 * evidence, the proposed fix and a suggested eval case.
 */
const FindingSchema = (() => {
    // `severity` and `confidence` share the same three-level scale.
    const level = () => z.enum(["low", "medium", "high"]);
    const evidence = z.object({
        spans: z.array(z.string()),
        excerpt: z.string(),
    });
    const suggestedFix = z.object({
        target: z.string(),
        change: z.string(),
    });
    const suggestedEvalCase = z.object({
        query_id: z.string().nullable(),
        query: z.string().nullable(),
        assertions: z.array(z.string()),
    });
    return z.object({
        rule_id: z.string(),
        judgment_kind: z.enum(["symbolic", "rubric"]),
        severity: level(),
        symptom: z.string(),
        likely_cause: z.string(),
        evidence,
        suggested_fix: suggestedFix,
        // Symbolic findings always emit `low` (there is no semantic basis for
        // more). A rubric agent reports its own confidence and rule-loader
        // passes that value through, so the full scale is accepted.
        confidence: level(),
        verify_with: z.object({ suggested_eval_case: suggestedEvalCase }),
    });
})();
|
|
116
|
+
/**
 * Narrative summary of a report: a headline of at most 160 characters, an
 * optional primary root cause, an ordered fix priority list and links
 * between related findings. Findings are referenced by non-negative integer
 * id (presumably the position of the finding in the report's list; confirm
 * against the report assembler).
 */
const SummarySchema = (() => {
    const findingId = () => z.number().int().nonnegative();
    const rootCause = z.object({
        finding_ids: z.array(findingId()).min(1),
        description: z.string(),
        target_for_fix: z.string(),
    });
    const priorityEntry = z.object({
        finding_id: findingId(),
        reason: z.string(),
    });
    // A link only makes sense between at least two findings.
    const findingLink = z.object({
        finding_ids: z.array(findingId()).min(2),
        relation: z.string(),
    });
    return z.object({
        headline: z.string().max(160),
        primary_root_cause: rootCause.nullable(),
        fix_priority: z.array(priorityEntry),
        cross_finding_links: z.array(findingLink),
    });
})();
|
|
134
|
+
/**
 * Full `trace-diagnose-report/v1` document: which trace was diagnosed, how
 * the run was configured, the narrative summary and the list of findings.
 */
export const ReportSchema = (() => {
    const traceSection = z.object({
        trace_id: z.string(),
        agent_id: z.string().nullable(),
        tenant: z.string().nullable(),
    });
    const skippedRule = z.object({ rule_id: z.string(), reason: z.string() });
    const runSection = z.object({
        diagnosed_at: z.string(),
        cli_version: z.string(),
        mode: z.enum(["symbolic-only", "rubric-only", "hybrid"]),
        rules_applied: z.array(z.string()),
        rules_skipped: z.array(skippedRule),
        synthesizer_mode: z.enum(["template", "agent"]),
    });
    return z.object({
        schema_version: z.literal("trace-diagnose-report/v1"),
        trace: traceSection,
        run: runSection,
        summary: SummarySchema,
        findings: z.array(FindingSchema),
    });
})();
|
|
152
|
+
/**
 * The Summary section in isolation. It is the same schema object the report
 * uses, re-exported under its own name so the agent synthesizer can validate
 * its LLM output against exactly the shape the report expects.
 */
export const SummaryOutputSchema = SummarySchema;
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Stage-1 (symbolic) runner. Rubric rules are handled separately in
|
|
3
|
+
* `agent-binding.ts` and merged into the findings list by `index.ts`.
|
|
4
|
+
*
|
|
5
|
+
* Rationale for keeping the split here: symbolic predicates are cheap,
|
|
6
|
+
* deterministic, sync; rubric judgments are slow, non-deterministic,
|
|
7
|
+
* async. Running them in one loop would entangle backpressure,
|
|
8
|
+
* timeout, and retry concerns that only apply to one of the two paths.
|
|
9
|
+
*/
|
|
10
|
+
import type { Hit, Rule, TraceTree } from "./types.js";
|
|
11
|
+
/**
 * Error raised by `runRules` when a rule's predicate throws. The message
 * names the failing rule and embeds the message of the underlying error.
 */
export declare class RuleProbeError extends Error {
    /**
     * @param ruleId id of the rule whose predicate failed
     * @param cause error thrown by the predicate
     */
    constructor(ruleId: string, cause: Error);
}
|
|
14
|
+
/**
 * Runs the symbolic predicate of every rule that has a `predicateRef`
 * against `tree`; rules without one (rubric rules) are skipped.
 *
 * @returns a map from rule id to the hits its predicate returned
 * @throws RuleProbeError when a predicate throws
 */
export declare function runRules(rules: Rule[], tree: TraceTree): Promise<Map<string, Hit[]>>;
|
|
15
|
+
/**
 * Helpers that split a rule list by which stage owns them.
 * `symbolicRules` returns the Stage-1 subset: rules that carry a `predicateRef`.
 */
export declare function symbolicRules(rules: Rule[]): Rule[];
|
|
17
|
+
/** Returns the Stage-2 subset of `rules`: those that carry a `rubric`. */
export declare function rubricRules(rules: Rule[]): Rule[];
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Stage-1 (symbolic) runner. Rubric rules are handled separately in
|
|
3
|
+
* `agent-binding.ts` and merged into the findings list by `index.ts`.
|
|
4
|
+
*
|
|
5
|
+
* Rationale for keeping the split here: symbolic predicates are cheap,
|
|
6
|
+
* deterministic, sync; rubric judgments are slow, non-deterministic,
|
|
7
|
+
* async. Running them in one loop would entangle backpressure,
|
|
8
|
+
* timeout, and retry concerns that only apply to one of the two paths.
|
|
9
|
+
*/
|
|
10
|
+
import { resolvePredicate } from "./predicate-registry.js";
|
|
11
|
+
/**
 * Error raised by `runRules` when a symbolic predicate throws.
 *
 * The message names the failing rule and embeds the original failure text.
 * The thrown value itself is kept on `cause`, so its stack trace is not
 * lost when the error is wrapped.
 */
export class RuleProbeError extends Error {
    /**
     * @param {string} ruleId - id of the rule whose predicate failed.
     * @param {unknown} cause - value thrown by the predicate (normally an Error).
     */
    constructor(ruleId, cause) {
        // A predicate may throw a non-Error (string, null, ...). Read `.message`
        // defensively so building the wrapper can never throw itself.
        const detail = cause?.message ?? String(cause);
        super(`predicate failed for rule '${ruleId}': ${detail}`, { cause });
        this.name = "RuleProbeError";
    }
}
|
|
17
|
+
/**
 * Stage-1 driver: evaluates the symbolic predicate of each rule against the
 * trace tree and collects the hits per rule.
 *
 * @param rules rules to evaluate; entries with a falsy `predicateRef` are
 *   rubric rules and are left to agent-binding.
 * @param tree trace tree passed to every predicate.
 * @returns a Map from rule id to the value its predicate returned.
 * @throws RuleProbeError when a predicate throws.
 */
export async function runRules(rules, tree) {
    const hitsByRuleId = new Map();
    for (const current of rules) {
        if (current.predicateRef) {
            // Resolution happens outside the guarded call, so a failure to
            // resolve is not reported as a probe failure.
            const predicate = resolvePredicate(current.predicateRef);
            let found;
            try {
                found = predicate(tree, current.params);
            }
            catch (failure) {
                throw new RuleProbeError(current.id, failure);
            }
            hitsByRuleId.set(current.id, found);
        }
    }
    return hitsByRuleId;
}
|
|
33
|
+
/**
 * Helper that selects the rules owned by Stage-1 (symbolic).
 *
 * A rule counts as symbolic when it carries a predicate reference. The loose
 * `!= null` check treats a missing (`undefined`) `predicateRef` the same as
 * an explicit `null`, so a rule without the key is not reported as symbolic.
 *
 * @param rules full rule list.
 * @returns a new array with the rules that have a `predicateRef`.
 */
export function symbolicRules(rules) {
    return rules.filter((r) => r.predicateRef != null);
}
|
|
37
|
+
/**
 * Helper that selects the rules owned by Stage-2 (rubric).
 *
 * The loose `!= null` check treats a missing (`undefined`) `rubric` the same
 * as an explicit `null`, so a Rule object built without the `rubric` key is
 * not mistaken for a rubric rule.
 *
 * @param rules full rule list.
 * @returns a new array with the rules that have a `rubric`.
 */
export function rubricRules(rules) {
    return rules.filter((r) => r.rubric != null);
}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Stage-3 — agent-driven within-trace synthesizer.
|
|
3
|
+
*
|
|
4
|
+
* Takes the N findings produced by Stages 1+2 and asks the LLM to compose
|
|
5
|
+
* a `Summary` (headline + root cause + ordered fix priority + cross-finding
|
|
6
|
+
* links). Falls back to the deterministic `templateSynthesize` if:
|
|
7
|
+
* - findings.length === 0 (no narrative needed)
|
|
8
|
+
* - no provider registered / provider unavailable
|
|
9
|
+
* - the agent invocation fails with an AgentProviderError (we still want a usable
|
|
10
|
+
* report even when the LLM judge times out)
|
|
11
|
+
*
|
|
12
|
+
* The agent path remains a *narrative* layer — symbolic and rubric findings
|
|
13
|
+
* are already in hand; the synthesizer doesn't fabricate new findings, only
|
|
14
|
+
* organizes the ones it was given. This keeps the contract small and the
|
|
15
|
+
* failure modes containable.
|
|
16
|
+
*/
|
|
17
|
+
import type { Finding, Summary } from "./types.js";
|
|
18
|
+
import type { AgentProvider } from "../../agent-providers/types.js";
|
|
19
|
+
import { PromptTemplateRegistry, type AgentOutputLang } from "../../agent-providers/prompt-template.js";
|
|
20
|
+
import type { ArtifactWriter } from "../scan/artifacts/writer.js";
|
|
21
|
+
/** Inputs accepted by `agentSynthesize`. */
export interface AgentSynthesizeOpts {
    /** Findings to summarise; an empty list short-circuits to the template summary. */
    findings: Finding[];
    /** Trace identifier; rendered into the prompt and used in the provider correlation id. */
    traceId: string;
    /** Agent identifier for the prompt; `null` is rendered as `<unknown>`. */
    agentId: string | null;
    /** Provider to invoke; `null` selects the template fallback. */
    provider: AgentProvider | null;
    /** Registry the prompt template is looked up in. */
    promptRegistry: PromptTemplateRegistry;
    /** Prompt template reference; defaults to `builtin:within-trace-synthesizer-v1`. */
    promptRef?: string;
    /** Timeout forwarded to the provider invocation. */
    timeoutMs?: number;
    /** Output locale for synthesizer prose. Default 'en'. */
    lang?: AgentOutputLang;
    /** When provided, writes Stage-3 prompt/response artifacts. */
    artifacts?: ArtifactWriter;
}
|
|
34
|
+
/** Outcome of `agentSynthesize`. */
export interface AgentSynthesizeResult {
    /** The composed summary, whichever path produced it. */
    summary: Summary;
    /** `agent` when the provider's output was used, `template` for the deterministic fallback. */
    mode: "agent" | "template";
    /** Reason set when mode === 'template' under a non-default branch. */
    fallbackReason?: string;
}
|
|
40
|
+
/**
 * Composes the report summary for one trace, using the agent provider when
 * possible and the deterministic template otherwise. The result's `mode`
 * tells which path ran; `fallbackReason` explains a non-default fallback.
 */
export declare function agentSynthesize(opts: AgentSynthesizeOpts): Promise<AgentSynthesizeResult>;
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Stage-3 — agent-driven within-trace synthesizer.
|
|
3
|
+
*
|
|
4
|
+
* Takes the N findings produced by Stages 1+2 and asks the LLM to compose
|
|
5
|
+
* a `Summary` (headline + root cause + ordered fix priority + cross-finding
|
|
6
|
+
* links). Falls back to the deterministic `templateSynthesize` if:
|
|
7
|
+
* - findings.length === 0 (no narrative needed)
|
|
8
|
+
* - no provider registered / provider unavailable
|
|
9
|
+
* - the agent invocation fails with an AgentProviderError (we still want a usable
|
|
10
|
+
* report even when the LLM judge times out)
|
|
11
|
+
*
|
|
12
|
+
* The agent path remains a *narrative* layer — symbolic and rubric findings
|
|
13
|
+
* are already in hand; the synthesizer doesn't fabricate new findings, only
|
|
14
|
+
* organizes the ones it was given. This keeps the contract small and the
|
|
15
|
+
* failure modes containable.
|
|
16
|
+
*/
|
|
17
|
+
import { AgentProviderError } from "../../agent-providers/types.js";
|
|
18
|
+
import { render as renderPrompt, languageInstructionFor, } from "../../agent-providers/prompt-template.js";
|
|
19
|
+
import { SummaryOutputSchema } from "./schemas.js";
|
|
20
|
+
import { templateSynthesize } from "./synthesizer-template.js";
|
|
21
|
+
/**
 * Converts the agent's snake_case Summary (already validated by zod) into
 * the camelCase Summary used internally. A `null` root cause stays `null`.
 */
function toInternalSummary(out) {
    const rootCause = out.primary_root_cause;
    let primaryRootCause = null;
    if (rootCause !== null) {
        primaryRootCause = {
            findingIds: rootCause.finding_ids,
            description: rootCause.description,
            targetForFix: rootCause.target_for_fix,
        };
    }
    const fixPriority = out.fix_priority.map(({ finding_id, reason }) => ({
        findingId: finding_id,
        reason,
    }));
    const crossFindingLinks = out.cross_finding_links.map(({ finding_ids, relation }) => ({
        findingIds: finding_ids,
        relation,
    }));
    return {
        headline: out.headline,
        primaryRootCause,
        fixPriority,
        crossFindingLinks,
    };
}
|
|
39
|
+
/**
 * Flattens a Finding into the snake_case record rendered into the prompt.
 * The keys mirror the YAML representation users already see, so the model
 * has nothing to translate. `idx` becomes the record's `index`.
 */
function findingForPrompt(f, idx) {
    const { ruleId, judgmentKind, severity, symptom, likelyCause, evidence, suggestedFix, confidence, } = f;
    const { spans, excerpt } = evidence;
    const { target, change } = suggestedFix;
    // Key order is kept stable because the record is serialized into the prompt.
    return {
        index: idx,
        rule_id: ruleId,
        judgment_kind: judgmentKind,
        severity,
        symptom,
        likely_cause: likelyCause,
        evidence_spans: spans,
        excerpt,
        suggested_fix_target: target,
        suggested_fix_change: change,
        confidence,
    };
}
|
|
56
|
+
/**
 * JSON-Schema-ish description of the Summary contract, rendered into the
 * prompt as `output_schema`. It is authored inline rather than derived from
 * the zod schema so the prompt stays human-readable.
 *
 * Because it is hand-written, it must be kept in step with `SummarySchema`:
 * every constraint the validator enforces should be visible to the model.
 * In particular, `primary_root_cause.finding_ids` must be non-empty
 * (`minItems: 1`) and each cross-finding link needs at least two ids.
 */
const SUMMARY_OUTPUT_SCHEMA_DESCRIPTION = {
    type: "object",
    required: ["headline", "primary_root_cause", "fix_priority", "cross_finding_links"],
    properties: {
        headline: { type: "string", maxLength: 160 },
        primary_root_cause: {
            // Not a real JSON Schema type: tells the model that `null` is allowed.
            type: "object_or_null",
            required: ["finding_ids", "description", "target_for_fix"],
            properties: {
                // The validator rejects an empty list here, so say so up front.
                finding_ids: { type: "array", items: { type: "integer" }, minItems: 1 },
                description: { type: "string" },
                target_for_fix: { type: "string" },
            },
        },
        fix_priority: {
            type: "array",
            items: {
                type: "object",
                required: ["finding_id", "reason"],
                properties: { finding_id: { type: "integer" }, reason: { type: "string" } },
            },
        },
        cross_finding_links: {
            type: "array",
            items: {
                type: "object",
                required: ["finding_ids", "relation"],
                properties: {
                    finding_ids: { type: "array", items: { type: "integer" }, minItems: 2 },
                    relation: { type: "string" },
                },
            },
        },
    },
};
|
|
96
|
+
/**
 * Stage-3 entry point: builds the report Summary for one trace.
 *
 * The agent path renders the prompt template, invokes the provider with
 * `SummaryOutputSchema` as the expected output, and converts the response to
 * the internal Summary shape. The deterministic `templateSynthesize` result
 * is returned instead when:
 *   - there are no findings (no `fallbackReason` is set);
 *   - no provider was supplied (`no-provider-registered`);
 *   - the prompt template is not registered (`prompt-template-missing:<ref>`);
 *   - the provider reports itself unavailable (`provider-not-available:<name>`);
 *   - the provider path throws an AgentProviderError (`agent-error:<kind>`).
 * Any other error is rethrown unchanged.
 *
 * When `opts.artifacts` is given, the prompt and the provider output are
 * written through it before and after the invocation.
 *
 * @param opts see the field list above; `promptRef` defaults to
 *   `builtin:within-trace-synthesizer-v1` and `lang` to `en`.
 * @returns `{ summary, mode, fallbackReason? }`
 */
export async function agentSynthesize(opts) {
    const { findings, provider, promptRegistry, artifacts } = opts;
    // Shared shape of every "fell back to the template, and here is why" result.
    const viaTemplate = (fallbackReason) => ({
        summary: templateSynthesize(findings),
        mode: "template",
        fallbackReason,
    });
    // Nothing to narrate: both paths would yield the same summary, so report
    // `template` to avoid claiming the agent ran when it did not.
    if (findings.length === 0) {
        return { summary: templateSynthesize([]), mode: "template" };
    }
    if (!provider) {
        return viaTemplate("no-provider-registered");
    }
    const templateRef = opts.promptRef ?? "builtin:within-trace-synthesizer-v1";
    if (!promptRegistry.has(templateRef)) {
        return viaTemplate(`prompt-template-missing:${templateRef}`);
    }
    const template = promptRegistry.get(templateRef);
    const numberedFindings = findings.map((finding, position) => findingForPrompt(finding, position));
    const languageInstruction = languageInstructionFor(opts.lang ?? "en");
    const prompt = renderPrompt(template, {
        trace_id: opts.traceId,
        agent_id: opts.agentId ?? "<unknown>",
        findings: numberedFindings,
        output_schema: SUMMARY_OUTPUT_SCHEMA_DESCRIPTION,
        language_instruction: languageInstruction,
    });
    try {
        const available = await provider.isAvailable();
        if (!available) {
            return viaTemplate(`provider-not-available:${provider.name}`);
        }
        if (artifacts) {
            await artifacts.writeStageThreeSynthPrompt(prompt);
        }
        const { output } = await provider.invoke({
            prompt,
            outputSchema: SummaryOutputSchema,
            timeoutMs: opts.timeoutMs,
            correlationId: `synthesize:${opts.traceId}`,
        });
        if (artifacts) {
            await artifacts.writeStageThreeSynthResponse(output);
        }
        return { summary: toInternalSummary(output), mode: "agent" };
    }
    catch (failure) {
        // Only provider-level failures degrade to the template; anything else
        // is unexpected and must surface to the caller.
        if (!(failure instanceof AgentProviderError)) {
            throw failure;
        }
        return viaTemplate(`agent-error:${failure.kind}`);
    }
}
|
|
@@ -11,6 +11,11 @@ export interface Span {
|
|
|
11
11
|
durationMs: number;
|
|
12
12
|
status: 'ok' | 'error' | 'unset';
|
|
13
13
|
attributes: SpanAttributes;
|
|
14
|
+
events?: Array<{
|
|
15
|
+
name?: string;
|
|
16
|
+
time?: string;
|
|
17
|
+
attributes?: Record<string, unknown>;
|
|
18
|
+
}>;
|
|
14
19
|
}
|
|
15
20
|
/** Closed set of span categories; `unknown` is the catch-all member. */
export type SpanKind = 'tool' | 'llm' | 'retrieval' | 'reasoning' | 'unknown';
|
|
16
21
|
export interface TraceTree {
|
|
@@ -25,6 +30,24 @@ export interface RuleTaxonomy {
|
|
|
25
30
|
signalsAxis: 'interaction' | 'execution' | 'environment';
|
|
26
31
|
msClass: 'retry_loop' | 'tool_misuse' | 'context_loss' | 'goal_drift' | 'cascading_error' | 'silent_quality_degradation';
|
|
27
32
|
}
|
|
33
|
+
/**
 * Describes a single entry of `RubricSpec.inputs`.
 * Both fields are plain strings; their allowed values are not constrained here.
 */
export interface RubricInputSpec {
    /** Where the input comes from (free-form string at the type level). */
    source: string;
    /** Category of the input (free-form string at the type level). */
    kind: string;
}
|
|
37
|
+
/**
 * Rubric configuration attached to a rule via `Rule.rubric`.
 */
export interface RubricSpec {
    /** Question text associated with the rubric judge. */
    judgeQuestion: string;
    /** Input descriptors for the rubric. */
    inputs: Array<RubricInputSpec>;
    /** The original JSON-Schema-like object, retained for YAML round-trips and debugging. */
    outputSchemaRaw: Record<string, unknown>;
    /** zod schema compiled once at load time by the output-schema-converter. */
    outputZodSchema: import("zod").ZodTypeAny;
    /** Names the provider and the prompt template this rubric is bound to. */
    agentBinding: {
        provider: string;
        promptTemplateRef: string;
    };
    /** Optional gating list; see RuleSchema.rubric.gates_on. */
    gatesOn?: Array<string>;
}
|
|
28
51
|
export interface Rule {
|
|
29
52
|
schemaVersion: 'diagnosis-rule/v1';
|
|
30
53
|
id: string;
|
|
@@ -38,10 +61,13 @@ export interface Rule {
|
|
|
38
61
|
verifyWith: {
|
|
39
62
|
assertionTemplates: string[];
|
|
40
63
|
};
|
|
41
|
-
predicateRef
|
|
64
|
+
/** Exactly one of `predicateRef` or `rubric` is non-null (XOR enforced at load). */
|
|
65
|
+
predicateRef: string | null;
|
|
66
|
+
rubric: RubricSpec | null;
|
|
42
67
|
params: Record<string, unknown>;
|
|
43
68
|
sourcePath: string;
|
|
44
69
|
}
|
|
70
|
+
/** Discriminator stored on `Finding.judgmentKind`: either 'symbolic' or 'rubric'. */
export type JudgmentKind = 'symbolic' | 'rubric';
|
|
45
71
|
export interface Hit {
|
|
46
72
|
evidenceSpans: string[];
|
|
47
73
|
excerpt: string;
|
|
@@ -50,7 +76,7 @@ export interface Hit {
|
|
|
50
76
|
/** Function type: receives a trace tree plus a params record and returns a list of hits. */
export type Predicate = (trace: TraceTree, params: Record<string, unknown>) => Hit[];
|
|
51
77
|
export interface Finding {
|
|
52
78
|
ruleId: string;
|
|
53
|
-
judgmentKind:
|
|
79
|
+
judgmentKind: JudgmentKind;
|
|
54
80
|
severity: 'low' | 'medium' | 'high';
|
|
55
81
|
symptom: string;
|
|
56
82
|
likelyCause: string;
|
|
@@ -62,7 +88,8 @@ export interface Finding {
|
|
|
62
88
|
target: string;
|
|
63
89
|
change: string;
|
|
64
90
|
};
|
|
65
|
-
|
|
91
|
+
/** Symbolic always 'low' (no semantic basis); rubric carries agent confidence. */
|
|
92
|
+
confidence: 'low' | 'medium' | 'high';
|
|
66
93
|
verifyWith: {
|
|
67
94
|
suggestedEvalCase: {
|
|
68
95
|
queryId: string | null;
|
|
@@ -100,13 +127,13 @@ export interface Report {
|
|
|
100
127
|
run: {
|
|
101
128
|
diagnosedAt: string;
|
|
102
129
|
cliVersion: string;
|
|
103
|
-
mode: 'symbolic-only';
|
|
130
|
+
mode: 'symbolic-only' | 'rubric-only' | 'hybrid';
|
|
104
131
|
rulesApplied: string[];
|
|
105
132
|
rulesSkipped: {
|
|
106
133
|
ruleId: string;
|
|
107
134
|
reason: string;
|
|
108
135
|
}[];
|
|
109
|
-
synthesizerMode: 'template';
|
|
136
|
+
synthesizerMode: 'template' | 'agent';
|
|
110
137
|
};
|
|
111
138
|
summary: Summary;
|
|
112
139
|
findings: Finding[];
|
|
@@ -115,10 +142,32 @@ export interface DiagnoseOpts {
|
|
|
115
142
|
out: string | null;
|
|
116
143
|
rulesDir: string | null;
|
|
117
144
|
noBuiltin: boolean;
|
|
118
|
-
|
|
145
|
+
/** PR-B: when true, skip rubric rules (warn + record in rules_skipped) AND
|
|
146
|
+
* fall the synthesizer back from agent → template. Default is now false
|
|
147
|
+
* (both pillars on). */
|
|
148
|
+
noLlm: boolean;
|
|
149
|
+
/** Skip artifact persistence. Default false (artifacts ARE written). */
|
|
150
|
+
noArtifacts?: boolean;
|
|
151
|
+
/** Override default provider used by the agent synthesizer (rubric rules
|
|
152
|
+
* pick their own provider via `agent_binding.provider`). null = registry default. */
|
|
119
153
|
agentProvider: string | null;
|
|
120
154
|
timeoutMs: number;
|
|
121
155
|
baseUrl: string;
|
|
122
156
|
token: string;
|
|
123
157
|
businessDomain: string;
|
|
158
|
+
/**
|
|
159
|
+
* Output format(s). yaml is the source of truth (always re-derivable into
|
|
160
|
+
* markdown). When `--out` is a file path, `both` writes <stem>.yaml +
|
|
161
|
+
* <stem>.md side by side; `yaml` or `markdown` writes a single file at the
|
|
162
|
+
* given path. When `--out` is null (stdout), `both` collapses to yaml only —
|
|
163
|
+
* piping markdown to a downstream YAML consumer would silently corrupt it.
|
|
164
|
+
* Default: 'both' when out is a file, 'yaml' when stdout.
|
|
165
|
+
*/
|
|
166
|
+
format?: 'yaml' | 'markdown' | 'both';
|
|
167
|
+
/**
|
|
168
|
+
* Output locale for agent-judged natural-language fields (rubric reasoning,
|
|
169
|
+
* synthesizer headline / description / fix_priority reason). Default 'en'.
|
|
170
|
+
* Affects only prose; JSON keys / enum values / span IDs always stay English.
|
|
171
|
+
*/
|
|
172
|
+
lang?: 'en' | 'zh';
|
|
124
173
|
}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import type { TraceSpan } from "../../api/conversations.js";
|
|
2
|
+
import type { EvalAssertion, EvalReference } from "./types.js";
|
|
3
|
+
/**
 * Outcome returned by a semantic-match judge.
 */
export interface SemanticMatchVerdict {
    /** Free-text explanation accompanying the verdict. */
    reasoning: string;
    /** Binary outcome of the comparison. */
    verdict: "pass" | "fail";
}
|
|
7
|
+
/**
 * Pluggable judge used by the `semantic_match` assertion.
 */
export interface SemanticMatchProvider {
    /**
     * Judge a candidate answer against a reference answer for a given question.
     *
     * @param question - The question the answers respond to.
     * @param candidateAnswer - The answer being evaluated.
     * @param referenceAnswer - The reference answer to compare against.
     * @returns A promise resolving to the verdict and its reasoning.
     */
    judgeSemanticMatch(
        question: string,
        candidateAnswer: string,
        referenceAnswer: string,
    ): Promise<SemanticMatchVerdict>;
}
|
|
10
|
+
/**
 * Everything an assertion may inspect when it is evaluated.
 */
export interface AssertionContext {
    /** Answer text the assertions are checked against. */
    answer: string;
    /** Trace spans available to tool-call assertions. */
    spans: Array<TraceSpan>;
    /** Optional reference data for the eval case. */
    reference?: EvalReference;
    /** Optional duration in milliseconds, used by latency assertions. */
    durationMs?: number;
    /**
     * User message that led to `answer`. Serves as the default
     * `{{question}}` for `semantic_match` unless the assertion supplies
     * its own, so case authors need not repeat user_message in every
     * semantic_match block.
     */
    question?: string;
    /** Optional judge used by `semantic_match` assertions. */
    semanticMatchProvider?: SemanticMatchProvider;
}
|
|
24
|
+
/**
 * Result of evaluating one assertion.
 */
export interface AssertionResult {
    /** Outcome of the assertion; "skip" means it could not be evaluated. */
    verdict: "pass" | "fail" | "skip";
    /** Optional observed value reported alongside the verdict. */
    actual?: unknown;
    /** Optional explanation, e.g. why an assertion was skipped. */
    reason?: string;
}
|
|
29
|
+
/**
 * Evaluate one eval assertion against the supplied context.
 *
 * @param assertion - The assertion to check.
 * @param ctx - Answer, spans and optional extras the assertion may read.
 * @returns A promise resolving to the pass / fail / skip result.
 */
export declare function evaluateAssertion(assertion: EvalAssertion, ctx: AssertionContext): Promise<AssertionResult>;
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
/**
 * Compare `actual` with `expected` using a named comparison operator.
 *
 * @param {*} actual - Left-hand operand.
 * @param {string} op - One of "eq", "lt", "lte", "gt", "gte".
 * @param {*} expected - Right-hand operand.
 * @returns {boolean} Result of the comparison; `false` when `op` is not recognized.
 */
function applyOp(actual, op, expected) {
    switch (op) {
        case "eq": return actual === expected;
        case "lt": return actual < expected;
        case "lte": return actual <= expected;
        case "gt": return actual > expected;
        case "gte": return actual >= expected;
        // Without a default the function fell off the end and returned
        // `undefined` for unknown operators. Return an explicit boolean so
        // callers always receive the same type (still falsy, as before).
        default: return false;
    }
}
|
|
10
|
+
/**
 * Select the tool spans whose `gen_ai.tool.name` attribute equals `toolName`.
 *
 * @param {Array<object>} spans - Spans to search.
 * @param {string} toolName - Tool name to match exactly.
 * @returns {Array<object>} New array with the matching spans, in input order.
 */
function toolCallsFor(spans, toolName) {
    const isMatchingToolCall = (span) => {
        if (span.kind !== "tool") {
            return false;
        }
        return span.attributes?.["gen_ai.tool.name"] === toolName;
    };
    return spans.filter(isMatchingToolCall);
}
|
|
13
|
+
/**
 * List the tool names of all tool spans, ordered by ascending `startTime`.
 *
 * The input array is left untouched; sorting happens on the filtered copy.
 * A span without a `gen_ai.tool.name` attribute contributes an empty string.
 *
 * @param {Array<object>} spans - Spans to inspect.
 * @returns {string[]} Tool names in start-time order.
 */
function sortedToolNames(spans) {
    const byStartTime = (left, right) => {
        if (left.startTime < right.startTime) {
            return -1;
        }
        if (left.startTime > right.startTime) {
            return 1;
        }
        return 0;
    };
    // `filter` already yields a fresh array, so sorting it in place is safe.
    const toolSpans = spans.filter((span) => span.kind === "tool");
    toolSpans.sort(byStartTime);
    return toolSpans.map((span) => String(span.attributes?.["gen_ai.tool.name"] ?? ""));
}
|
|
20
|
+
/**
 * Check whether `sequence` occurs within `actual` as an ordered (not
 * necessarily contiguous) subsequence.
 *
 * @param {string[]} sequence - Names that must appear in this order.
 * @param {string[]} actual - Observed names to scan.
 * @returns {boolean} `true` when every element of `sequence` is found in order.
 */
function isSubsequence(sequence, actual) {
    // An empty sequence is a subsequence of anything. Previously this case
    // returned true only when `actual` was non-empty, because the length
    // check lived inside the loop body.
    if (sequence.length === 0) {
        return true;
    }
    let matched = 0;
    for (const name of actual) {
        if (name === sequence[matched]) {
            matched++;
            if (matched === sequence.length) {
                return true;
            }
        }
    }
    return false;
}
|
|
30
|
+
/**
 * Evaluate a single eval assertion against an answer and its trace context.
 *
 * Supported assertion types: "contains", "not_contains", "regex",
 * "tool_call_count", "tool_call_order", "latency_ms" and "semantic_match".
 * Misconfigured or unsupported assertions resolve to a "skip" verdict with
 * a reason rather than throwing.
 *
 * @param {object} assertion - Assertion definition; `type` selects the check.
 * @param {object} ctx - Evaluation context (`answer`, `spans`, and optionally
 *   `durationMs`, `reference`, `question`, `semanticMatchProvider`).
 * @returns {Promise<{verdict: string, actual?: unknown, reason?: string}>}
 *   The assertion outcome.
 */
export async function evaluateAssertion(assertion, ctx) {
    const { answer, spans, durationMs } = ctx;
    const a = assertion;
    // An unrecognized comparison operator used to evaluate as a silent
    // "fail". Report it as "skip" with a reason, matching how an invalid
    // regex or an unknown assertion type is handled below.
    const knownOps = new Set(["eq", "lt", "lte", "gt", "gte"]);
    const unknownOp = (op) => ({ verdict: "skip", reason: `unknown op: ${String(op)}` });
    switch (assertion.type) {
        case "contains": {
            const value = String(a["value"] ?? "");
            return answer.includes(value)
                ? { verdict: "pass" }
                : { verdict: "fail", actual: answer };
        }
        case "not_contains": {
            const value = String(a["value"] ?? "");
            return answer.includes(value)
                ? { verdict: "fail", actual: answer }
                : { verdict: "pass" };
        }
        case "regex": {
            const pattern = String(a["pattern"] ?? "");
            let re;
            try {
                re = new RegExp(pattern);
            }
            catch {
                // A pattern that does not compile is a case-authoring error, not a failure.
                return { verdict: "skip", reason: "invalid-regex: " + pattern };
            }
            return re.test(answer) ? { verdict: "pass" } : { verdict: "fail", actual: answer };
        }
        case "tool_call_count": {
            const tool = String(a["tool"] ?? "");
            const op = a["op"] ?? "eq";
            if (!knownOps.has(op)) {
                return unknownOp(op);
            }
            const value = Number(a["value"] ?? 0);
            const count = toolCallsFor(spans, tool).length;
            return applyOp(count, op, value)
                ? { verdict: "pass", actual: count }
                : { verdict: "fail", actual: count };
        }
        case "tool_call_order": {
            // A non-array `sequence` is treated as an empty sequence.
            const sequence = Array.isArray(a["sequence"])
                ? a["sequence"].map(String)
                : [];
            const actual = sortedToolNames(spans);
            return isSubsequence(sequence, actual)
                ? { verdict: "pass", actual }
                : { verdict: "fail", actual };
        }
        case "latency_ms": {
            if (durationMs === undefined || durationMs === null) {
                return { verdict: "skip", reason: "durationMs not available" };
            }
            const op = a["op"] ?? "lte";
            if (!knownOps.has(op)) {
                return unknownOp(op);
            }
            const value = Number(a["value"] ?? 0);
            return applyOp(durationMs, op, value)
                ? { verdict: "pass", actual: durationMs }
                : { verdict: "fail", actual: durationMs };
        }
        case "semantic_match": {
            const provider = ctx.semanticMatchProvider;
            if (!provider) {
                return { verdict: "skip", reason: "semantic_match requires a provider; pass semanticMatchProvider in context" };
            }
            if (!ctx.reference?.answer) {
                return { verdict: "skip", reason: "semantic_match requires reference.answer on the eval case" };
            }
            // The assertion's own question wins; otherwise fall back to the context question.
            const question = String(a["question"] ?? ctx.question ?? "");
            const smv = await provider.judgeSemanticMatch(question, answer, ctx.reference.answer);
            return { verdict: smv.verdict, actual: smv.reasoning };
        }
        default:
            return { verdict: "skip", reason: `unknown assertion type: ${assertion.type}` };
    }
}
|