@kweaver-ai/kweaver-sdk 0.7.4 → 0.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (190) hide show
  1. package/README.md +39 -5
  2. package/README.zh.md +37 -5
  3. package/dist/agent-providers/index.d.ts +7 -0
  4. package/dist/agent-providers/index.js +5 -0
  5. package/dist/agent-providers/prompt-template.d.ts +62 -0
  6. package/dist/agent-providers/prompt-template.js +105 -0
  7. package/dist/agent-providers/prompts/rubric-judge-v1.prompt.md +51 -0
  8. package/dist/agent-providers/prompts/within-trace-synthesizer-v1.prompt.md +60 -0
  9. package/dist/agent-providers/providers/claude-code-subprocess.d.ts +74 -0
  10. package/dist/agent-providers/providers/claude-code-subprocess.js +259 -0
  11. package/dist/agent-providers/providers/stub.d.ts +47 -0
  12. package/dist/agent-providers/providers/stub.js +77 -0
  13. package/dist/agent-providers/registry.d.ts +45 -0
  14. package/dist/agent-providers/registry.js +77 -0
  15. package/dist/agent-providers/types.d.ts +91 -0
  16. package/dist/agent-providers/types.js +25 -0
  17. package/dist/api/agent-chat.js +8 -6
  18. package/dist/api/agent-observability.d.ts +51 -0
  19. package/dist/api/agent-observability.js +108 -0
  20. package/dist/api/context-loader.d.ts +1 -0
  21. package/dist/api/conversations.d.ts +4 -8
  22. package/dist/api/conversations.js +16 -58
  23. package/dist/api/datasources.d.ts +2 -20
  24. package/dist/api/datasources.js +7 -123
  25. package/dist/api/semantic-search.d.ts +5 -0
  26. package/dist/api/semantic-search.js +5 -0
  27. package/dist/api/skills.d.ts +75 -2
  28. package/dist/api/skills.js +108 -12
  29. package/dist/api/trace.d.ts +49 -0
  30. package/dist/api/trace.js +85 -0
  31. package/dist/api/vega.d.ts +53 -0
  32. package/dist/api/vega.js +144 -0
  33. package/dist/cli.js +12 -5
  34. package/dist/commands/agent/mode.d.ts +6 -0
  35. package/dist/commands/agent/mode.js +75 -0
  36. package/dist/commands/agent.js +101 -29
  37. package/dist/commands/bkn-ops.js +12 -6
  38. package/dist/commands/bkn-utils.d.ts +9 -0
  39. package/dist/commands/bkn-utils.js +17 -0
  40. package/dist/commands/context-loader.js +608 -38
  41. package/dist/commands/ds.js +7 -2
  42. package/dist/commands/skill.d.ts +21 -1
  43. package/dist/commands/skill.js +389 -1
  44. package/dist/commands/trace.d.ts +39 -0
  45. package/dist/commands/trace.js +668 -0
  46. package/dist/index.d.ts +2 -2
  47. package/dist/index.js +1 -1
  48. package/dist/resources/bkn.d.ts +5 -0
  49. package/dist/resources/bkn.js +5 -0
  50. package/dist/resources/datasources.js +2 -1
  51. package/dist/resources/skills.d.ts +17 -1
  52. package/dist/resources/skills.js +32 -1
  53. package/dist/trace-ai/diagnose/agent-binding.d.ts +67 -0
  54. package/dist/trace-ai/diagnose/agent-binding.js +257 -0
  55. package/dist/trace-ai/diagnose/builtin-rules/excessive-tool-calls-per-turn.d.ts +2 -0
  56. package/dist/trace-ai/diagnose/builtin-rules/excessive-tool-calls-per-turn.js +15 -0
  57. package/dist/trace-ai/diagnose/builtin-rules/excessive-tool-calls-per-turn.yaml +16 -0
  58. package/dist/trace-ai/diagnose/builtin-rules/llm-response-truncated-no-continue.d.ts +2 -0
  59. package/dist/trace-ai/diagnose/builtin-rules/llm-response-truncated-no-continue.js +44 -0
  60. package/dist/trace-ai/diagnose/builtin-rules/llm-response-truncated-no-continue.yaml +15 -0
  61. package/dist/trace-ai/diagnose/builtin-rules/register.d.ts +1 -0
  62. package/dist/trace-ai/diagnose/builtin-rules/register.js +11 -0
  63. package/dist/trace-ai/diagnose/builtin-rules/retrieval-empty-no-fallback.d.ts +2 -0
  64. package/dist/trace-ai/diagnose/builtin-rules/retrieval-empty-no-fallback.js +29 -0
  65. package/dist/trace-ai/diagnose/builtin-rules/retrieval-empty-no-fallback.yaml +15 -0
  66. package/dist/trace-ai/diagnose/builtin-rules/tool-error-swallowed.d.ts +2 -0
  67. package/dist/trace-ai/diagnose/builtin-rules/tool-error-swallowed.js +45 -0
  68. package/dist/trace-ai/diagnose/builtin-rules/tool-error-swallowed.yaml +15 -0
  69. package/dist/trace-ai/diagnose/builtin-rules/tool-loop-no-state-change.d.ts +2 -0
  70. package/dist/trace-ai/diagnose/builtin-rules/tool-loop-no-state-change.js +38 -0
  71. package/dist/trace-ai/diagnose/builtin-rules/tool-loop-no-state-change.yaml +16 -0
  72. package/dist/trace-ai/diagnose/builtin-rules/tool-retry-intent-mismatch.yaml +68 -0
  73. package/dist/trace-ai/diagnose/index.d.ts +32 -0
  74. package/dist/trace-ai/diagnose/index.js +246 -0
  75. package/dist/trace-ai/diagnose/output-schema-converter.d.ts +24 -0
  76. package/dist/trace-ai/diagnose/output-schema-converter.js +81 -0
  77. package/dist/trace-ai/diagnose/predicate-registry.d.ts +7 -0
  78. package/dist/trace-ai/diagnose/predicate-registry.js +30 -0
  79. package/dist/trace-ai/diagnose/query-extractor.d.ts +14 -0
  80. package/dist/trace-ai/diagnose/query-extractor.js +45 -0
  81. package/dist/trace-ai/diagnose/report-assembler.d.ts +31 -0
  82. package/dist/trace-ai/diagnose/report-assembler.js +100 -0
  83. package/dist/trace-ai/diagnose/report-markdown.d.ts +18 -0
  84. package/dist/trace-ai/diagnose/report-markdown.js +192 -0
  85. package/dist/trace-ai/diagnose/rule-loader.d.ts +11 -0
  86. package/dist/trace-ai/diagnose/rule-loader.js +120 -0
  87. package/dist/trace-ai/diagnose/schemas.d.ts +184 -0
  88. package/dist/trace-ai/diagnose/schemas.js +154 -0
  89. package/dist/trace-ai/diagnose/signal-probe.d.ts +17 -0
  90. package/dist/trace-ai/diagnose/signal-probe.js +39 -0
  91. package/dist/trace-ai/diagnose/synthesizer-agent.d.ts +40 -0
  92. package/dist/trace-ai/diagnose/synthesizer-agent.js +158 -0
  93. package/dist/trace-ai/diagnose/synthesizer-template.d.ts +2 -0
  94. package/dist/trace-ai/diagnose/synthesizer-template.js +49 -0
  95. package/dist/trace-ai/diagnose/trace-shaper.d.ts +3 -0
  96. package/dist/trace-ai/diagnose/trace-shaper.js +73 -0
  97. package/dist/trace-ai/diagnose/types.d.ts +173 -0
  98. package/dist/trace-ai/diagnose/types.js +1 -0
  99. package/dist/trace-ai/eval-set/assertion-evaluator.d.ts +29 -0
  100. package/dist/trace-ai/eval-set/assertion-evaluator.js +100 -0
  101. package/dist/trace-ai/eval-set/builder.d.ts +36 -0
  102. package/dist/trace-ai/eval-set/builder.js +126 -0
  103. package/dist/trace-ai/eval-set/index.d.ts +15 -0
  104. package/dist/trace-ai/eval-set/index.js +10 -0
  105. package/dist/trace-ai/eval-set/output-writer.d.ts +27 -0
  106. package/dist/trace-ai/eval-set/output-writer.js +126 -0
  107. package/dist/trace-ai/eval-set/query-picker.d.ts +37 -0
  108. package/dist/trace-ai/eval-set/query-picker.js +147 -0
  109. package/dist/trace-ai/eval-set/redactor.d.ts +42 -0
  110. package/dist/trace-ai/eval-set/redactor.js +133 -0
  111. package/dist/trace-ai/eval-set/rubric-templates/answer-match-reference.prompt.md +19 -0
  112. package/dist/trace-ai/eval-set/schemas.d.ts +136 -0
  113. package/dist/trace-ai/eval-set/schemas.js +130 -0
  114. package/dist/trace-ai/eval-set/semantic-match-provider.d.ts +33 -0
  115. package/dist/trace-ai/eval-set/semantic-match-provider.js +51 -0
  116. package/dist/trace-ai/eval-set/test-runner.d.ts +34 -0
  117. package/dist/trace-ai/eval-set/test-runner.js +153 -0
  118. package/dist/trace-ai/eval-set/types.d.ts +46 -0
  119. package/dist/trace-ai/eval-set/types.js +8 -0
  120. package/dist/trace-ai/exp/bundle-writer.d.ts +10 -0
  121. package/dist/trace-ai/exp/bundle-writer.js +54 -0
  122. package/dist/trace-ai/exp/claude-binary.d.ts +5 -0
  123. package/dist/trace-ai/exp/claude-binary.js +30 -0
  124. package/dist/trace-ai/exp/coordinator.d.ts +45 -0
  125. package/dist/trace-ai/exp/coordinator.js +203 -0
  126. package/dist/trace-ai/exp/eval-runner.d.ts +14 -0
  127. package/dist/trace-ai/exp/eval-runner.js +47 -0
  128. package/dist/trace-ai/exp/exp-store/abort-signal.d.ts +3 -0
  129. package/dist/trace-ai/exp/exp-store/abort-signal.js +27 -0
  130. package/dist/trace-ai/exp/exp-store/candidate-lineage-yaml.d.ts +4 -0
  131. package/dist/trace-ai/exp/exp-store/candidate-lineage-yaml.js +37 -0
  132. package/dist/trace-ai/exp/exp-store/events-jsonl.d.ts +17 -0
  133. package/dist/trace-ai/exp/exp-store/events-jsonl.js +60 -0
  134. package/dist/trace-ai/exp/exp-store/exp-registry.d.ts +6 -0
  135. package/dist/trace-ai/exp/exp-store/exp-registry.js +41 -0
  136. package/dist/trace-ai/exp/exp-store/index.d.ts +46 -0
  137. package/dist/trace-ai/exp/exp-store/index.js +59 -0
  138. package/dist/trace-ai/exp/exp-store/lock.d.ts +3 -0
  139. package/dist/trace-ai/exp/exp-store/lock.js +73 -0
  140. package/dist/trace-ai/exp/exp-store/mission-md.d.ts +3 -0
  141. package/dist/trace-ai/exp/exp-store/mission-md.js +37 -0
  142. package/dist/trace-ai/exp/exp-store/readme-template.d.ts +5 -0
  143. package/dist/trace-ai/exp/exp-store/readme-template.js +25 -0
  144. package/dist/trace-ai/exp/exp-store/round-yaml.d.ts +3 -0
  145. package/dist/trace-ai/exp/exp-store/round-yaml.js +33 -0
  146. package/dist/trace-ai/exp/index.d.ts +8 -0
  147. package/dist/trace-ai/exp/index.js +238 -0
  148. package/dist/trace-ai/exp/info.d.ts +35 -0
  149. package/dist/trace-ai/exp/info.js +120 -0
  150. package/dist/trace-ai/exp/patch/agent-config.d.ts +1 -0
  151. package/dist/trace-ai/exp/patch/agent-config.js +26 -0
  152. package/dist/trace-ai/exp/patch/index.d.ts +2 -0
  153. package/dist/trace-ai/exp/patch/index.js +13 -0
  154. package/dist/trace-ai/exp/patch/skill.d.ts +1 -0
  155. package/dist/trace-ai/exp/patch/skill.js +24 -0
  156. package/dist/trace-ai/exp/providers/synthesizer-client.d.ts +14 -0
  157. package/dist/trace-ai/exp/providers/synthesizer-client.js +39 -0
  158. package/dist/trace-ai/exp/providers/triage-client.d.ts +19 -0
  159. package/dist/trace-ai/exp/providers/triage-client.js +51 -0
  160. package/dist/trace-ai/exp/schemas.d.ts +147 -0
  161. package/dist/trace-ai/exp/schemas.js +50 -0
  162. package/dist/trace-ai/exp/scoring.d.ts +2 -0
  163. package/dist/trace-ai/exp/scoring.js +46 -0
  164. package/dist/trace-ai/scan/aggregator.d.ts +20 -0
  165. package/dist/trace-ai/scan/aggregator.js +26 -0
  166. package/dist/trace-ai/scan/artifacts/paths.d.ts +12 -0
  167. package/dist/trace-ai/scan/artifacts/paths.js +18 -0
  168. package/dist/trace-ai/scan/artifacts/writer.d.ts +67 -0
  169. package/dist/trace-ai/scan/artifacts/writer.js +96 -0
  170. package/dist/trace-ai/scan/batched-rubric.d.ts +55 -0
  171. package/dist/trace-ai/scan/batched-rubric.js +159 -0
  172. package/dist/trace-ai/scan/cross-trace-synthesizer.d.ts +24 -0
  173. package/dist/trace-ai/scan/cross-trace-synthesizer.js +93 -0
  174. package/dist/trace-ai/scan/index.d.ts +31 -0
  175. package/dist/trace-ai/scan/index.js +390 -0
  176. package/dist/trace-ai/scan/prompts/builtin/cross-trace-synthesizer-v1.prompt.md +44 -0
  177. package/dist/trace-ai/scan/prompts/builtin/rubric-judge-batch-v1.prompt.md +44 -0
  178. package/dist/trace-ai/scan/runner.d.ts +25 -0
  179. package/dist/trace-ai/scan/runner.js +42 -0
  180. package/dist/trace-ai/scan/sampler.d.ts +18 -0
  181. package/dist/trace-ai/scan/sampler.js +81 -0
  182. package/dist/trace-ai/scan/scan-summary-markdown.d.ts +2 -0
  183. package/dist/trace-ai/scan/scan-summary-markdown.js +71 -0
  184. package/dist/trace-ai/scan/scan-summary-schema.d.ts +73 -0
  185. package/dist/trace-ai/scan/scan-summary-schema.js +61 -0
  186. package/dist/trace-ai/scan/single-agent-validator.d.ts +23 -0
  187. package/dist/trace-ai/scan/single-agent-validator.js +42 -0
  188. package/dist/trace-ai/scan/traces-list-parser.d.ts +15 -0
  189. package/dist/trace-ai/scan/traces-list-parser.js +46 -0
  190. package/package.json +14 -4
@@ -0,0 +1,73 @@
1
// Map from OTel GenAI `gen_ai.operation.name` (the cross-runtime standard) to
// the diagnostic SpanKind buckets rules filter on. `agent.trace.type` is kept
// as a fallback for runtimes/fixtures that pre-tag spans with our own taxonomy.
const KIND_MAP = {
    // OTel GenAI semconv operation names
    chat: "llm",
    text_completion: "llm",
    embeddings: "retrieval",
    execute_tool: "tool",
    // pre-existing custom taxonomy (synthetic fixtures, optional runtime tag)
    model: "llm",
    llm: "llm",
    tool: "tool",
    retrieval: "retrieval",
    reasoning: "reasoning",
};
/**
 * Classify a span into a diagnostic kind from its attributes.
 *
 * `gen_ai.operation.name` is consulted first; `agent.trace.type` is only used
 * when the first attribute is absent, not a string, or not a known tag.
 *
 * @param attrs Span attribute bag (string keys, arbitrary values).
 * @returns One of the values of `KIND_MAP`, or `"unknown"` when neither
 *          attribute carries a recognised tag.
 */
function deriveKind(attrs) {
    for (const key of ["gen_ai.operation.name", "agent.trace.type"]) {
        const tag = attrs[key];
        // Own-property check rather than `in`: `in` also matches members
        // inherited from Object.prototype, so a tag such as "toString" or
        // "constructor" would have returned a function instead of a kind.
        if (typeof tag === "string" && Object.prototype.hasOwnProperty.call(KIND_MAP, tag)) {
            return KIND_MAP[tag];
        }
    }
    return "unknown";
}
26
/**
 * Normalise a raw span status object into `"ok" | "error" | "unset"`.
 *
 * Accepted encodings of `raw.code`:
 *  - strings, case-insensitive: `"OK"` / `"ERROR"`, optionally carrying the
 *    OTLP enum prefix (`"STATUS_CODE_OK"` / `"STATUS_CODE_ERROR"`);
 *  - numbers, using the OTLP `Status.StatusCode` enum values
 *    (0 = UNSET, 1 = OK, 2 = ERROR).
 *
 * @param raw Raw status object; may be null/undefined or lack `code`.
 * @returns `"unset"` for a missing, unrecognised or non string/number code.
 */
function deriveStatus(raw) {
    const code = raw?.code;
    if (typeof code === "number") {
        // Previously a numeric code crashed on `.toUpperCase()`.
        if (code === 1)
            return "ok";
        if (code === 2)
            return "error";
        return "unset";
    }
    if (typeof code !== "string")
        return "unset";
    const label = code.toUpperCase().replace(/^STATUS_CODE_/, "");
    if (label === "OK")
        return "ok";
    if (label === "ERROR")
        return "error";
    return "unset";
}
34
/**
 * Span duration in whole milliseconds, computed from nanosecond timestamps.
 *
 * @param start Start time in Unix nanoseconds (decimal string).
 * @param end   End time in Unix nanoseconds (decimal string).
 * @returns `end - start` converted to milliseconds, truncated toward zero.
 *          Returns 0 when either timestamp is missing or cannot be parsed as
 *          an integer, so one malformed span does not abort the whole trace.
 */
function durationMs(start, end) {
    if (!start || !end)
        return 0;
    let startNs;
    let endNs;
    try {
        // string nanos → BigInt to avoid precision loss, then convert to ms.
        startNs = BigInt(start);
        endNs = BigInt(end);
    }
    catch {
        // BigInt() throws SyntaxError on non-integer strings and RangeError on
        // non-integer numbers; treat both like a missing timestamp.
        return 0;
    }
    return Number((endNs - startNs) / 1000000n);
}
42
/**
 * Build the in-memory trace model from a flat list of raw spans.
 *
 * Each raw span is normalised (kind, status and duration are derived, missing
 * fields get defaults) and indexed three ways: by span id, by parent span id
 * and by kind. Input order is preserved inside every index bucket.
 *
 * @param traceId Identifier copied verbatim onto the result.
 * @param raw     Array of raw span records.
 * @returns `{ traceId, spans, byId, parentToChildren, byKind, root }` where
 *          `root` is the first span without a parent, or `null` if none exists.
 */
export function assembleTraceTree(traceId, raw) {
    const spans = [];
    const byId = new Map();
    const parentToChildren = new Map();
    const byKind = new Map();
    // Append `span` to the bucket stored under `key`, creating it on first use.
    const addToBucket = (index, key, span) => {
        const bucket = index.get(key);
        if (bucket) {
            bucket.push(span);
        }
        else {
            index.set(key, [span]);
        }
    };
    for (const r of raw) {
        const attrs = r.attributes ?? {};
        const span = {
            spanId: r.spanId,
            // A missing parent is stored as null. An empty string is treated the
            // same way; otherwise such a root span would be filed under "" and
            // the `get(null)` lookup below would never find it.
            // NOTE(review): assumes "" can only mean "no parent" — confirm
            // against the payloads returned by the trace API.
            parentSpanId: r.parentSpanId ? r.parentSpanId : null,
            name: r.name ?? "",
            kind: deriveKind(attrs),
            startTimeUnixNano: r.startTimeUnixNano ?? "0",
            endTimeUnixNano: r.endTimeUnixNano ?? "0",
            durationMs: durationMs(r.startTimeUnixNano, r.endTimeUnixNano),
            status: deriveStatus(r.status),
            attributes: attrs,
            events: r.events,
        };
        spans.push(span);
        byId.set(span.spanId, span);
        addToBucket(parentToChildren, span.parentSpanId, span);
        addToBucket(byKind, span.kind, span);
    }
    const roots = parentToChildren.get(null) ?? [];
    const root = roots.length > 0 ? roots[0] : null;
    return { traceId, spans, byId, parentToChildren, byKind, root };
}
@@ -0,0 +1,173 @@
1
+ export interface SpanAttributes {
2
+ [key: string]: unknown;
3
+ }
4
+ export interface Span {
5
+ spanId: string;
6
+ parentSpanId: string | null;
7
+ name: string;
8
+ kind: SpanKind;
9
+ startTimeUnixNano: string;
10
+ endTimeUnixNano: string;
11
+ durationMs: number;
12
+ status: 'ok' | 'error' | 'unset';
13
+ attributes: SpanAttributes;
14
+ events?: Array<{
15
+ name?: string;
16
+ time?: string;
17
+ attributes?: Record<string, unknown>;
18
+ }>;
19
+ }
20
+ export type SpanKind = 'tool' | 'llm' | 'retrieval' | 'reasoning' | 'unknown';
21
+ export interface TraceTree {
22
+ traceId: string;
23
+ spans: Span[];
24
+ byId: Map<string, Span>;
25
+ parentToChildren: Map<string | null, Span[]>;
26
+ byKind: Map<SpanKind, Span[]>;
27
+ root: Span | null;
28
+ }
29
+ export interface RuleTaxonomy {
30
+ signalsAxis: 'interaction' | 'execution' | 'environment';
31
+ msClass: 'retry_loop' | 'tool_misuse' | 'context_loss' | 'goal_drift' | 'cascading_error' | 'silent_quality_degradation';
32
+ }
33
+ export interface RubricInputSpec {
34
+ kind: string;
35
+ source: string;
36
+ }
37
+ export interface RubricSpec {
38
+ judgeQuestion: string;
39
+ inputs: RubricInputSpec[];
40
+ /** Original JSON-Schema-ish blob (kept for YAML round-trips / debug). */
41
+ outputSchemaRaw: Record<string, unknown>;
42
+ /** Compiled zod schema (built once at load time via output-schema-converter). */
43
+ outputZodSchema: import("zod").ZodTypeAny;
44
+ agentBinding: {
45
+ provider: string;
46
+ promptTemplateRef: string;
47
+ };
48
+ /** Optional gating; see RuleSchema.rubric.gates_on. */
49
+ gatesOn?: string[];
50
+ }
51
+ export interface Rule {
52
+ schemaVersion: 'diagnosis-rule/v1';
53
+ id: string;
54
+ severity: 'low' | 'medium' | 'high';
55
+ symptom: string;
56
+ taxonomy: RuleTaxonomy;
57
+ suggestedFix: {
58
+ target: string;
59
+ changeTemplate: string;
60
+ };
61
+ verifyWith: {
62
+ assertionTemplates: string[];
63
+ };
64
+ /** Exactly one of `predicateRef` or `rubric` is non-null (XOR enforced at load). */
65
+ predicateRef: string | null;
66
+ rubric: RubricSpec | null;
67
+ params: Record<string, unknown>;
68
+ sourcePath: string;
69
+ }
70
+ export type JudgmentKind = 'symbolic' | 'rubric';
71
+ export interface Hit {
72
+ evidenceSpans: string[];
73
+ excerpt: string;
74
+ bindings: Record<string, unknown>;
75
+ }
76
+ export type Predicate = (trace: TraceTree, params: Record<string, unknown>) => Hit[];
77
+ export interface Finding {
78
+ ruleId: string;
79
+ judgmentKind: JudgmentKind;
80
+ severity: 'low' | 'medium' | 'high';
81
+ symptom: string;
82
+ likelyCause: string;
83
+ evidence: {
84
+ spans: string[];
85
+ excerpt: string;
86
+ };
87
+ suggestedFix: {
88
+ target: string;
89
+ change: string;
90
+ };
91
+ /** Symbolic always 'low' (no semantic basis); rubric carries agent confidence. */
92
+ confidence: 'low' | 'medium' | 'high';
93
+ verifyWith: {
94
+ suggestedEvalCase: {
95
+ queryId: string | null;
96
+ query: string | null;
97
+ assertions: string[];
98
+ };
99
+ };
100
+ }
101
+ export interface SummaryRootCause {
102
+ findingIds: number[];
103
+ description: string;
104
+ targetForFix: string;
105
+ }
106
+ export interface SummaryFixPriority {
107
+ findingId: number;
108
+ reason: string;
109
+ }
110
+ export interface SummaryCrossLink {
111
+ findingIds: number[];
112
+ relation: string;
113
+ }
114
+ export interface Summary {
115
+ headline: string;
116
+ primaryRootCause: SummaryRootCause | null;
117
+ fixPriority: SummaryFixPriority[];
118
+ crossFindingLinks: SummaryCrossLink[];
119
+ }
120
+ export interface Report {
121
+ schemaVersion: 'trace-diagnose-report/v1';
122
+ trace: {
123
+ traceId: string;
124
+ agentId: string | null;
125
+ tenant: string | null;
126
+ };
127
+ run: {
128
+ diagnosedAt: string;
129
+ cliVersion: string;
130
+ mode: 'symbolic-only' | 'rubric-only' | 'hybrid';
131
+ rulesApplied: string[];
132
+ rulesSkipped: {
133
+ ruleId: string;
134
+ reason: string;
135
+ }[];
136
+ synthesizerMode: 'template' | 'agent';
137
+ };
138
+ summary: Summary;
139
+ findings: Finding[];
140
+ }
141
+ export interface DiagnoseOpts {
142
+ out: string | null;
143
+ rulesDir: string | null;
144
+ noBuiltin: boolean;
145
+ /** PR-B: when true, skip rubric rules (warn + record in rules_skipped) AND
146
+ * fall the synthesizer back from agent → template. Default is now false
147
+ * (both pillars on). */
148
+ noLlm: boolean;
149
+ /** Skip artifact persistence. Default false (artifacts ARE written). */
150
+ noArtifacts?: boolean;
151
+ /** Override default provider used by the agent synthesizer (rubric rules
152
+ * pick their own provider via `agent_binding.provider`). null = registry default. */
153
+ agentProvider: string | null;
154
+ timeoutMs: number;
155
+ baseUrl: string;
156
+ token: string;
157
+ businessDomain: string;
158
+ /**
159
+ * Output format(s). yaml is the source of truth (always re-derivable into
160
+ * markdown). When `--out` is a file path, `both` writes <stem>.yaml +
161
+ * <stem>.md side by side; `yaml` or `markdown` writes a single file at the
162
+ * given path. When `--out` is null (stdout), `both` collapses to yaml only —
163
+ * piping markdown to a downstream YAML consumer would silently corrupt it.
164
+ * Default: 'both' when out is a file, 'yaml' when stdout.
165
+ */
166
+ format?: 'yaml' | 'markdown' | 'both';
167
+ /**
168
+ * Output locale for agent-judged natural-language fields (rubric reasoning,
169
+ * synthesizer headline / description / fix_priority reason). Default 'en'.
170
+ * Affects only prose; JSON keys / enum values / span IDs always stay English.
171
+ */
172
+ lang?: 'en' | 'zh';
173
+ }
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,29 @@
1
+ import type { TraceSpan } from "../../api/conversations.js";
2
+ import type { EvalAssertion, EvalReference } from "./types.js";
3
+ export interface SemanticMatchVerdict {
4
+ verdict: "pass" | "fail";
5
+ reasoning: string;
6
+ }
7
+ export interface SemanticMatchProvider {
8
+ judgeSemanticMatch(question: string, candidateAnswer: string, referenceAnswer: string): Promise<SemanticMatchVerdict>;
9
+ }
10
+ export interface AssertionContext {
11
+ answer: string;
12
+ spans: TraceSpan[];
13
+ reference?: EvalReference;
14
+ durationMs?: number;
15
+ /**
16
+ * The user message that produced `answer`. Used as the default
17
+ * `{{question}}` for `semantic_match` when the assertion doesn't
18
+ * override it — case authors should not have to repeat user_message
19
+ * inside every semantic_match block.
20
+ */
21
+ question?: string;
22
+ semanticMatchProvider?: SemanticMatchProvider;
23
+ }
24
+ export interface AssertionResult {
25
+ verdict: "pass" | "fail" | "skip";
26
+ actual?: unknown;
27
+ reason?: string;
28
+ }
29
+ export declare function evaluateAssertion(assertion: EvalAssertion, ctx: AssertionContext): Promise<AssertionResult>;
@@ -0,0 +1,100 @@
1
/**
 * Evaluate `actual <op> expected` for one of the supported comparison names.
 *
 * @param actual   Left-hand operand.
 * @param op       "eq" | "lt" | "lte" | "gt" | "gte".
 * @param expected Right-hand operand.
 * @returns The boolean comparison result, or `undefined` for any other `op`.
 */
function applyOp(actual, op, expected) {
    if (op === "eq")
        return actual === expected;
    if (op === "lt")
        return actual < expected;
    if (op === "lte")
        return actual <= expected;
    if (op === "gt")
        return actual > expected;
    if (op === "gte")
        return actual >= expected;
    // Unrecognised operator: no result.
    return undefined;
}
10
/**
 * Select the tool spans that invoked a given tool.
 *
 * @param spans    Spans to search.
 * @param toolName Value compared against the `gen_ai.tool.name` attribute.
 * @returns A new array, in input order, of spans whose `kind` is `"tool"` and
 *          whose `gen_ai.tool.name` attribute strictly equals `toolName`.
 */
function toolCallsFor(spans, toolName) {
    const matches = [];
    spans.forEach((span) => {
        if (span.kind !== "tool")
            return;
        if (span.attributes?.["gen_ai.tool.name"] !== toolName)
            return;
        matches.push(span);
    });
    return matches;
}
13
/**
 * List the tool names of all tool spans, ordered by ascending `startTime`.
 *
 * Spans with equal `startTime` keep their input order. A tool span without a
 * `gen_ai.tool.name` attribute contributes an empty string. The input array is
 * not modified.
 *
 * @param spans Spans to inspect.
 * @returns Tool names as strings, one per tool span.
 */
function sortedToolNames(spans) {
    const byStartTime = (left, right) => {
        if (left.startTime < right.startTime)
            return -1;
        if (left.startTime > right.startTime)
            return 1;
        return 0;
    };
    // `filter` already yields a fresh array, so sorting it in place is safe.
    const toolSpans = spans.filter((span) => span.kind === "tool");
    toolSpans.sort(byStartTime);
    return toolSpans.map((span) => String(span.attributes?.["gen_ai.tool.name"] ?? ""));
}
20
/**
 * Test whether `sequence` occurs within `actual` in order (not necessarily
 * contiguously).
 *
 * @param sequence Expected names, in the required order.
 * @param actual   Observed names.
 * @returns true when every element of `sequence` can be matched, in order,
 *          against elements of `actual`. An empty `sequence` is a subsequence
 *          of anything, including an empty `actual`.
 */
function isSubsequence(sequence, actual) {
    // Without this guard the empty sequence matched only when `actual` had at
    // least one element, because the success check lived inside the loop.
    if (sequence.length === 0)
        return true;
    let matched = 0;
    for (const name of actual) {
        if (name !== sequence[matched])
            continue;
        matched += 1;
        if (matched === sequence.length)
            return true;
    }
    return false;
}
30
/**
 * Evaluate one eval-case assertion against an agent run.
 *
 * Supported `assertion.type` values: `contains`, `not_contains`, `regex`,
 * `tool_call_count`, `tool_call_order`, `latency_ms`, `semantic_match`.
 *
 * @param assertion Assertion record; `type` selects the check, the remaining
 *                  fields (`value`, `pattern`, `tool`, `op`, `sequence`,
 *                  `question`) parameterise it.
 * @param ctx       Run context: `answer`, `spans`, and optionally `durationMs`,
 *                  `reference`, `question`, `semanticMatchProvider`.
 * @returns `{ verdict, actual?, reason? }`. The verdict is `"skip"` (with a
 *          `reason`) when the check cannot be run: invalid regex, missing
 *          duration, missing provider or reference answer, or unknown type.
 */
export async function evaluateAssertion(assertion, ctx) {
    const { answer: reply, spans: traceSpans, durationMs: elapsedMs } = ctx;
    const kind = assertion.type;
    if (kind === "contains") {
        const needle = String(assertion["value"] ?? "");
        if (reply.includes(needle)) {
            return { verdict: "pass" };
        }
        return { verdict: "fail", actual: reply };
    }
    if (kind === "not_contains") {
        const needle = String(assertion["value"] ?? "");
        if (reply.includes(needle)) {
            return { verdict: "fail", actual: reply };
        }
        return { verdict: "pass" };
    }
    if (kind === "regex") {
        const source = String(assertion["pattern"] ?? "");
        let matcher;
        try {
            matcher = new RegExp(source);
        }
        catch {
            // An uncompilable pattern is an authoring error, not a failed run.
            return { verdict: "skip", reason: "invalid-regex: " + source };
        }
        if (matcher.test(reply)) {
            return { verdict: "pass" };
        }
        return { verdict: "fail", actual: reply };
    }
    if (kind === "tool_call_count") {
        const toolName = String(assertion["tool"] ?? "");
        const comparator = assertion["op"] ?? "eq";
        const threshold = Number(assertion["value"] ?? 0);
        const observed = toolCallsFor(traceSpans, toolName).length;
        const verdict = applyOp(observed, comparator, threshold) ? "pass" : "fail";
        return { verdict, actual: observed };
    }
    if (kind === "tool_call_order") {
        const rawSequence = assertion["sequence"];
        const expectedOrder = Array.isArray(rawSequence) ? rawSequence.map(String) : [];
        const actual = sortedToolNames(traceSpans);
        const verdict = isSubsequence(expectedOrder, actual) ? "pass" : "fail";
        return { verdict, actual };
    }
    if (kind === "latency_ms") {
        if (elapsedMs == null) {
            return { verdict: "skip", reason: "durationMs not available" };
        }
        const comparator = assertion["op"] ?? "lte";
        const threshold = Number(assertion["value"] ?? 0);
        const verdict = applyOp(elapsedMs, comparator, threshold) ? "pass" : "fail";
        return { verdict, actual: elapsedMs };
    }
    if (kind === "semantic_match") {
        const judge = ctx.semanticMatchProvider;
        if (!judge) {
            return { verdict: "skip", reason: "semantic_match requires a provider; pass semanticMatchProvider in context" };
        }
        if (!ctx.reference?.answer) {
            return { verdict: "skip", reason: "semantic_match requires reference.answer on the eval case" };
        }
        // The assertion's own `question` wins; otherwise fall back to the
        // user message recorded on the context.
        const prompt = String(assertion["question"] ?? ctx.question ?? "");
        const judged = await judge.judgeSemanticMatch(prompt, reply, ctx.reference.answer);
        return { verdict: judged.verdict, actual: judged.reasoning };
    }
    return { verdict: "skip", reason: `unknown assertion type: ${assertion.type}` };
}
@@ -0,0 +1,36 @@
1
+ /**
2
+ * M5 eval-set builder — orchestrates build:
3
+ * picker → ensureQueryId → redact → write (with conflict resolution) → validate
4
+ *
5
+ * `ensureQueryId` is the deterministic hash-based ID generator (inline here,
6
+ * not a separate file — spec doc §9 "反过度工程" decision).
7
+ */
8
+ import type { BuildResult } from "./types.js";
9
+ import { type ConflictStrategy } from "./output-writer.js";
10
+ export declare class BuilderError extends Error {
11
+ readonly cause?: Error | undefined;
12
+ constructor(message: string, cause?: Error | undefined);
13
+ }
14
+ export type BuildSource = {
15
+ kind: "diagnosis";
16
+ path: string;
17
+ } | {
18
+ kind: "queries";
19
+ path: string;
20
+ };
21
+ export interface BuildOpts {
22
+ source: BuildSource;
23
+ outDir: string;
24
+ evalSetId: string;
25
+ onConflict: ConflictStrategy;
26
+ /** From `--redaction-rules=<path>` */
27
+ redactionRulesCliFlag: string | undefined;
28
+ /** From CWD: usually `path.join(process.cwd(), "redaction-rules")` — caller passes resolved path */
29
+ repoDir: string | undefined;
30
+ }
31
+ export declare function ensureQueryId(c: {
32
+ query_id: string;
33
+ input: unknown;
34
+ tags?: string[];
35
+ }): string;
36
+ export declare function build(opts: BuildOpts): Promise<BuildResult>;
@@ -0,0 +1,126 @@
1
+ /**
2
+ * M5 eval-set builder — orchestrates build:
3
+ * picker → ensureQueryId → redact → write (with conflict resolution) → validate
4
+ *
5
+ * `ensureQueryId` is the deterministic hash-based ID generator (inline here,
6
+ * not a separate file — spec doc §9 "反过度工程" decision).
7
+ */
8
+ import { createHash } from "node:crypto";
9
+ import { liftFromQueriesFile, liftFromDiagnosis, QueryPickerError } from "./query-picker.js";
10
+ import { loadRules, applyRules, RedactorError } from "./redactor.js";
11
+ import { writeEvalSet, WriterError } from "./output-writer.js";
12
/**
 * Error raised by the eval-set builder.
 *
 * Carries the underlying error, when there is one, in `cause`, and reports
 * itself as `"BuilderError"` via `name`.
 */
export class BuilderError extends Error {
    /** Underlying error that triggered this one; undefined when not supplied. */
    cause;
    /**
     * @param message Human-readable description.
     * @param cause   Optional underlying error.
     */
    constructor(message, cause) {
        super(message);
        this.cause = cause;
        this.name = "BuilderError";
    }
}
20
/**
 * Serialise a value to a canonical JSON-like string for hashing: object keys
 * are emitted in sorted order and no whitespace is added, so the output does
 * not depend on key insertion order.
 *
 * Primitives and `null` go through `JSON.stringify`; arrays and plain objects
 * are walked recursively.
 *
 * NOTE: `undefined` members are not normalised the way `JSON.stringify` does
 * it (an `undefined` array item renders as an empty slot, an `undefined`
 * property renders as `undefined`). The output feeds a hash, so this is kept
 * as-is to leave existing IDs stable.
 */
function canonicalJson(value) {
    const isContainer = typeof value === "object" && value !== null;
    if (!isContainer) {
        return JSON.stringify(value);
    }
    if (Array.isArray(value)) {
        const items = value.map((item) => canonicalJson(item));
        return `[${items.join(",")}]`;
    }
    const members = Object.keys(value)
        .sort()
        .map((key) => `${JSON.stringify(key)}:${canonicalJson(value[key])}`);
    return `{${members.join(",")}}`;
}
33
/**
 * Return the case's `query_id`, deriving one when the case has none.
 *
 * A non-empty existing `query_id` is returned unchanged. Otherwise the ID is
 * the first 12 hex characters of the SHA-256 of the canonical JSON of
 * `{ input, tags }` (with `tags` defaulting to `[]`), so the same case content
 * always yields the same ID.
 *
 * @param c Case carrying `query_id`, `input` and optional `tags`.
 * @returns The existing or derived query ID.
 */
export function ensureQueryId(c) {
    const existing = c.query_id;
    if (existing && existing.length > 0) {
        return existing;
    }
    const payload = { input: c.input, tags: c.tags ?? [] };
    const hasher = createHash("sha256");
    hasher.update(canonicalJson(payload));
    return hasher.digest("hex").slice(0, 12);
}
39
/**
 * Produce a redacted copy of an eval case.
 *
 * `applyFn` is applied to `input.user_message` and, when a reference is
 * present, to `reference.answer`. `query_id`, `tags` and `assertions` are
 * carried over as-is. The result contains only these fields; `reference` and
 * `assertions` are omitted when the source case does not have them.
 *
 * @param c       Source case.
 * @param applyFn String → string redaction function.
 * @returns A new case object.
 */
function redactCase(c, applyFn) {
    return {
        query_id: c.query_id,
        input: { user_message: applyFn(c.input.user_message) },
        tags: c.tags,
        ...(c.reference ? { reference: { answer: applyFn(c.reference.answer) } } : {}),
        // assertions strings (regex / value) intentionally NOT redacted
        // — they are user-authored test expectations, not raw PII
        ...(c.assertions ? { assertions: c.assertions } : {}),
    };
}
54
/**
 * Build an eval set: pick cases → assign query IDs → redact → write.
 *
 * Errors of the stage-specific types (`QueryPickerError`, `RedactorError`,
 * `WriterError`) are rethrown as `BuilderError` with the stage name prefixed
 * and the original error attached as `cause`; any other error propagates
 * unchanged.
 *
 * @param opts Build options: `source`, `outDir`, `evalSetId`, `onConflict`,
 *             `redactionRulesCliFlag`, `repoDir`.
 * @returns Counts of written and skipped cases, conflicts, shard paths and the
 *          source of the redaction rules that were applied.
 * @throws BuilderError when a stage fails or when no case could be lifted.
 */
export async function build(opts) {
    // Wrap a stage's own error type; hand anything else back untouched.
    const wrapStageError = (err, StageError, stage) => err instanceof StageError
        ? new BuilderError(`${stage} failed: ${err.message}`, err)
        : err;
    // Stage 1: pick cases
    let pickedCases;
    let skippedFindings = 0;
    try {
        if (opts.source.kind === "queries") {
            pickedCases = await liftFromQueriesFile(opts.source.path);
        }
        else {
            const diagnosis = await liftFromDiagnosis(opts.source.path);
            pickedCases = diagnosis.cases;
            skippedFindings = diagnosis.skipped_findings_count;
        }
    }
    catch (err) {
        throw wrapStageError(err, QueryPickerError, "picker");
    }
    // Stage 2: ensure query_id
    const identified = pickedCases.map((c) => ({ ...c, query_id: ensureQueryId(c) }));
    // Stage 3: redact
    let loadedRules;
    try {
        loadedRules = await loadRules({
            cliFlag: opts.redactionRulesCliFlag,
            repoDir: opts.repoDir,
        });
    }
    catch (err) {
        throw wrapStageError(err, RedactorError, "redactor");
    }
    const redactText = (text) => applyRules(text, loadedRules.rules);
    const redactedCases = identified.map((c) => redactCase(c, redactText));
    // Stage 3.5: guard against 0-cases lift (better UX than letting writer fail
    // with cryptic "Too small: expected array to have >=1 items"). Common cause:
    // --diagnosis= where every finding has query=null (e.g. runtime doesn't emit
    // gen_ai.input.messages) or empty assertions[].
    if (redactedCases.length === 0) {
        const flag = opts.source.kind === "diagnosis" ? "--diagnosis=" : "--queries=";
        let skippedNote = "";
        if (skippedFindings > 0) {
            skippedNote = `\n Skipped ${skippedFindings} finding(s) — common causes:\n - findings have query: null (M4 trace runtime doesn't emit gen_ai.input.messages)\n - findings have empty assertions[]`;
        }
        throw new BuilderError(`lifted 0 eval-cases from ${flag}${opts.source.path}.${skippedNote}\n Alternatives: use --queries=<file> to provide queries manually, or upgrade M4 trace runtime to emit gen_ai.input.messages.`);
    }
    // Stage 4: write + conflict + validate
    let written;
    try {
        written = await writeEvalSet({
            outDir: opts.outDir,
            evalSetId: opts.evalSetId,
            newCases: redactedCases,
            onConflict: opts.onConflict,
        });
    }
    catch (err) {
        throw wrapStageError(err, WriterError, "writer");
    }
    return {
        cases_written: written.cases_written,
        // Findings dropped by the picker count as skipped alongside writer skips.
        cases_skipped: written.cases_skipped + skippedFindings,
        conflicts: written.conflicts,
        shard_paths: written.shard_paths,
        redaction_rules_source: loadedRules.source,
    };
}
@@ -0,0 +1,15 @@
1
+ /**
2
+ * M5 eval-set module — public exports.
3
+ *
4
+ * Consumers (commands/trace.ts, tests, future M6 reuse) import from this
5
+ * barrel; internal modules cross-import via direct paths.
6
+ */
7
+ export type { EvalCase, EvalCaseInput, EvalReference, EvalAssertion, AssertionType, EvalSetIndex, EvalSetIndexShard, BuildResult, RedactionRule, } from "./types.js";
8
+ export { build, ensureQueryId, BuilderError } from "./builder.js";
9
+ export type { BuildOpts, BuildSource } from "./builder.js";
10
+ export { run as runTest } from "./test-runner.js";
11
+ export type { RunOpts, RunnerDeps } from "./test-runner.js";
12
+ export { evaluateAssertion } from "./assertion-evaluator.js";
13
+ export type { AssertionContext, AssertionResult, SemanticMatchProvider, SemanticMatchVerdict, } from "./assertion-evaluator.js";
14
+ export { createBuiltinSemanticMatchProvider, ANSWER_MATCH_REFERENCE_REF, AnswerMatchOutputSchema, } from "./semantic-match-provider.js";
15
+ export type { CreateSemanticMatchProviderOpts } from "./semantic-match-provider.js";
@@ -0,0 +1,10 @@
1
+ /**
2
+ * M5 eval-set module — public exports.
3
+ *
4
+ * Consumers (commands/trace.ts, tests, future M6 reuse) import from this
5
+ * barrel; internal modules cross-import via direct paths.
6
+ */
7
+ export { build, ensureQueryId, BuilderError } from "./builder.js";
8
+ export { run as runTest } from "./test-runner.js";
9
+ export { evaluateAssertion } from "./assertion-evaluator.js";
10
+ export { createBuiltinSemanticMatchProvider, ANSWER_MATCH_REFERENCE_REF, AnswerMatchOutputSchema, } from "./semantic-match-provider.js";
@@ -0,0 +1,27 @@
1
+ /**
2
+ * M5 eval-set output writer — handles directory layout, index upsert, shard
3
+ * merge, on-conflict resolution (fail / skip / overwrite), and .bak preservation.
4
+ *
5
+ * MVP layout: always one shard named `cases.yaml`. Users can manually split
6
+ * into multi-shard later (re-write `index.yaml` to reference more shards)
7
+ * and call `kweaver trace schema validate` to verify.
8
+ */
9
+ import type { EvalCase } from "./types.js";
10
+ export declare class WriterError extends Error {
11
+ readonly conflictIds?: string[] | undefined;
12
+ constructor(message: string, conflictIds?: string[] | undefined);
13
+ }
14
+ export type ConflictStrategy = "fail" | "skip" | "overwrite";
15
+ export interface WriteEvalSetOpts {
16
+ outDir: string;
17
+ evalSetId: string;
18
+ newCases: EvalCase[];
19
+ onConflict: ConflictStrategy;
20
+ }
21
+ export interface WriteEvalSetResult {
22
+ cases_written: number;
23
+ cases_skipped: number;
24
+ conflicts: string[];
25
+ shard_paths: string[];
26
+ }
27
+ export declare function writeEvalSet(opts: WriteEvalSetOpts): Promise<WriteEvalSetResult>;