@kat-ai/eval 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +114 -0
- package/dist/agent/index.cjs +468 -0
- package/dist/agent/index.cjs.map +1 -0
- package/dist/agent/index.d.cts +170 -0
- package/dist/agent/index.d.ts +170 -0
- package/dist/agent/index.js +466 -0
- package/dist/agent/index.js.map +1 -0
- package/dist/index.cjs +1043 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +93 -0
- package/dist/index.d.ts +93 -0
- package/dist/index.js +1032 -0
- package/dist/index.js.map +1 -0
- package/dist/introspection/index.cjs +476 -0
- package/dist/introspection/index.cjs.map +1 -0
- package/dist/introspection/index.d.cts +107 -0
- package/dist/introspection/index.d.ts +107 -0
- package/dist/introspection/index.js +474 -0
- package/dist/introspection/index.js.map +1 -0
- package/dist/retrieval/index.cjs +312 -0
- package/dist/retrieval/index.cjs.map +1 -0
- package/dist/retrieval/index.d.cts +98 -0
- package/dist/retrieval/index.d.ts +98 -0
- package/dist/retrieval/index.js +310 -0
- package/dist/retrieval/index.js.map +1 -0
- package/dist/types-BJjlqNhg.d.cts +112 -0
- package/dist/types-BJjlqNhg.d.ts +112 -0
- package/package.json +79 -0
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
import { B as BaseEvalConfig, E as EvalCriterion, b as EvalResult, a as EvalEvidence } from '../types-BJjlqNhg.cjs';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Agent Eval Types
|
|
5
|
+
*
|
|
6
|
+
* Types for evaluating end-to-end agent behavior.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
/**
|
|
10
|
+
* Configuration for agent evaluation: the agent under test, the scenarios to run, and optional turn, timeout and grader settings on top of BaseEvalConfig.
|
|
11
|
+
*/
|
|
12
|
+
interface AgentEvalConfig extends BaseEvalConfig {
|
|
13
|
+
/** Agent under test: either an API endpoint URL given as a string, or an AgentFunction for local testing */
|
|
14
|
+
agentEndpoint: string | AgentFunction;
|
|
15
|
+
/** Test scenarios to run, as AgentTestScenario entries */
|
|
16
|
+
scenarios: AgentTestScenario[];
|
|
17
|
+
/** Maximum conversation turns per scenario (default: 5). NOTE(review): AgentTestScenario.maxTurns presumably overrides this per scenario - confirm. */
|
|
18
|
+
maxTurns?: number;
|
|
19
|
+
/** Timeout per scenario, in milliseconds (default: 60000) */
|
|
20
|
+
timeout?: number;
|
|
21
|
+
/** Optional LLM grading configuration */
|
|
22
|
+
graderConfig?: {
|
|
23
|
+
model?: string; // NOTE(review): presumably the grader model identifier - confirm accepted values
|
|
24
|
+
temperature?: number; // NOTE(review): presumably the grader sampling temperature - confirm valid range
|
|
25
|
+
};
|
|
26
|
+
}
|
|
27
|
+
/**
|
|
28
|
+
* Function type for local agent testing: takes the user message and an optional AgentContext, and resolves to an AgentResponse.
|
|
29
|
+
*/
|
|
30
|
+
type AgentFunction = (message: string, context?: AgentContext) => Promise<AgentResponse>;
|
|
31
|
+
/**
|
|
32
|
+
* Context passed to an AgentFunction as its optional second argument; every field is optional.
|
|
33
|
+
*/
|
|
34
|
+
interface AgentContext {
|
|
35
|
+
sessionId?: string; // NOTE(review): presumably the sessionId from an earlier AgentResponse - confirm
|
|
36
|
+
previousContext?: unknown; // NOTE(review): presumably the opaque AgentResponse.context of the prior turn - confirm
|
|
37
|
+
previousIntent?: unknown; // NOTE(review): presumably the opaque AgentResponse.intent of the prior turn - confirm
|
|
38
|
+
conversationHistory?: Array<{ // messages so far, each tagged with the speaker's role
|
|
39
|
+
role: 'user' | 'assistant';
|
|
40
|
+
content: string;
|
|
41
|
+
}>;
|
|
42
|
+
}
|
|
43
|
+
/**
|
|
44
|
+
* A test scenario for agent evaluation: an initial query, optional expectations about the conversation, and the criteria used to grade it.
|
|
45
|
+
*/
|
|
46
|
+
interface AgentTestScenario {
|
|
47
|
+
/** Unique name for the scenario */
|
|
48
|
+
name: string;
|
|
49
|
+
/** Optional description of what the scenario tests */
|
|
50
|
+
description?: string;
|
|
51
|
+
/** Initial user query that opens the scenario */
|
|
52
|
+
initialQuery: string;
|
|
53
|
+
/** Expected outcome type; one of the same four values as AgentResponse.outcome */
|
|
54
|
+
expectedOutcome?: 'answer' | 'follow_up' | 'blocked' | 'out_of_scope';
|
|
55
|
+
/** Responses to follow-up questions (pattern -> response). NOTE(review): how a pattern is matched against a question is not visible here - confirm. */
|
|
56
|
+
followUpResponses?: Record<string, string>;
|
|
57
|
+
/** KBs (presumably knowledge bases) expected to be called */
|
|
58
|
+
expectedKBs?: string[];
|
|
59
|
+
/** Maximum turns for this scenario. NOTE(review): presumably takes precedence over AgentEvalConfig.maxTurns - confirm. */
|
|
60
|
+
maxTurns?: number;
|
|
61
|
+
/** Evaluation criteria for this scenario, as a ScenarioEvaluation */
|
|
62
|
+
evaluation: ScenarioEvaluation;
|
|
63
|
+
}
|
|
64
|
+
/**
|
|
65
|
+
* Evaluation criteria for a scenario; every field is optional.
|
|
66
|
+
*/
|
|
67
|
+
interface ScenarioEvaluation {
|
|
68
|
+
/** Strings that MUST appear in the answer. NOTE(review): case sensitivity of the match is not visible here - confirm. */
|
|
69
|
+
mustContain?: string[];
|
|
70
|
+
/** Strings that must NOT appear in the answer. NOTE(review): case sensitivity of the match is not visible here - confirm. */
|
|
71
|
+
mustNotContain?: string[];
|
|
72
|
+
/** Rubric text for LLM grading of the answer */
|
|
73
|
+
rubric?: string;
|
|
74
|
+
/** Custom evaluation criteria, as EvalCriterion entries */
|
|
75
|
+
criteria?: EvalCriterion[];
|
|
76
|
+
}
|
|
77
|
+
/**
|
|
78
|
+
* Result from agent evaluation: EvalResult extended with aggregate metric scores and per-scenario results.
|
|
79
|
+
*/
|
|
80
|
+
interface AgentEvalResult extends EvalResult {
|
|
81
|
+
/** Individual metric scores, each documented on a 0-100 scale */
|
|
82
|
+
scores: {
|
|
83
|
+
/** Percentage of scenarios with correct outcome type (0-100) */
|
|
84
|
+
accuracy: number;
|
|
85
|
+
/** Average relevance of answers to their queries (0-100) */
|
|
86
|
+
relevance: number;
|
|
87
|
+
/** Average answer completeness (0-100) */
|
|
88
|
+
completeness: number;
|
|
89
|
+
/** Average helpfulness/actionability (0-100) */
|
|
90
|
+
helpfulness: number;
|
|
91
|
+
};
|
|
92
|
+
/** Results for each scenario, as ScenarioResult entries */
|
|
93
|
+
scenarioResults: ScenarioResult[];
|
|
94
|
+
}
|
|
95
|
+
/**
|
|
96
|
+
* Result from running a single scenario: pass/fail, final outcome and answer, graded evidence, the full conversation, and timing.
|
|
97
|
+
*/
|
|
98
|
+
interface ScenarioResult {
|
|
99
|
+
/** The scenario that was run */
|
|
100
|
+
scenario: AgentTestScenario;
|
|
101
|
+
/** Whether the scenario passed */
|
|
102
|
+
passed: boolean;
|
|
103
|
+
/** Number of conversation turns */
|
|
104
|
+
turns: number;
|
|
105
|
+
/** Final outcome type. NOTE(review): typed as plain string rather than the AgentResponse.outcome union - confirm whether other values can occur. */
|
|
106
|
+
finalOutcome: string;
|
|
107
|
+
/** Final answer (null if not 'answer' outcome) */
|
|
108
|
+
finalAnswer: string | null;
|
|
109
|
+
/** Detailed evaluation result: pass flag, score, and supporting evidence */
|
|
110
|
+
evaluation: {
|
|
111
|
+
passed: boolean;
|
|
112
|
+
score: number; // NOTE(review): scale is not stated here; presumably 0-100 like AgentEvalResult.scores - confirm
|
|
113
|
+
evidence: EvalEvidence[];
|
|
114
|
+
};
|
|
115
|
+
/** Full conversation history, as ConversationTurn entries */
|
|
116
|
+
conversation: ConversationTurn[];
|
|
117
|
+
/** Duration in milliseconds */
|
|
118
|
+
duration: number;
|
|
119
|
+
/** Error message if the scenario failed (optional) */
|
|
120
|
+
error?: string;
|
|
121
|
+
}
|
|
122
|
+
/**
|
|
123
|
+
* A single turn in the conversation: a turn number, the user message, and the agent response.
|
|
124
|
+
*/
|
|
125
|
+
interface ConversationTurn {
|
|
126
|
+
/** Turn number (1-indexed) */
|
|
127
|
+
turn: number;
|
|
128
|
+
/** User message for this turn */
|
|
129
|
+
userMessage: string;
|
|
130
|
+
/** Agent response for this turn, as an AgentResponse */
|
|
131
|
+
agentResponse: AgentResponse;
|
|
132
|
+
}
|
|
133
|
+
/**
|
|
134
|
+
* Response from the agent (matches @kat/core ChatResponse structure); only outcome is required.
|
|
135
|
+
*/
|
|
136
|
+
interface AgentResponse {
|
|
137
|
+
/** Outcome type; the only required field */
|
|
138
|
+
outcome: 'answer' | 'follow_up' | 'blocked' | 'out_of_scope';
|
|
139
|
+
/** Answer text (for 'answer' outcome); may be null or omitted */
|
|
140
|
+
answer?: string | null;
|
|
141
|
+
/** Follow-up question (for 'follow_up' outcome); may be null or omitted */
|
|
142
|
+
followUpQuestion?: string | null;
|
|
143
|
+
/** Multi-choice options, each with an id and a label */
|
|
144
|
+
options?: Array<{
|
|
145
|
+
id: string;
|
|
146
|
+
label: string;
|
|
147
|
+
}>;
|
|
148
|
+
/** Context for multi-turn; opaque here (typed unknown) */
|
|
149
|
+
context?: unknown;
|
|
150
|
+
/** Intent for multi-turn; opaque here (typed unknown) */
|
|
151
|
+
intent?: unknown;
|
|
152
|
+
/** Execution trace; opaque here (typed unknown) */
|
|
153
|
+
trace?: unknown;
|
|
154
|
+
/** Session ID */
|
|
155
|
+
sessionId?: string;
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
/**
|
|
159
|
+
* Agent Eval - Layer 3
|
|
160
|
+
*
|
|
161
|
+
* Evaluates end-to-end agent behavior by running multi-turn
|
|
162
|
+
* conversation scenarios and grading the responses.
|
|
163
|
+
*/
|
|
164
|
+
|
|
165
|
+
/**
|
|
166
|
+
* Evaluate agent behavior by running the test scenarios given in config; resolves to an AgentEvalResult.
|
|
167
|
+
*/
|
|
168
|
+
declare function evaluateAgent(config: AgentEvalConfig): Promise<AgentEvalResult>;
|
|
169
|
+
|
|
170
|
+
export { type AgentEvalConfig, type AgentEvalResult, type AgentResponse, type AgentTestScenario, type ConversationTurn, type ScenarioEvaluation, type ScenarioResult, evaluateAgent }; // NOTE(review): AgentFunction and AgentContext are used by exported types but are not in this export list
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
import { B as BaseEvalConfig, E as EvalCriterion, b as EvalResult, a as EvalEvidence } from '../types-BJjlqNhg.js';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Agent Eval Types
|
|
5
|
+
*
|
|
6
|
+
* Types for evaluating end-to-end agent behavior.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
/**
|
|
10
|
+
* Configuration for agent evaluation: the agent under test, the scenarios to run, and optional turn, timeout and grader settings on top of BaseEvalConfig.
|
|
11
|
+
*/
|
|
12
|
+
interface AgentEvalConfig extends BaseEvalConfig {
|
|
13
|
+
/** Agent under test: either an API endpoint URL given as a string, or an AgentFunction for local testing */
|
|
14
|
+
agentEndpoint: string | AgentFunction;
|
|
15
|
+
/** Test scenarios to run, as AgentTestScenario entries */
|
|
16
|
+
scenarios: AgentTestScenario[];
|
|
17
|
+
/** Maximum conversation turns per scenario (default: 5). NOTE(review): AgentTestScenario.maxTurns presumably overrides this per scenario - confirm. */
|
|
18
|
+
maxTurns?: number;
|
|
19
|
+
/** Timeout per scenario, in milliseconds (default: 60000) */
|
|
20
|
+
timeout?: number;
|
|
21
|
+
/** Optional LLM grading configuration */
|
|
22
|
+
graderConfig?: {
|
|
23
|
+
model?: string; // NOTE(review): presumably the grader model identifier - confirm accepted values
|
|
24
|
+
temperature?: number; // NOTE(review): presumably the grader sampling temperature - confirm valid range
|
|
25
|
+
};
|
|
26
|
+
}
|
|
27
|
+
/**
|
|
28
|
+
* Function type for local agent testing: takes the user message and an optional AgentContext, and resolves to an AgentResponse.
|
|
29
|
+
*/
|
|
30
|
+
type AgentFunction = (message: string, context?: AgentContext) => Promise<AgentResponse>;
|
|
31
|
+
/**
|
|
32
|
+
* Context passed to an AgentFunction as its optional second argument; every field is optional.
|
|
33
|
+
*/
|
|
34
|
+
interface AgentContext {
|
|
35
|
+
sessionId?: string; // NOTE(review): presumably the sessionId from an earlier AgentResponse - confirm
|
|
36
|
+
previousContext?: unknown; // NOTE(review): presumably the opaque AgentResponse.context of the prior turn - confirm
|
|
37
|
+
previousIntent?: unknown; // NOTE(review): presumably the opaque AgentResponse.intent of the prior turn - confirm
|
|
38
|
+
conversationHistory?: Array<{ // messages so far, each tagged with the speaker's role
|
|
39
|
+
role: 'user' | 'assistant';
|
|
40
|
+
content: string;
|
|
41
|
+
}>;
|
|
42
|
+
}
|
|
43
|
+
/**
|
|
44
|
+
* A test scenario for agent evaluation: an initial query, optional expectations about the conversation, and the criteria used to grade it.
|
|
45
|
+
*/
|
|
46
|
+
interface AgentTestScenario {
|
|
47
|
+
/** Unique name for the scenario */
|
|
48
|
+
name: string;
|
|
49
|
+
/** Optional description of what the scenario tests */
|
|
50
|
+
description?: string;
|
|
51
|
+
/** Initial user query that opens the scenario */
|
|
52
|
+
initialQuery: string;
|
|
53
|
+
/** Expected outcome type; one of the same four values as AgentResponse.outcome */
|
|
54
|
+
expectedOutcome?: 'answer' | 'follow_up' | 'blocked' | 'out_of_scope';
|
|
55
|
+
/** Responses to follow-up questions (pattern -> response). NOTE(review): how a pattern is matched against a question is not visible here - confirm. */
|
|
56
|
+
followUpResponses?: Record<string, string>;
|
|
57
|
+
/** KBs (presumably knowledge bases) expected to be called */
|
|
58
|
+
expectedKBs?: string[];
|
|
59
|
+
/** Maximum turns for this scenario. NOTE(review): presumably takes precedence over AgentEvalConfig.maxTurns - confirm. */
|
|
60
|
+
maxTurns?: number;
|
|
61
|
+
/** Evaluation criteria for this scenario, as a ScenarioEvaluation */
|
|
62
|
+
evaluation: ScenarioEvaluation;
|
|
63
|
+
}
|
|
64
|
+
/**
|
|
65
|
+
* Evaluation criteria for a scenario; every field is optional.
|
|
66
|
+
*/
|
|
67
|
+
interface ScenarioEvaluation {
|
|
68
|
+
/** Strings that MUST appear in the answer. NOTE(review): case sensitivity of the match is not visible here - confirm. */
|
|
69
|
+
mustContain?: string[];
|
|
70
|
+
/** Strings that must NOT appear in the answer. NOTE(review): case sensitivity of the match is not visible here - confirm. */
|
|
71
|
+
mustNotContain?: string[];
|
|
72
|
+
/** Rubric text for LLM grading of the answer */
|
|
73
|
+
rubric?: string;
|
|
74
|
+
/** Custom evaluation criteria, as EvalCriterion entries */
|
|
75
|
+
criteria?: EvalCriterion[];
|
|
76
|
+
}
|
|
77
|
+
/**
|
|
78
|
+
* Result from agent evaluation: EvalResult extended with aggregate metric scores and per-scenario results.
|
|
79
|
+
*/
|
|
80
|
+
interface AgentEvalResult extends EvalResult {
|
|
81
|
+
/** Individual metric scores, each documented on a 0-100 scale */
|
|
82
|
+
scores: {
|
|
83
|
+
/** Percentage of scenarios with correct outcome type (0-100) */
|
|
84
|
+
accuracy: number;
|
|
85
|
+
/** Average relevance of answers to their queries (0-100) */
|
|
86
|
+
relevance: number;
|
|
87
|
+
/** Average answer completeness (0-100) */
|
|
88
|
+
completeness: number;
|
|
89
|
+
/** Average helpfulness/actionability (0-100) */
|
|
90
|
+
helpfulness: number;
|
|
91
|
+
};
|
|
92
|
+
/** Results for each scenario, as ScenarioResult entries */
|
|
93
|
+
scenarioResults: ScenarioResult[];
|
|
94
|
+
}
|
|
95
|
+
/**
|
|
96
|
+
* Result from running a single scenario: pass/fail, final outcome and answer, graded evidence, the full conversation, and timing.
|
|
97
|
+
*/
|
|
98
|
+
interface ScenarioResult {
|
|
99
|
+
/** The scenario that was run */
|
|
100
|
+
scenario: AgentTestScenario;
|
|
101
|
+
/** Whether the scenario passed */
|
|
102
|
+
passed: boolean;
|
|
103
|
+
/** Number of conversation turns */
|
|
104
|
+
turns: number;
|
|
105
|
+
/** Final outcome type. NOTE(review): typed as plain string rather than the AgentResponse.outcome union - confirm whether other values can occur. */
|
|
106
|
+
finalOutcome: string;
|
|
107
|
+
/** Final answer (null if not 'answer' outcome) */
|
|
108
|
+
finalAnswer: string | null;
|
|
109
|
+
/** Detailed evaluation result: pass flag, score, and supporting evidence */
|
|
110
|
+
evaluation: {
|
|
111
|
+
passed: boolean;
|
|
112
|
+
score: number; // NOTE(review): scale is not stated here; presumably 0-100 like AgentEvalResult.scores - confirm
|
|
113
|
+
evidence: EvalEvidence[];
|
|
114
|
+
};
|
|
115
|
+
/** Full conversation history, as ConversationTurn entries */
|
|
116
|
+
conversation: ConversationTurn[];
|
|
117
|
+
/** Duration in milliseconds */
|
|
118
|
+
duration: number;
|
|
119
|
+
/** Error message if the scenario failed (optional) */
|
|
120
|
+
error?: string;
|
|
121
|
+
}
|
|
122
|
+
/**
|
|
123
|
+
* A single turn in the conversation: a turn number, the user message, and the agent response.
|
|
124
|
+
*/
|
|
125
|
+
interface ConversationTurn {
|
|
126
|
+
/** Turn number (1-indexed) */
|
|
127
|
+
turn: number;
|
|
128
|
+
/** User message for this turn */
|
|
129
|
+
userMessage: string;
|
|
130
|
+
/** Agent response for this turn, as an AgentResponse */
|
|
131
|
+
agentResponse: AgentResponse;
|
|
132
|
+
}
|
|
133
|
+
/**
|
|
134
|
+
* Response from the agent (matches @kat/core ChatResponse structure); only outcome is required.
|
|
135
|
+
*/
|
|
136
|
+
interface AgentResponse {
|
|
137
|
+
/** Outcome type; the only required field */
|
|
138
|
+
outcome: 'answer' | 'follow_up' | 'blocked' | 'out_of_scope';
|
|
139
|
+
/** Answer text (for 'answer' outcome); may be null or omitted */
|
|
140
|
+
answer?: string | null;
|
|
141
|
+
/** Follow-up question (for 'follow_up' outcome); may be null or omitted */
|
|
142
|
+
followUpQuestion?: string | null;
|
|
143
|
+
/** Multi-choice options, each with an id and a label */
|
|
144
|
+
options?: Array<{
|
|
145
|
+
id: string;
|
|
146
|
+
label: string;
|
|
147
|
+
}>;
|
|
148
|
+
/** Context for multi-turn; opaque here (typed unknown) */
|
|
149
|
+
context?: unknown;
|
|
150
|
+
/** Intent for multi-turn; opaque here (typed unknown) */
|
|
151
|
+
intent?: unknown;
|
|
152
|
+
/** Execution trace; opaque here (typed unknown) */
|
|
153
|
+
trace?: unknown;
|
|
154
|
+
/** Session ID */
|
|
155
|
+
sessionId?: string;
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
/**
|
|
159
|
+
* Agent Eval - Layer 3
|
|
160
|
+
*
|
|
161
|
+
* Evaluates end-to-end agent behavior by running multi-turn
|
|
162
|
+
* conversation scenarios and grading the responses.
|
|
163
|
+
*/
|
|
164
|
+
|
|
165
|
+
/**
|
|
166
|
+
* Evaluate agent behavior by running the test scenarios given in config; resolves to an AgentEvalResult.
|
|
167
|
+
*/
|
|
168
|
+
declare function evaluateAgent(config: AgentEvalConfig): Promise<AgentEvalResult>;
|
|
169
|
+
|
|
170
|
+
export { type AgentEvalConfig, type AgentEvalResult, type AgentResponse, type AgentTestScenario, type ConversationTurn, type ScenarioEvaluation, type ScenarioResult, evaluateAgent }; // NOTE(review): AgentFunction and AgentContext are used by exported types but are not in this export list
|