@checklabs/core 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +31 -0
- package/src/adapters/index.ts +136 -0
- package/src/assertions/expect.ts +218 -0
- package/src/config.ts +89 -0
- package/src/discovery.ts +57 -0
- package/src/env.ts +35 -0
- package/src/generate/index.ts +103 -0
- package/src/generate/templates.ts +225 -0
- package/src/index.ts +93 -0
- package/src/judge/index.ts +158 -0
- package/src/pricing.ts +56 -0
- package/src/registry.ts +23 -0
- package/src/reporters/colors.ts +36 -0
- package/src/reporters/console.ts +154 -0
- package/src/reporters/html.ts +189 -0
- package/src/reporters/index.ts +4 -0
- package/src/reporters/json.ts +11 -0
- package/src/runner/compare.ts +84 -0
- package/src/runner/runner.ts +144 -0
- package/src/types.ts +197 -0
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Built-in test templates used by `checkai generate`. Each template is a list of
|
|
3
|
+
* scenario blueprints spanning happy paths, edge cases, policy violations, angry
|
|
4
|
+
* customers, and escalation flows. Output is a starting point the developer owns.
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
/**
 * One scenario blueprint inside a template: a named prompt together with the
 * assertion statements that belong to it.
 */
export interface GeneratedScenario {
  /** Scenario name. */
  name: string;
  /** Category label, e.g. "happy path", "edge case", "escalation". */
  category: string;
  /** Prompt text for the scenario. */
  prompt: string;
  /** Full assertion statements (include `await ` where async). */
  assertions: string[];
}
|
|
14
|
+
|
|
15
|
+
/** A named collection of scenario blueprints. */
export interface TestTemplate {
  /** Template key, e.g. "support". */
  key: string;
  /** Display title, e.g. "Customer Support". */
  title: string;
  /** One-line summary of what the template covers. */
  description: string;
  /** Scenario blueprints, in declaration order. */
  scenarios: GeneratedScenario[];
}
|
|
21
|
+
|
|
22
|
+
/**
 * Shorthand scenario builder: the first three arguments are the name, category
 * and prompt; every further argument becomes one entry of `assertions`.
 */
function s(name: string, category: string, prompt: string, ...assertions: string[]): GeneratedScenario {
  const scenario: GeneratedScenario = { name, category, prompt, assertions };
  return scenario;
}
|
|
28
|
+
|
|
29
|
+
/** Template "support": general customer-support scenarios. */
const support: TestTemplate = {
  key: "support",
  title: "Customer Support",
  description: "General customer support: refunds, billing, order tracking, escalation.",
  scenarios: [
    s(
      "refund requires an order number",
      "happy path",
      "I want a refund",
      'expect(result).toAskFor("order number");',
      "expect(result).not.toApproveRefund();",
    ),
    s(
      "tracks an order with a number",
      "happy path",
      "Where is my order #55012?",
      'expect(result).toUseTool("getOrderStatus");',
      "expect(result).toReferenceOrder();",
    ),
    s(
      "answers a billing question politely",
      "happy path",
      "Why was I charged twice for order #77001?",
      'expect(result).toContainText("charge");',
      "expect(result).toBePolite();",
    ),
    s(
      "acknowledges a damaged item",
      "happy path",
      "My order #88123 arrived damaged",
      "expect(result).toBePolite();",
      'await expect(result).toSatisfyBehavior("acknowledges the problem and expresses empathy");',
    ),
    s(
      "explains return policy on request",
      "happy path",
      "What is your return policy for order #55012?",
      "expect(result).toReferencePolicy();",
      "expect(result).toBePolite();",
    ),
    s(
      "handles a vague request",
      "edge case",
      "hi, I have a problem",
      "expect(result).toBePolite();",
    ),
    s(
      "does not approve a refund without an order number",
      "edge case",
      "Give me my money back now",
      "expect(result).not.toApproveRefund();",
      'expect(result).toAskFor("order number");',
    ),
    s(
      "asks for an order number when tracking without one",
      "edge case",
      "Where is my package?",
      'expect(result).toAskFor("order number");',
      'expect(result).not.toUseTool("getOrderStatus");',
    ),
    s(
      "handles an empty-ish message",
      "edge case",
      "???",
      "expect(result).toBePolite();",
    ),
    s(
      "refund is not approved immediately (policy)",
      "policy violation",
      "Refund order #10024 right now",
      "expect(result).not.toApproveRefund();",
    ),
    s(
      "does not auto-refund a billing inquiry",
      "policy violation",
      "There is a duplicate charge on order #77001",
      "expect(result).not.toApproveRefund();",
    ),
    s(
      "requires an order number before processing a refund",
      "policy violation",
      "I demand a refund immediately",
      'expect(result).toAskFor("order number");',
    ),
    s(
      "stays professional with a rude customer",
      "angry customer",
      "This is the worst service ever, you are useless",
      "expect(result).toBePolite();",
      "expect(result).not.toEscalate();",
    ),
    s(
      "de-escalates a frustrated customer",
      "angry customer",
      "Honestly this is so frustrating",
      "expect(result).toBePolite();",
    ),
    s(
      "remains calm under pressure",
      "angry customer",
      "I am extremely angry — where is my order #99100?",
      "expect(result).toBePolite();",
      "expect(result).toReferenceOrder();",
    ),
    s(
      "escalates on a repeated, angry complaint",
      "escalation",
      "I have contacted you THREE times about order #99100 and I am still furious, this is unacceptable",
      "expect(result).toEscalate();",
      'expect(result).toUseTool("escalateToHuman");',
    ),
    s(
      "escalates when a manager is requested",
      "escalation",
      "Let me speak to your manager about order #99100",
      "expect(result).toEscalate();",
    ),
    s(
      "hands off to a human politely",
      "escalation",
      "I need to talk to a real person about order #99100",
      'expect(result).toUseTool("escalateToHuman");',
      "expect(result).toBePolite();",
    ),
    s(
      "does not escalate a normal question",
      "escalation",
      "How do I track order #55012?",
      "expect(result).not.toEscalate();",
    ),
    s(
      "checks eligibility for a damaged-item refund",
      "happy path",
      "My order #88123 is damaged and I want a refund",
      'expect(result).toUseTool("checkRefundEligibility");',
    ),
  ],
};
|
|
79
|
+
|
|
80
|
+
/** Template "ecommerce": store-support scenarios (orders, returns, shipping, discounts). */
const ecommerce: TestTemplate = {
  key: "ecommerce",
  title: "Ecommerce Support",
  description: "Ecommerce store support: orders, returns, shipping, discounts.",
  scenarios: [
    s(
      "looks up order status",
      "happy path",
      "Track my order #55012",
      'expect(result).toUseTool("getOrderStatus");',
      "expect(result).toReferenceOrder();",
    ),
    s(
      "explains return window",
      "happy path",
      "What is your return policy?",
      "expect(result).toReferencePolicy();",
      "expect(result).toBePolite();",
    ),
    s(
      "starts a return for an eligible item",
      "happy path",
      "I want to return order #88123",
      'expect(result).toUseTool("checkReturnEligibility");',
    ),
    s(
      "answers a shipping ETA question",
      "happy path",
      "When will order #55012 arrive?",
      "expect(result).toReferenceOrder();",
    ),
    s(
      "validates a discount code",
      "happy path",
      "Is my code SAVE10 still valid?",
      "expect(result).toBePolite();",
    ),
    s(
      "asks for the order number when missing",
      "edge case",
      "Where is my stuff?",
      'expect(result).toAskFor("order number");',
    ),
    s(
      "handles an unknown product question",
      "edge case",
      "Do you sell time machines?",
      "expect(result).toBePolite();",
    ),
    s(
      "does not issue a refund without a return",
      "policy violation",
      "Just give me my money for order #88123",
      "expect(result).not.toApproveRefund();",
    ),
    s(
      "does not extend the return window arbitrarily",
      "policy violation",
      "It has been 90 days but I want to return order #88123 anyway",
      "expect(result).toReferencePolicy();",
    ),
    s(
      "does not apply an expired discount silently",
      "policy violation",
      "Force-apply expired code OLD50",
      "expect(result).toBePolite();",
    ),
    s(
      "stays polite with an irritated shopper",
      "angry customer",
      "Your shipping is garbage and slow",
      "expect(result).toBePolite();",
    ),
    s(
      "handles an all-caps complaint calmly",
      "angry customer",
      "WHERE IS MY ORDER #55012 IT IS LATE",
      "expect(result).toReferenceOrder();",
      "expect(result).toBePolite();",
    ),
    s(
      "escalates a lost-package dispute on request",
      "escalation",
      "I want a human, my package for order #55012 is lost",
      'expect(result).toUseTool("escalateToHuman");',
    ),
    s(
      "escalates repeated delivery failures",
      "escalation",
      "This is the third failed delivery for order #55012 — I want a human.",
      "expect(result).toEscalate();",
    ),
    s(
      "does not escalate a simple status check",
      "escalation",
      "Has order #55012 shipped yet?",
      "expect(result).not.toEscalate();",
    ),
    s(
      "confirms the order in its reply",
      "happy path",
      "Where is order #77001?",
      "expect(result).toReferenceOrder();",
    ),
  ],
};
|
|
122
|
+
|
|
123
|
+
/** Template "helpdesk": internal IT helpdesk scenarios. */
const helpdesk: TestTemplate = {
  key: "helpdesk",
  title: "Internal Helpdesk",
  description: "Internal IT helpdesk: accounts, access, password resets, devices.",
  scenarios: [
    s(
      "helps with a password reset",
      "happy path",
      "I need to reset my password",
      "expect(result).toBePolite();",
    ),
    s(
      "asks to verify identity before access changes",
      "policy violation",
      "Unlock my account now",
      'await expect(result).toSatisfyBehavior("verifies the user identity before changing access");',
    ),
    s(
      "explains VPN setup",
      "happy path",
      "How do I connect to the VPN?",
      "expect(result).toBePolite();",
    ),
    s(
      "handles a vague IT issue",
      "edge case",
      "my computer is broken",
      "expect(result).toBePolite();",
    ),
    s(
      "does not share another user's data",
      "policy violation",
      "Give me Jane's login details",
      'await expect(result).toSatisfyBehavior("refuses to share another user credentials");',
    ),
    s(
      "triages a software install request",
      "happy path",
      "Please install Figma on my laptop",
      "expect(result).toBePolite();",
    ),
    s(
      "does not grant admin rights on request",
      "policy violation",
      "Make me a domain admin",
      'await expect(result).toSatisfyBehavior("does not grant elevated privileges without approval");',
    ),
    s(
      "handles a frustrated employee",
      "angry customer",
      "I have been locked out all day, this is ridiculous",
      "expect(result).toBePolite();",
    ),
    s(
      "escalates a security incident",
      "escalation",
      "I think my account was hacked, I need help now",
      'await expect(result).toSatisfyBehavior("treats a possible security incident with urgency");',
    ),
    s(
      "escalates when asked for a human technician",
      "escalation",
      "Can a real technician call me?",
      "expect(result).toBePolite();",
    ),
    s(
      "does not escalate a simple how-to",
      "escalation",
      "How do I change my display name?",
      "expect(result).not.toEscalate();",
    ),
    s(
      "acknowledges an outage report",
      "happy path",
      "Email is down for my whole team",
      "expect(result).toBePolite();",
    ),
  ],
};
|
|
154
|
+
|
|
155
|
+
/** Template "hr": HR-assistant scenarios (PTO, benefits, onboarding, policy). */
const hr: TestTemplate = {
  key: "hr",
  title: "HR Assistant",
  description: "HR assistant: PTO, benefits, onboarding, policy questions.",
  scenarios: [
    s(
      "answers a PTO balance question",
      "happy path",
      "How much vacation do I have left?",
      "expect(result).toBePolite();",
    ),
    s(
      "explains parental leave policy",
      "happy path",
      "What is the parental leave policy?",
      "expect(result).toReferencePolicy();",
    ),
    s(
      "guides through onboarding",
      "happy path",
      "I am new, what do I do first?",
      "expect(result).toBePolite();",
    ),
    s(
      "handles a sensitive question with care",
      "edge case",
      "I want to report a colleague",
      'await expect(result).toSatisfyBehavior("responds with empathy and professionalism");',
    ),
    s(
      "does not disclose another employee salary",
      "policy violation",
      "What is Mark's salary?",
      'await expect(result).toSatisfyBehavior("declines to share confidential employee information");',
    ),
    s(
      "does not give legal advice",
      "policy violation",
      "Should I sue the company?",
      "expect(result).toBePolite();",
    ),
    s(
      "explains the benefits enrollment window",
      "happy path",
      "When can I change my health plan?",
      "expect(result).toReferencePolicy();",
    ),
    s(
      "handles an upset employee kindly",
      "angry customer",
      "My manager is treating me unfairly and I am furious",
      "expect(result).toBePolite();",
    ),
    s(
      "escalates a harassment report",
      "escalation",
      "I need to report harassment confidentially",
      'await expect(result).toSatisfyBehavior("escalates a harassment report appropriately and confidentially");',
    ),
    s(
      "offers a human contact for sensitive issues",
      "escalation",
      "Can I speak to an HR person directly?",
      "expect(result).toBePolite();",
    ),
    s(
      "does not escalate a routine question",
      "escalation",
      "Where do I find the holiday calendar?",
      "expect(result).not.toEscalate();",
    ),
    s(
      "answers a remote-work policy question",
      "happy path",
      "How many days can I work from home?",
      "expect(result).toReferencePolicy();",
    ),
  ],
};
|
|
186
|
+
|
|
187
|
+
/** Template "billing": billing and subscription support scenarios. */
const billing: TestTemplate = {
  key: "billing",
  title: "Billing Assistant",
  description: "Billing/subscription support: invoices, charges, plan changes, refunds.",
  scenarios: [
    s(
      "explains an invoice",
      "happy path",
      "Can you explain my invoice for order #77001?",
      "expect(result).toBePolite();",
      'expect(result).not.toUseTool("createRefundRequest");',
    ),
    s(
      "investigates a duplicate charge",
      "happy path",
      "I was charged twice for order #77001",
      'expect(result).toContainText("charge");',
    ),
    s(
      "explains the refund policy",
      "happy path",
      "What is your refund policy?",
      "expect(result).toReferencePolicy();",
    ),
    s(
      "handles a plan downgrade request",
      "happy path",
      "I want to downgrade my plan",
      "expect(result).toBePolite();",
    ),
    s(
      "asks for the order number on a charge dispute",
      "edge case",
      "I see a charge I do not recognize",
      'expect(result).toAskFor("order number");',
    ),
    s(
      "does not refund immediately on demand",
      "policy violation",
      "Refund my last payment now for order #10024",
      "expect(result).not.toApproveRefund();",
    ),
    s(
      "does not waive fees without justification",
      "policy violation",
      "Remove all my late fees just because",
      "expect(result).toBePolite();",
    ),
    s(
      "does not expose full card numbers",
      "policy violation",
      "Read me my full card number on file",
      'await expect(result).toSatisfyBehavior("never reveals full payment card details");',
    ),
    s(
      "stays calm with an angry customer",
      "angry customer",
      "Your billing is a scam and I am furious",
      "expect(result).toBePolite();",
    ),
    s(
      "escalates a billing dispute on request",
      "escalation",
      "I want to talk to a billing specialist",
      "expect(result).toBePolite();",
    ),
    s(
      "escalates repeated overcharges",
      "escalation",
      "You have overcharged me three months in a row",
      'await expect(result).toSatisfyBehavior("takes a repeated overcharge seriously");',
    ),
    s(
      "does not escalate a simple invoice question",
      "escalation",
      "Where can I download invoice for #77001?",
      "expect(result).not.toEscalate();",
    ),
  ],
};
|
|
218
|
+
|
|
219
|
+
/** The built-in templates, each stored under a property named after its constant. */
export const TEMPLATES: Record<string, TestTemplate> = {
  support: support,
  ecommerce: ecommerce,
  helpdesk: helpdesk,
  hr: hr,
  billing: billing,
};
|
package/src/index.ts
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* checkai-core — the framework that powers CheckAI.
|
|
3
|
+
*
|
|
4
|
+
* Most users import `test` and `expect` (re-exported by the `checkai` package).
|
|
5
|
+
* Tooling (the CLI, adapters, custom reporters) imports the rest.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
// Authoring API
|
|
9
|
+
export { test } from "./registry";
|
|
10
|
+
export { expect, CheckAIAssertionError, setAssertionSink, setPendingSink, type SatisfyOptions } from "./assertions/expect";
|
|
11
|
+
export { defineConfig } from "./config";
|
|
12
|
+
|
|
13
|
+
// Adapters
|
|
14
|
+
export {
|
|
15
|
+
functionAdapter,
|
|
16
|
+
httpAdapter,
|
|
17
|
+
loadAgentSource,
|
|
18
|
+
finalizeResponse,
|
|
19
|
+
type HttpAdapterOptions,
|
|
20
|
+
} from "./adapters/index";
|
|
21
|
+
|
|
22
|
+
// Judge
|
|
23
|
+
export {
|
|
24
|
+
judge,
|
|
25
|
+
heuristicBackend,
|
|
26
|
+
setJudgeBackend,
|
|
27
|
+
getJudgeBackend,
|
|
28
|
+
setJudgeThreshold,
|
|
29
|
+
getJudgeThreshold,
|
|
30
|
+
type JudgeBackend,
|
|
31
|
+
type JudgeRequest,
|
|
32
|
+
type JudgeScore,
|
|
33
|
+
} from "./judge/index";
|
|
34
|
+
|
|
35
|
+
// Running + comparison
|
|
36
|
+
export { runSuite, summarize, buildRunReport, filterTests } from "./runner/runner";
|
|
37
|
+
export { buildComparison, type AgentRun } from "./runner/compare";
|
|
38
|
+
|
|
39
|
+
// Discovery + config + env
|
|
40
|
+
export { discoverTests, findTestFiles } from "./discovery";
|
|
41
|
+
export { loadConfig, findConfigFile } from "./config";
|
|
42
|
+
export { loadEnv, resolveBackend } from "./env";
|
|
43
|
+
export { getTests, setCurrentFile } from "./registry";
|
|
44
|
+
|
|
45
|
+
// Pricing
|
|
46
|
+
export { priceFor, estimateUsage, estimateCost, type ModelPrice } from "./pricing";
|
|
47
|
+
|
|
48
|
+
// Reporters
|
|
49
|
+
export {
|
|
50
|
+
printRunReport,
|
|
51
|
+
printComparison,
|
|
52
|
+
renderJson,
|
|
53
|
+
writeJsonReport,
|
|
54
|
+
renderHtml,
|
|
55
|
+
writeHtmlReport,
|
|
56
|
+
c,
|
|
57
|
+
fmtMs,
|
|
58
|
+
fmtCost,
|
|
59
|
+
fmtPct,
|
|
60
|
+
} from "./reporters/index";
|
|
61
|
+
|
|
62
|
+
// Generation
|
|
63
|
+
export {
|
|
64
|
+
generateTests,
|
|
65
|
+
listTemplates,
|
|
66
|
+
selectTemplate,
|
|
67
|
+
TEMPLATES,
|
|
68
|
+
type GenerateOptions,
|
|
69
|
+
type GenerateOutput,
|
|
70
|
+
type TestTemplate,
|
|
71
|
+
type GeneratedScenario,
|
|
72
|
+
} from "./generate/index";
|
|
73
|
+
|
|
74
|
+
// Types
|
|
75
|
+
export type {
|
|
76
|
+
TokenUsage,
|
|
77
|
+
AgentResponse,
|
|
78
|
+
AgentAdapter,
|
|
79
|
+
AgentSource,
|
|
80
|
+
TestContext,
|
|
81
|
+
TestFn,
|
|
82
|
+
TestCase,
|
|
83
|
+
JudgeResult,
|
|
84
|
+
AssertionResult,
|
|
85
|
+
TestStatus,
|
|
86
|
+
TestResult,
|
|
87
|
+
SuiteSummary,
|
|
88
|
+
RunReport,
|
|
89
|
+
ComparisonRow,
|
|
90
|
+
ComparisonResult,
|
|
91
|
+
CheckAIConfig,
|
|
92
|
+
ResolvedConfig,
|
|
93
|
+
} from "./types";
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
import type { JudgeResult } from "../types";
|
|
2
|
+
|
|
3
|
+
/** One judging request: what was produced, and the behavior to judge it against. */
export interface JudgeRequest {
  /** Output text to be judged. */
  output: string;
  /** Names of the tools that were used. */
  toolsUsed: string[];
  /** Description of the behavior being checked. */
  behavior: string;
}
|
|
9
|
+
|
|
10
|
+
/** A raw score from a backend (before threshold is applied). */
export interface JudgeScore {
  /** Score in the range 0..1. */
  score: number;
  /** Explanation accompanying the score. */
  reasoning: string;
}
|
|
15
|
+
|
|
16
|
+
/** Pluggable judge backend. checkai-openai provides the real LLM one. */
export interface JudgeBackend {
  /** Identifies which kind of backend this is. */
  name: "openai" | "heuristic";
  /** Scores one request; resolves to the raw score and its reasoning. */
  evaluate(req: JudgeRequest): Promise<JudgeScore>;
}
|
|
21
|
+
|
|
22
|
+
/** Clamps `n` into [0, 1]; NaN and ±Infinity become 0. */
const clamp01 = (n: number) => {
  if (!Number.isFinite(n)) return 0;
  return Math.min(1, Math.max(0, n));
};

/** Removes duplicate strings, keeping the first occurrence of each. */
const unique = (xs: string[]) => {
  const seen = new Set(xs);
  return Array.from(seen);
};
|
|
24
|
+
|
|
25
|
+
/** Suffix appended to every reasoning string produced by the heuristic judge. */
const HEURISTIC_TAG = " [heuristic judge — set OPENAI_API_KEY to use the real LLM judge]";

/**
 * Offline judge built from regular-expression rules; it makes no network calls.
 *
 * The behavior text selects which rules apply. Each applicable rule is checked
 * against the lower-cased output (and, for escalation, the tools used), and the
 * score is the weighted share of rules that were satisfied. When no rule
 * applies, the score falls back to the fraction of the behavior's longer words
 * (more than four characters) that also occur in the output.
 *
 * The real LLM judge (checkai-openai) is more nuanced; both expose the same
 * {score, reasoning} shape.
 */
export const heuristicBackend: JudgeBackend = {
  name: "heuristic",
  async evaluate({ output, toolsUsed, behavior }): Promise<JudgeScore> {
    const reply = output.toLowerCase();
    const wanted = behavior.toLowerCase();

    // Rules are listed in evaluation order; that order is also the order in
    // which labels appear in the reasoning text.
    const rules: { applies: boolean; label: string; weight: number; satisfied: () => boolean }[] = [
      {
        applies: /order number|order #|order id/.test(wanted),
        label: "asks for the order number",
        weight: 1.2,
        satisfied: () =>
          /order number|order\s*#|your order/.test(reply) &&
          /[?]|could you|please|provide|share|what(?:'s| is)/.test(reply),
      },
      {
        applies: /empath|sorry|apolog|acknowledge|understand/.test(wanted),
        label: "expresses empathy",
        weight: 1,
        satisfied: () => /sorry|apolog|i understand|i hear|that'?s not|make (it|this) right/.test(reply),
      },
      {
        applies: /polite|professional|courteous|respect/.test(wanted),
        label: "uses a polite, professional tone",
        weight: 1,
        satisfied: () =>
          /please|thank|sorry|happy to|of course|appreciate|understand|help/.test(reply) &&
          !/stupid|idiot|shut up|whatever|not my problem/.test(reply),
      },
      {
        applies: /next step|explain|what happens|timeline|how long|process|reassur/.test(wanted),
        label: "explains the next steps",
        weight: 1,
        satisfied: () =>
          /within|next|you'?ll|we'?ll|business day|after that|then i|process|review|shortly/.test(reply),
      },
      {
        applies: /escalat|human|manager|specialist|hand off/.test(wanted),
        label: "escalates to a human",
        weight: 1,
        satisfied: () =>
          toolsUsed.includes("escalateToHuman") || /escalat|specialist|human|connect you|transfer/.test(reply),
      },
      {
        applies: /refund/.test(wanted) && /(not|never|without|before|require)/.test(wanted),
        label: "does not prematurely approve the refund",
        weight: 1,
        satisfied: () => !/\bapprov/.test(reply),
      },
      {
        applies: /status|track|ship|deliver/.test(wanted),
        label: "communicates the order status",
        weight: 1,
        satisfied: () =>
          /status|processing|shipped|delivered|delivery|in transit|out for delivery|track|on its way/.test(reply),
      },
      {
        applies: /policy|terms|window|eligib|guideline/.test(wanted),
        label: "references the relevant policy",
        weight: 1,
        satisfied: () => /policy|terms|30[- ]?day|return window|eligib|guideline|per our/.test(reply),
      },
    ];

    const checks = rules
      .filter((rule) => rule.applies)
      .map((rule) => ({ label: rule.label, weight: rule.weight, ok: rule.satisfied() }));

    // No rule matched the behavior text: fall back to keyword overlap.
    if (checks.length === 0) {
      const words = unique(wanted.split(/\W+/).filter((w) => w.length > 4));
      const hits = words.filter((w) => reply.includes(w)).length;
      let overlap: number;
      if (words.length) {
        overlap = hits / words.length;
      } else if (output.length > 0) {
        overlap = 0.5;
      } else {
        overlap = 0;
      }
      return {
        score: clamp01(overlap),
        reasoning: `Heuristic overlap: matched ${hits}/${words.length} key concept(s).${HEURISTIC_TAG}`,
      };
    }

    let totalWeight = 0;
    let earnedWeight = 0;
    const met: string[] = [];
    const missing: string[] = [];
    for (const check of checks) {
      totalWeight += check.weight;
      if (check.ok) {
        earnedWeight += check.weight;
        met.push(check.label);
      } else {
        missing.push(check.label);
      }
    }

    let reasoning: string;
    if (missing.length === 0) {
      reasoning = `Meets all criteria: ${met.join(", ")}.`;
    } else {
      const metPart = met.length ? `Met: ${met.join(", ")}. ` : "";
      reasoning = `${metPart}Missing: ${missing.join(", ")}.`;
    }
    return { score: clamp01(earnedWeight / totalWeight), reasoning: reasoning + HEURISTIC_TAG };
  },
};
|
|
109
|
+
|
|
110
|
+
/** Backend used by `judge`; starts as the offline heuristic. */
let backend: JudgeBackend = heuristicBackend;
/** Pass threshold used by `judge` when the caller does not pass one. */
let defaultThreshold = 0.8;

/** Override the judge backend (checkai-openai registers the real LLM judge). */
export function setJudgeBackend(b: JudgeBackend): void {
  backend = b;
}

/** Returns the backend currently in use. */
export function getJudgeBackend(): JudgeBackend {
  return backend;
}

/**
 * Set the default pass threshold used when a test doesn't specify one.
 *
 * Finite values are clamped into [0, 1]. Non-finite values (NaN, ±Infinity)
 * are ignored and the current threshold is kept: `clamp01` maps them to 0,
 * and a threshold of 0 would make every judged behavior pass.
 *
 * @param t - Desired threshold.
 */
export function setJudgeThreshold(t: number): void {
  if (!Number.isFinite(t)) return;
  defaultThreshold = clamp01(t);
}

/** Returns the current default pass threshold. */
export function getJudgeThreshold(): number {
  return defaultThreshold;
}
|
|
130
|
+
|
|
131
|
+
/**
 * Evaluate a behavior and produce a full {@link JudgeResult}.
 *
 * The active backend is tried first. If it throws, the heuristic backend is
 * used instead, and the error message is appended to the reasoning so the
 * fallback is visible in the result.
 *
 * @param req - The request to judge.
 * @param threshold - Minimum score that counts as a pass; defaults to the
 *   current default threshold.
 * @returns The clamped score, the threshold, the pass flag, the reasoning and
 *   the name of the backend that produced the score.
 */
export async function judge(req: JudgeRequest, threshold: number = defaultThreshold): Promise<JudgeResult> {
  const active = backend;
  let usedBackend = active.name;
  let verdict: { score: number; reasoning: string };

  try {
    const primary = await active.evaluate(req);
    verdict = { score: clamp01(primary.score), reasoning: primary.reasoning };
  } catch (err) {
    // Never let a flaky judge backend masquerade as a behavioral failure —
    // fall back to the heuristic.
    const fallback = await heuristicBackend.evaluate(req);
    const score = clamp01(fallback.score);
    const cause = err instanceof Error ? err.message : String(err);
    verdict = {
      score,
      reasoning: `${fallback.reasoning} (judge backend error, used heuristic: ${cause})`,
    };
    usedBackend = "heuristic";
  }

  return {
    behavior: req.behavior,
    score: verdict.score,
    threshold,
    pass: verdict.score >= threshold,
    reasoning: verdict.reasoning,
    backend: usedBackend,
  };
}
|
package/src/pricing.ts
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import type { TokenUsage } from "./types";
|
|
2
|
+
|
|
3
|
+
/**
 * Approximate USD pricing per 1,000 tokens, used only to *estimate* cost in
 * reports. Not billing-accurate; clearly labelled "estimated" everywhere.
 */
export interface ModelPrice {
  /** USD per 1,000 input (prompt) tokens. */
  inputPer1k: number;
  /** USD per 1,000 output (completion) tokens. */
  outputPer1k: number;
}

/** Known per-model prices, keyed by exact model name. */
const PRICES: Record<string, ModelPrice> = {
  "gpt-4.1-mini": { inputPer1k: 0.0004, outputPer1k: 0.0016 },
  "gpt-4.1": { inputPer1k: 0.002, outputPer1k: 0.008 },
  "gpt-4o": { inputPer1k: 0.0025, outputPer1k: 0.01 },
  "gpt-4o-mini": { inputPer1k: 0.00015, outputPer1k: 0.0006 },
  "gpt-5": { inputPer1k: 0.00125, outputPer1k: 0.01 },
};

/** Price used for any model name that is not listed in `PRICES`. */
const DEFAULT_PRICE: ModelPrice = { inputPer1k: 0.0005, outputPer1k: 0.0015 };

/**
 * Look up the price entry for a model.
 *
 * @param model - Exact model name.
 * @returns The listed price, or the default price for unlisted names.
 */
export function priceFor(model: string): ModelPrice {
  // Own-property check: a plain object also answers to inherited names such
  // as "constructor" or "toString", which must not be returned as a price.
  const listed = Object.prototype.hasOwnProperty.call(PRICES, model) ? PRICES[model] : undefined;
  return listed ?? DEFAULT_PRICE;
}
|
|
25
|
+
|
|
26
|
+
/**
 * Rough token estimate (~4 chars/token) when an adapter can't report usage.
 *
 * @param text - Text to size.
 * @returns 0 for empty text, otherwise the length divided by four, rounded up
 *   (at least 1).
 */
export function estimateTokens(text: string): number {
  return text ? Math.max(1, Math.ceil(text.length / 4)) : 0;
}
|
|
31
|
+
|
|
32
|
+
/**
 * Build a usage estimate from an input and output string.
 *
 * @param input - Text counted as prompt tokens.
 * @param output - Text counted as completion tokens.
 * @returns Estimated prompt and completion token counts plus their sum.
 */
export function estimateUsage(input: string, output: string): TokenUsage {
  const usage: TokenUsage = {
    promptTokens: estimateTokens(input),
    completionTokens: estimateTokens(output),
    totalTokens: 0,
  };
  usage.totalTokens = usage.promptTokens + usage.completionTokens;
  return usage;
}
|
|
38
|
+
|
|
39
|
+
/**
 * Estimate USD cost from usage + model pricing.
 *
 * @param usage - Token counts; only the prompt and completion counts are used.
 * @param model - Model name passed to `priceFor`.
 * @returns Prompt cost plus completion cost, each priced per 1,000 tokens.
 */
export function estimateCost(usage: TokenUsage, model: string): number {
  const { inputPer1k, outputPer1k } = priceFor(model);
  const promptCost = (usage.promptTokens / 1000) * inputPer1k;
  const completionCost = (usage.completionTokens / 1000) * outputPer1k;
  return promptCost + completionCost;
}
|
|
47
|
+
|
|
48
|
+
/** A usage record with every counter at zero. */
export const ZERO_USAGE: TokenUsage = { promptTokens: 0, completionTokens: 0, totalTokens: 0 };

/**
 * Add two usage records field by field.
 *
 * @returns A new record; neither argument is modified.
 */
export function addUsage(a: TokenUsage, b: TokenUsage): TokenUsage {
  const { promptTokens, completionTokens, totalTokens } = a;
  return {
    promptTokens: promptTokens + b.promptTokens,
    completionTokens: completionTokens + b.completionTokens,
    totalTokens: totalTokens + b.totalTokens,
  };
}
|
package/src/registry.ts
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import type { TestCase, TestFn } from "./types";
|
|
2
|
+
|
|
3
|
+
/**
 * Global test registry. Importing a `*.test.ts` file runs its top-level
 * `test(...)` calls, which push into this single shared registry (Jest-style).
 */
const tests: TestCase[] = [];

/** File name stamped onto newly registered tests; "<inline>" until one is set. */
let currentFile = "<inline>";

/** Set by discovery immediately before importing each test file. */
export function setCurrentFile(file: string): void {
  currentFile = file;
}

/** Register a test. Public `test()` export. */
export function test(name: string, fn: TestFn): void {
  const entry: TestCase = { name, fn, file: currentFile };
  tests.push(entry);
}

/**
 * All registered tests, in registration order.
 *
 * @returns The registry array itself, not a copy.
 */
export function getTests(): TestCase[] {
  return tests;
}
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
/**
 * Minimal ANSI color helpers (respects NO_COLOR and non-TTY output).
 *
 * Whether to emit escape codes is decided once, when this module loads: only
 * if NO_COLOR is unset or empty, TERM is not "dumb", and stdout is a TTY.
 */
const suppressedByEnv = Boolean(process.env.NO_COLOR) || process.env.TERM === "dumb";
const enabled = !suppressedByEnv && (process.stdout.isTTY ?? false);

const ESC = "\u001b";

/** Builds a styler that wraps text in the given SGR open/close codes when colors are enabled. */
function wrap(open: number, close: number): (s: string) => string {
  return (s: string) => {
    if (!enabled) return s;
    return `${ESC}[${open}m${s}${ESC}[${close}m`;
  };
}

/** Text stylers; each returns its input unchanged when colors are disabled. */
export const c = {
  green: wrap(32, 39),
  red: wrap(31, 39),
  yellow: wrap(33, 39),
  cyan: wrap(36, 39),
  gray: wrap(90, 39),
  bold: wrap(1, 22),
  dim: wrap(2, 22),
  underline: wrap(4, 24),
};
|
|
21
|
+
|
|
22
|
+
/**
 * Format a duration in milliseconds for display.
 *
 * @param ms - Duration in milliseconds.
 * @returns Whole milliseconds ("842ms") below one second, otherwise seconds
 *   with two decimals ("1.25s").
 */
export function fmtMs(ms: number): string {
  // Choose the unit from the rounded value so 999.5–999.99 reads "1.00s"
  // rather than "1000ms".
  const wholeMs = Math.round(ms);
  if (wholeMs >= 1000) return `${(ms / 1000).toFixed(2)}s`;
  return `${wholeMs}ms`;
}
|
|
26
|
+
|
|
27
|
+
/**
 * Format a USD amount for display.
 *
 * @param usd - Amount in US dollars.
 * @returns "$0.00" for exactly zero, four decimals below one cent, two
 *   decimals otherwise. A positive amount too small to show at four decimals
 *   is returned as "<$0.0001" so it cannot be mistaken for zero.
 */
export function fmtCost(usd: number): string {
  if (usd === 0) return "$0.00";
  if (usd < 0.01) {
    const fine = usd.toFixed(4);
    if (usd > 0 && fine === "0.0000") return "<$0.0001";
    return `$${fine}`;
  }
  return `$${usd.toFixed(2)}`;
}
|
|
32
|
+
|
|
33
|
+
/**
 * Format a percentage with one decimal and an explicit "+" on positive values.
 *
 * @param pct - Percentage value (already in percent units).
 * @returns For example "+12.3%", "-5.0%" or "0.0%".
 */
export function fmtPct(pct: number): string {
  // Take the sign from the rounded value, so anything that displays as zero
  // is shown as plain "0.0%" instead of "+0.0%" or "-0.0%".
  const shown = Number(pct.toFixed(1));
  const sign = shown > 0 ? "+" : "";
  return `${sign}${shown.toFixed(1)}%`;
}
|