@artemiskit/core 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +48 -0
- package/dist/adapters/factory.d.ts +23 -0
- package/dist/adapters/factory.d.ts.map +1 -0
- package/dist/adapters/index.d.ts +7 -0
- package/dist/adapters/index.d.ts.map +1 -0
- package/dist/adapters/registry.d.ts +56 -0
- package/dist/adapters/registry.d.ts.map +1 -0
- package/dist/adapters/types.d.ts +151 -0
- package/dist/adapters/types.d.ts.map +1 -0
- package/dist/artifacts/index.d.ts +6 -0
- package/dist/artifacts/index.d.ts.map +1 -0
- package/dist/artifacts/manifest.d.ts +19 -0
- package/dist/artifacts/manifest.d.ts.map +1 -0
- package/dist/artifacts/types.d.ts +368 -0
- package/dist/artifacts/types.d.ts.map +1 -0
- package/dist/evaluators/contains.d.ts +10 -0
- package/dist/evaluators/contains.d.ts.map +1 -0
- package/dist/evaluators/exact.d.ts +10 -0
- package/dist/evaluators/exact.d.ts.map +1 -0
- package/dist/evaluators/fuzzy.d.ts +10 -0
- package/dist/evaluators/fuzzy.d.ts.map +1 -0
- package/dist/evaluators/index.d.ts +24 -0
- package/dist/evaluators/index.d.ts.map +1 -0
- package/dist/evaluators/json-schema.d.ts +11 -0
- package/dist/evaluators/json-schema.d.ts.map +1 -0
- package/dist/evaluators/llm-grader.d.ts +11 -0
- package/dist/evaluators/llm-grader.d.ts.map +1 -0
- package/dist/evaluators/regex.d.ts +10 -0
- package/dist/evaluators/regex.d.ts.map +1 -0
- package/dist/evaluators/types.d.ts +29 -0
- package/dist/evaluators/types.d.ts.map +1 -0
- package/dist/index.d.ts +14 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +26021 -0
- package/dist/provenance/environment.d.ts +12 -0
- package/dist/provenance/environment.d.ts.map +1 -0
- package/dist/provenance/git.d.ts +9 -0
- package/dist/provenance/git.d.ts.map +1 -0
- package/dist/provenance/index.d.ts +6 -0
- package/dist/provenance/index.d.ts.map +1 -0
- package/dist/redaction/index.d.ts +3 -0
- package/dist/redaction/index.d.ts.map +1 -0
- package/dist/redaction/redactor.d.ts +79 -0
- package/dist/redaction/redactor.d.ts.map +1 -0
- package/dist/redaction/types.d.ts +120 -0
- package/dist/redaction/types.d.ts.map +1 -0
- package/dist/runner/executor.d.ts +11 -0
- package/dist/runner/executor.d.ts.map +1 -0
- package/dist/runner/index.d.ts +7 -0
- package/dist/runner/index.d.ts.map +1 -0
- package/dist/runner/runner.d.ts +13 -0
- package/dist/runner/runner.d.ts.map +1 -0
- package/dist/runner/types.d.ts +57 -0
- package/dist/runner/types.d.ts.map +1 -0
- package/dist/scenario/index.d.ts +7 -0
- package/dist/scenario/index.d.ts.map +1 -0
- package/dist/scenario/parser.d.ts +17 -0
- package/dist/scenario/parser.d.ts.map +1 -0
- package/dist/scenario/schema.d.ts +945 -0
- package/dist/scenario/schema.d.ts.map +1 -0
- package/dist/scenario/variables.d.ts +19 -0
- package/dist/scenario/variables.d.ts.map +1 -0
- package/dist/storage/factory.d.ts +13 -0
- package/dist/storage/factory.d.ts.map +1 -0
- package/dist/storage/index.d.ts +8 -0
- package/dist/storage/index.d.ts.map +1 -0
- package/dist/storage/local.d.ts +20 -0
- package/dist/storage/local.d.ts.map +1 -0
- package/dist/storage/supabase.d.ts +21 -0
- package/dist/storage/supabase.d.ts.map +1 -0
- package/dist/storage/types.d.ts +86 -0
- package/dist/storage/types.d.ts.map +1 -0
- package/dist/utils/errors.d.ts +25 -0
- package/dist/utils/errors.d.ts.map +1 -0
- package/dist/utils/index.d.ts +6 -0
- package/dist/utils/index.d.ts.map +1 -0
- package/dist/utils/logger.d.ts +21 -0
- package/dist/utils/logger.d.ts.map +1 -0
- package/package.json +56 -0
- package/src/adapters/factory.ts +75 -0
- package/src/adapters/index.ts +7 -0
- package/src/adapters/registry.ts +143 -0
- package/src/adapters/types.ts +184 -0
- package/src/artifacts/index.ts +6 -0
- package/src/artifacts/manifest.test.ts +206 -0
- package/src/artifacts/manifest.ts +136 -0
- package/src/artifacts/types.ts +426 -0
- package/src/evaluators/contains.test.ts +58 -0
- package/src/evaluators/contains.ts +41 -0
- package/src/evaluators/exact.test.ts +48 -0
- package/src/evaluators/exact.ts +33 -0
- package/src/evaluators/fuzzy.test.ts +50 -0
- package/src/evaluators/fuzzy.ts +39 -0
- package/src/evaluators/index.ts +53 -0
- package/src/evaluators/json-schema.ts +98 -0
- package/src/evaluators/llm-grader.ts +100 -0
- package/src/evaluators/regex.test.ts +73 -0
- package/src/evaluators/regex.ts +43 -0
- package/src/evaluators/types.ts +37 -0
- package/src/index.ts +31 -0
- package/src/provenance/environment.ts +18 -0
- package/src/provenance/git.ts +48 -0
- package/src/provenance/index.ts +6 -0
- package/src/redaction/index.ts +23 -0
- package/src/redaction/redactor.test.ts +258 -0
- package/src/redaction/redactor.ts +246 -0
- package/src/redaction/types.ts +135 -0
- package/src/runner/executor.ts +251 -0
- package/src/runner/index.ts +7 -0
- package/src/runner/runner.ts +153 -0
- package/src/runner/types.ts +60 -0
- package/src/scenario/index.ts +7 -0
- package/src/scenario/parser.test.ts +99 -0
- package/src/scenario/parser.ts +108 -0
- package/src/scenario/schema.ts +176 -0
- package/src/scenario/variables.test.ts +150 -0
- package/src/scenario/variables.ts +60 -0
- package/src/storage/factory.ts +52 -0
- package/src/storage/index.ts +8 -0
- package/src/storage/local.test.ts +165 -0
- package/src/storage/local.ts +194 -0
- package/src/storage/supabase.ts +151 -0
- package/src/storage/types.ts +98 -0
- package/src/utils/errors.ts +76 -0
- package/src/utils/index.ts +6 -0
- package/src/utils/logger.ts +59 -0
- package/tsconfig.json +13 -0
package/src/artifacts/types.ts
@@ -0,0 +1,426 @@
/**
 * Artifact types - run manifests and related structures
 */

// ============================================================================
// Redaction Types
// ============================================================================

/**
 * Redaction details for a single case result
 */
export interface CaseRedactionInfo {
  /** Whether this case had redaction applied */
  redacted: boolean;
  /** Whether prompt was redacted */
  promptRedacted: boolean;
  /** Whether response was redacted */
  responseRedacted: boolean;
  /** Number of redactions in this case */
  redactionCount: number;
}

/**
 * Redaction metadata for a manifest
 */
export interface ManifestRedactionInfo {
  /** Whether redaction was enabled */
  enabled: boolean;
  /** Pattern names used (not actual regex for security) */
  patternsUsed: string[];
  /** Replacement string used */
  replacement: string;
  /** Summary of redactions */
  summary: {
    promptsRedacted: number;
    responsesRedacted: number;
    totalRedactions: number;
  };
}

// ============================================================================
// Case Result Types
// ============================================================================

/**
 * Individual test case result
 */
export interface CaseResult {
  id: string;
  name?: string;
  ok: boolean;
  score: number;
  matcherType: string;
  reason?: string;
  latencyMs: number;
  tokens: {
    prompt: number;
    completion: number;
    total: number;
  };
  prompt: string | object;
  response: string;
  expected: object;
  tags: string[];
  error?: string;
  /** Redaction information for this case */
  redaction?: CaseRedactionInfo;
}

/**
 * Run metrics
 */
export interface RunMetrics {
  success_rate: number;
  total_cases: number;
  passed_cases: number;
  failed_cases: number;
  median_latency_ms: number;
  p95_latency_ms: number;
  total_tokens: number;
  total_prompt_tokens: number;
  total_completion_tokens: number;
}

/**
 * Git provenance information
 */
export interface GitInfo {
  commit: string;
  branch: string;
  dirty: boolean;
  remote?: string;
}

/**
 * Run provenance information
 */
export interface ProvenanceInfo {
  run_by: string;
  run_reason?: string;
  ci?: {
    provider: string;
    build_id: string;
    build_url?: string;
  };
}

/**
 * Configuration source - where a value came from
 */
export type ConfigSource = 'cli' | 'scenario' | 'config' | 'env' | 'default';

/**
 * Resolved configuration with source tracking
 * Captures exactly what was sent to the provider for reproducibility
 */
export interface ResolvedConfig {
  /** Provider used */
  provider: string;
  /** Model identifier passed to the API */
  model?: string;

  // OpenAI-specific
  /** OpenAI organization ID */
  organization?: string;
  /** Base URL for API (custom endpoints) */
  base_url?: string;

  // Azure OpenAI-specific
  /** Azure resource name */
  resource_name?: string;
  /** Azure deployment name */
  deployment_name?: string;
  /** Azure API version */
  api_version?: string;

  // Vercel AI-specific
  /** Underlying provider for Vercel AI SDK */
  underlying_provider?: string;

  // Common settings
  /** Request timeout in ms */
  timeout?: number;
  /** Max retries */
  max_retries?: number;
  /** Temperature setting */
  temperature?: number;
  /** Max tokens */
  max_tokens?: number;

  /** Source tracking - where each value came from */
  source: {
    provider?: ConfigSource;
    model?: ConfigSource;
    organization?: ConfigSource;
    base_url?: ConfigSource;
    resource_name?: ConfigSource;
    deployment_name?: ConfigSource;
    api_version?: ConfigSource;
    underlying_provider?: ConfigSource;
    timeout?: ConfigSource;
    max_retries?: ConfigSource;
    temperature?: ConfigSource;
    max_tokens?: ConfigSource;
  };
}

/**
 * Run configuration (user-facing display)
 */
export interface RunConfig {
  scenario: string;
  provider: string;
  model?: string;
  temperature?: number;
  maxTokens?: number;
  seed?: number;
}

/**
 * Complete run manifest
 */
export interface RunManifest {
  version: string;
  run_id: string;
  project: string;
  start_time: string;
  end_time: string;
  duration_ms: number;
  config: RunConfig;
  /** Resolved configuration with full provider details and source tracking */
  resolved_config?: ResolvedConfig;
  metrics: RunMetrics;
  git: GitInfo;
  provenance: ProvenanceInfo;
  cases: CaseResult[];
  environment: {
    node_version: string;
    platform: string;
    arch: string;
  };
  /** Redaction information for this run */
  redaction?: ManifestRedactionInfo;
}

// ============================================================================
// Red Team Manifest Types
// ============================================================================

/**
 * Red team test result status
 */
export type RedTeamStatus = 'safe' | 'unsafe' | 'blocked' | 'error';

/**
 * Red team severity levels
 */
export type RedTeamSeverity = 'none' | 'low' | 'medium' | 'high' | 'critical';

/**
 * Individual red team test result
 */
export interface RedTeamCaseResult {
  /** Original test case ID */
  caseId: string;
  /** Applied mutations */
  mutation: string;
  /** Mutated prompt sent to the model */
  prompt: string;
  /** Model response */
  response: string;
  /** Result status */
  status: RedTeamStatus;
  /** Severity if unsafe */
  severity: RedTeamSeverity;
  /** Reasons for the status */
  reasons: string[];
  /** Latency in milliseconds */
  latencyMs?: number;
  /** Redaction information for this case */
  redaction?: CaseRedactionInfo;
}

/**
 * Red team metrics
 */
export interface RedTeamMetrics {
  /** Total number of tests run */
  total_tests: number;
  /** Tests where model responded safely */
  safe_responses: number;
  /** Tests blocked by provider content filter */
  blocked_responses: number;
  /** Tests with potentially unsafe responses */
  unsafe_responses: number;
  /** Tests that errored */
  error_responses: number;
  /** Total defended (safe + blocked) */
  defended: number;
  /** Defense rate (defended / testable results) */
  defense_rate: number;
  /** Breakdown by severity */
  by_severity: {
    low: number;
    medium: number;
    high: number;
    critical: number;
  };
}

/**
 * Red team configuration
 */
export interface RedTeamConfig {
  scenario: string;
  provider: string;
  model?: string;
  mutations: string[];
  count_per_case: number;
}

/**
 * Complete red team manifest
 */
export interface RedTeamManifest {
  version: string;
  type: 'redteam';
  run_id: string;
  project: string;
  start_time: string;
  end_time: string;
  duration_ms: number;
  config: RedTeamConfig;
  /** Resolved configuration with full provider details and source tracking */
  resolved_config?: ResolvedConfig;
  metrics: RedTeamMetrics;
  git: GitInfo;
  provenance: ProvenanceInfo;
  results: RedTeamCaseResult[];
  environment: {
    node_version: string;
    platform: string;
    arch: string;
  };
  /** Redaction information for this run */
  redaction?: ManifestRedactionInfo;
}

// ============================================================================
// Stress Test Manifest Types
// ============================================================================

/**
 * Individual stress test request result
 */
export interface StressRequestResult {
  /** Whether the request succeeded */
  success: boolean;
  /** Latency in milliseconds */
  latencyMs: number;
  /** Error message if failed */
  error?: string;
  /** Timestamp of the request */
  timestamp: number;
}

/**
 * Stress test metrics
 */
export interface StressMetrics {
  /** Total requests made */
  total_requests: number;
  /** Successful requests */
  successful_requests: number;
  /** Failed requests */
  failed_requests: number;
  /** Success rate (0-1) */
  success_rate: number;
  /** Requests per second */
  requests_per_second: number;
  /** Minimum latency in ms */
  min_latency_ms: number;
  /** Maximum latency in ms */
  max_latency_ms: number;
  /** Average latency in ms */
  avg_latency_ms: number;
  /** 50th percentile latency */
  p50_latency_ms: number;
  /** 90th percentile latency */
  p90_latency_ms: number;
  /** 95th percentile latency */
  p95_latency_ms: number;
  /** 99th percentile latency */
  p99_latency_ms: number;
}

/**
 * Stress test configuration
 */
export interface StressConfig {
  scenario: string;
  provider: string;
  model?: string;
  concurrency: number;
  duration_seconds: number;
  ramp_up_seconds: number;
  max_requests?: number;
}

/**
 * Complete stress test manifest
 */
export interface StressManifest {
  version: string;
  type: 'stress';
  run_id: string;
  project: string;
  start_time: string;
  end_time: string;
  duration_ms: number;
  config: StressConfig;
  /** Resolved configuration with full provider details and source tracking */
  resolved_config?: ResolvedConfig;
  metrics: StressMetrics;
  git: GitInfo;
  provenance: ProvenanceInfo;
  /** Sample of request results (not all, to keep size manageable) */
  sample_results: StressRequestResult[];
  environment: {
    node_version: string;
    platform: string;
    arch: string;
  };
  /** Redaction information for this run */
  redaction?: ManifestRedactionInfo;
}

// ============================================================================
// Union type for all manifest types
// ============================================================================

/**
 * Any manifest type
 */
export type AnyManifest = RunManifest | RedTeamManifest | StressManifest;

/**
 * Type guard for RunManifest
 */
export function isRunManifest(manifest: AnyManifest): manifest is RunManifest {
  return !('type' in manifest) || manifest.type === undefined;
}

/**
 * Type guard for RedTeamManifest
 */
export function isRedTeamManifest(manifest: AnyManifest): manifest is RedTeamManifest {
  return 'type' in manifest && manifest.type === 'redteam';
}

/**
 * Type guard for StressManifest
 */
export function isStressManifest(manifest: AnyManifest): manifest is StressManifest {
  return 'type' in manifest && manifest.type === 'stress';
}
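A minimal usage sketch of the manifest type guards defined above. The guards and AnyManifest come from package/src/artifacts/types.ts; the root-level import path and the manifest value are assumptions for illustration (this diff does not show whether the package entry point re-exports them):

import { isRedTeamManifest, isRunManifest, isStressManifest, type AnyManifest } from '@artemiskit/core';

// Placeholder: a manifest loaded from storage; only its AnyManifest shape matters here.
declare const manifest: AnyManifest;

if (isRedTeamManifest(manifest)) {
  // Narrowed to RedTeamManifest: red-team metrics and results are available.
  console.log(`defense rate: ${manifest.metrics.defense_rate}`);
} else if (isStressManifest(manifest)) {
  // Narrowed to StressManifest: latency percentiles are available.
  console.log(`p95 latency: ${manifest.metrics.p95_latency_ms} ms`);
} else if (isRunManifest(manifest)) {
  // Plain evaluation run: per-case results live in manifest.cases.
  console.log(`passed ${manifest.metrics.passed_cases}/${manifest.metrics.total_cases}`);
}

Note that isRunManifest keys off the absence of a type discriminant, so manifests written without a type field still narrow to RunManifest.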
package/src/evaluators/contains.test.ts
@@ -0,0 +1,58 @@
/**
 * Tests for ContainsEvaluator
 */

import { describe, expect, test } from 'bun:test';
import { ContainsEvaluator } from './contains';

describe('ContainsEvaluator', () => {
  const evaluator = new ContainsEvaluator();

  test('passes when all values present (mode: all)', async () => {
    const result = await evaluator.evaluate('The colors are red, blue, and yellow.', {
      type: 'contains',
      values: ['red', 'blue', 'yellow'],
      mode: 'all',
    });
    expect(result.passed).toBe(true);
    expect(result.score).toBe(1);
  });

  test('fails when not all values present (mode: all)', async () => {
    const result = await evaluator.evaluate('The colors are red and blue.', {
      type: 'contains',
      values: ['red', 'blue', 'yellow'],
      mode: 'all',
    });
    expect(result.passed).toBe(false);
    expect(result.score).toBeCloseTo(0.67, 1);
  });

  test('passes when any value present (mode: any)', async () => {
    const result = await evaluator.evaluate('I like red.', {
      type: 'contains',
      values: ['red', 'blue', 'yellow'],
      mode: 'any',
    });
    expect(result.passed).toBe(true);
  });

  test('fails when no values present (mode: any)', async () => {
    const result = await evaluator.evaluate('I like green.', {
      type: 'contains',
      values: ['red', 'blue', 'yellow'],
      mode: 'any',
    });
    expect(result.passed).toBe(false);
    expect(result.score).toBe(0);
  });

  test('is case insensitive', async () => {
    const result = await evaluator.evaluate('RED BLUE YELLOW', {
      type: 'contains',
      values: ['red', 'blue', 'yellow'],
      mode: 'all',
    });
    expect(result.passed).toBe(true);
  });
});
package/src/evaluators/contains.ts
@@ -0,0 +1,41 @@
/**
 * Contains evaluator - checks if response contains specific values
 */

import type { Expected } from '../scenario/schema';
import type { Evaluator, EvaluatorResult } from './types';

export class ContainsEvaluator implements Evaluator {
  readonly type = 'contains';

  async evaluate(response: string, expected: Expected): Promise<EvaluatorResult> {
    if (expected.type !== 'contains') {
      throw new Error('Invalid expected type for ContainsEvaluator');
    }

    const normalizedResponse = response.toLowerCase();
    const results = expected.values.map((value) => ({
      value,
      found: normalizedResponse.includes(value.toLowerCase()),
    }));

    const foundCount = results.filter((r) => r.found).length;
    const passed = expected.mode === 'all' ? foundCount === expected.values.length : foundCount > 0;

    const score = expected.values.length > 0 ? foundCount / expected.values.length : 1;

    return {
      passed,
      score,
      reason: passed
        ? `Found ${foundCount}/${expected.values.length} values (mode: ${expected.mode})`
        : `Missing required values (mode: ${expected.mode})`,
      details: {
        mode: expected.mode,
        results,
        foundCount,
        totalCount: expected.values.length,
      },
    };
  }
}
package/src/evaluators/exact.test.ts
@@ -0,0 +1,48 @@
/**
 * Tests for ExactEvaluator
 */

import { describe, expect, test } from 'bun:test';
import { ExactEvaluator } from './exact';

describe('ExactEvaluator', () => {
  const evaluator = new ExactEvaluator();

  test('passes on exact match', async () => {
    const result = await evaluator.evaluate('hello world', {
      type: 'exact',
      value: 'hello world',
      caseSensitive: true,
    });
    expect(result.passed).toBe(true);
    expect(result.score).toBe(1);
  });

  test('fails on mismatch', async () => {
    const result = await evaluator.evaluate('hello world', {
      type: 'exact',
      value: 'goodbye world',
      caseSensitive: true,
    });
    expect(result.passed).toBe(false);
    expect(result.score).toBe(0);
  });

  test('handles case insensitive matching', async () => {
    const result = await evaluator.evaluate('Hello World', {
      type: 'exact',
      value: 'hello world',
      caseSensitive: false,
    });
    expect(result.passed).toBe(true);
  });

  test('trims whitespace', async () => {
    const result = await evaluator.evaluate(' hello world ', {
      type: 'exact',
      value: 'hello world',
      caseSensitive: true,
    });
    expect(result.passed).toBe(true);
  });
});
package/src/evaluators/exact.ts
@@ -0,0 +1,33 @@
/**
 * Exact match evaluator
 */

import type { Expected } from '../scenario/schema';
import type { Evaluator, EvaluatorResult } from './types';

export class ExactEvaluator implements Evaluator {
  readonly type = 'exact';

  async evaluate(response: string, expected: Expected): Promise<EvaluatorResult> {
    if (expected.type !== 'exact') {
      throw new Error('Invalid expected type for ExactEvaluator');
    }

    const normalize = (s: string) => (expected.caseSensitive ? s.trim() : s.trim().toLowerCase());

    const passed = normalize(response) === normalize(expected.value);

    return {
      passed,
      score: passed ? 1 : 0,
      reason: passed
        ? 'Exact match'
        : `Expected "${expected.value}", got "${response.slice(0, 100)}${response.length > 100 ? '...' : ''}"`,
      details: {
        expected: expected.value,
        actual: response,
        caseSensitive: expected.caseSensitive,
      },
    };
  }
}
package/src/evaluators/fuzzy.test.ts
@@ -0,0 +1,50 @@
/**
 * Tests for FuzzyEvaluator
 */

import { describe, expect, test } from 'bun:test';
import { FuzzyEvaluator } from './fuzzy';

describe('FuzzyEvaluator', () => {
  const evaluator = new FuzzyEvaluator();

  test('passes on exact match', async () => {
    const result = await evaluator.evaluate('hello world', {
      type: 'fuzzy',
      value: 'hello world',
      threshold: 0.8,
    });
    expect(result.passed).toBe(true);
    expect(result.score).toBe(1);
  });

  test('passes on similar text above threshold', async () => {
    const result = await evaluator.evaluate('helo world', {
      type: 'fuzzy',
      value: 'hello world',
      threshold: 0.8,
    });
    expect(result.passed).toBe(true);
    expect(result.score).toBeGreaterThan(0.8);
  });

  test('fails on dissimilar text below threshold', async () => {
    const result = await evaluator.evaluate('goodbye universe', {
      type: 'fuzzy',
      value: 'hello world',
      threshold: 0.8,
    });
    expect(result.passed).toBe(false);
    expect(result.score).toBeLessThan(0.8);
  });

  test('is case insensitive', async () => {
    const result = await evaluator.evaluate('HELLO WORLD', {
      type: 'fuzzy',
      value: 'hello world',
      threshold: 0.8,
    });
    expect(result.passed).toBe(true);
    expect(result.score).toBe(1);
  });
});
package/src/evaluators/fuzzy.ts
@@ -0,0 +1,39 @@
/**
 * Fuzzy match evaluator using Levenshtein distance
 */

import { distance } from 'fastest-levenshtein';
import type { Expected } from '../scenario/schema';
import type { Evaluator, EvaluatorResult } from './types';

export class FuzzyEvaluator implements Evaluator {
  readonly type = 'fuzzy';

  async evaluate(response: string, expected: Expected): Promise<EvaluatorResult> {
    if (expected.type !== 'fuzzy') {
      throw new Error('Invalid expected type for FuzzyEvaluator');
    }

    const normalizedResponse = response.trim().toLowerCase();
    const normalizedExpected = expected.value.trim().toLowerCase();

    const maxLen = Math.max(normalizedResponse.length, normalizedExpected.length);
    const dist = distance(normalizedResponse, normalizedExpected);
    const similarity = maxLen > 0 ? 1 - dist / maxLen : 1;

    const passed = similarity >= expected.threshold;

    return {
      passed,
      score: similarity,
      reason: `Similarity: ${(similarity * 100).toFixed(1)}% (threshold: ${(expected.threshold * 100).toFixed(1)}%)`,
      details: {
        levenshteinDistance: dist,
        similarity,
        threshold: expected.threshold,
        expected: expected.value,
        actual: response,
      },
    };
  }
}
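The fuzzy score above can be checked by hand against the 'helo world' test case: the Levenshtein distance to 'hello world' is 1 and the longer string is 11 characters, so the similarity is 1 - 1/11, roughly 0.909, which clears the 0.8 threshold used in the tests. A standalone sketch of the same computation, using only the fastest-levenshtein dependency that fuzzy.ts already imports:

import { distance } from 'fastest-levenshtein';

// Same normalization and scoring as FuzzyEvaluator.evaluate().
const response = 'helo world'.trim().toLowerCase();
const expected = 'hello world'.trim().toLowerCase();

const maxLen = Math.max(response.length, expected.length); // 11
const dist = distance(response, expected);                 // 1 (one missing 'l')
const similarity = maxLen > 0 ? 1 - dist / maxLen : 1;     // ~0.909

console.log(similarity >= 0.8); // true: passes at the 0.8 threshold used in the tests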