@artemiskit/sdk 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,450 @@
1
+ /**
2
+ * @artemiskit/sdk
3
+ * Tests for custom matchers
4
+ */
5
+
6
+ import { describe, expect, it } from 'vitest';
7
+ import {
8
+ toAchieveRPS,
9
+ toHaveDefenseRate,
10
+ toHaveMedianLatencyBelow,
11
+ toHaveNoCriticalVulnerabilities,
12
+ toHaveNoHighSeverityVulnerabilities,
13
+ toHaveP95LatencyBelow,
14
+ toHaveStressP95LatencyBelow,
15
+ toHaveStressSuccessRate,
16
+ toHaveSuccessRate,
17
+ toPassAllCases,
18
+ toPassCasesWithTag,
19
+ toPassRedTeam,
20
+ toPassStressTest,
21
+ } from '../matchers/core';
22
+ import type { RedTeamResult, RunResult, StressResult } from '../types';
23
+
24
/**
 * Builds a mock `RunResult` for the run-result matcher tests.
 *
 * When `overrides.cases` is non-empty it is used verbatim for both
 * `cases` and `manifest.cases`; otherwise `passedCases + failedCases`
 * synthetic cases are generated, with the first `passedCases` marked `ok`.
 *
 * The pass/fail counts in `manifest.metrics` default to what the supplied
 * `cases` array contains, so the metrics never contradict the cases of the
 * same fixture. Explicit `passedCases` / `failedCases` overrides always win.
 * With no explicit cases the defaults are 10 passed and 0 failed.
 *
 * @param overrides - Optional field overrides; anything omitted falls back
 *   to a passing, low-latency run.
 * @returns A mock object cast to `RunResult`.
 */
function createMockRunResult(
  overrides: Partial<{
    success: boolean;
    passedCases: number;
    failedCases: number;
    successRate: number;
    medianLatency: number;
    p95Latency: number;
    cases: Array<{
      id: string;
      name: string;
      ok: boolean;
      tags: string[];
      reason?: string;
    }>;
  }> = {}
): RunResult {
  const {
    success = true,
    successRate = 1,
    medianLatency = 100,
    p95Latency = 150,
    cases = [],
  } = overrides;

  // Derive the default counts from the explicit cases (when given) so that
  // metrics.total_cases / passed_cases / failed_cases agree with `cases`.
  const hasExplicitCases = cases.length > 0;
  const explicitPassed = cases.filter((c) => c.ok).length;
  const passedCases = overrides.passedCases ?? (hasExplicitCases ? explicitPassed : 10);
  const failedCases =
    overrides.failedCases ?? (hasExplicitCases ? cases.length - explicitPassed : 0);

  const totalCases = passedCases + failedCases;
  const defaultCases = hasExplicitCases
    ? cases
    : Array.from({ length: totalCases }, (_, i) => ({
        id: `case-${i}`,
        name: `Case ${i}`,
        ok: i < passedCases,
        score: i < passedCases ? 1 : 0,
        matcherType: 'contains',
        reason: i < passedCases ? 'Passed' : 'Failed',
        latencyMs: 100,
        tokens: { prompt: 10, completion: 5, total: 15 },
        prompt: 'test prompt',
        response: 'test response',
        expected: { type: 'contains', values: ['test'] },
        tags: [],
      }));

  return {
    success,
    // biome-ignore lint/suspicious/noExplicitAny: Test helper
    cases: defaultCases as any,
    manifest: {
      version: '1.0',
      run_id: 'test-run',
      project: 'test',
      start_time: new Date().toISOString(),
      end_time: new Date().toISOString(),
      duration_ms: 1000,
      config: {
        scenario: 'test',
        provider: 'mock',
        model: 'mock-model',
      },
      metrics: {
        success_rate: successRate,
        total_cases: totalCases,
        passed_cases: passedCases,
        failed_cases: failedCases,
        median_latency_ms: medianLatency,
        p95_latency_ms: p95Latency,
        total_tokens: 150,
        total_prompt_tokens: 100,
        total_completion_tokens: 50,
      },
      git: { commit: 'abc', branch: 'main', dirty: false },
      provenance: { run_by: 'test' },
      // biome-ignore lint/suspicious/noExplicitAny: Test helper
      cases: defaultCases as any,
      environment: { node_version: 'v18', platform: 'linux', arch: 'x64' },
    },
  };
}
106
+
107
/**
 * Produces a `RedTeamResult` fixture for the red-team matcher tests.
 *
 * Only the five fields in `overrides` are configurable; every other value
 * in the manifest is a fixed constant. `defenseRate` and `unsafeCount` are
 * written both at the top level and into `manifest.metrics`, while
 * `highCount` / `criticalCount` populate `manifest.metrics.by_severity`.
 *
 * @param overrides - Optional overrides; defaults describe a passing run
 *   (98% defense rate, one unsafe response, no high/critical findings).
 * @returns The assembled mock result.
 */
function createMockRedTeamResult(
  overrides: Partial<{
    success: boolean;
    defenseRate: number;
    unsafeCount: number;
    criticalCount: number;
    highCount: number;
  }> = {}
): RedTeamResult {
  const {
    success: succeeded = true,
    defenseRate: rate = 0.98,
    unsafeCount: unsafe = 1,
    criticalCount: critical = 0,
    highCount: high = 0,
  } = overrides;

  // Purely numeric pieces are assembled up front; string-literal fields stay
  // inline in the returned object below.
  const severityBreakdown = {
    low: 0,
    medium: 0,
    high,
    critical,
  };
  const metrics = {
    total_tests: 50,
    safe_responses: 45,
    blocked_responses: 4,
    unsafe_responses: unsafe,
    error_responses: 0,
    defended: 49,
    defense_rate: rate,
    by_severity: severityBreakdown,
  };

  const startedAt = new Date().toISOString();
  const finishedAt = new Date().toISOString();

  return {
    success: succeeded,
    defenseRate: rate,
    unsafeCount: unsafe,
    manifest: {
      version: '1.0',
      type: 'redteam',
      run_id: 'test-run',
      project: 'test',
      start_time: startedAt,
      end_time: finishedAt,
      duration_ms: 1000,
      config: {
        scenario: 'test',
        provider: 'mock',
        model: 'mock-model',
        mutations: ['jailbreak'],
        count_per_case: 1,
      },
      metrics,
      git: { commit: 'abc', branch: 'main', dirty: false },
      provenance: { run_by: 'test' },
      results: [],
      environment: { node_version: 'v18', platform: 'linux', arch: 'x64' },
    },
  };
}
166
+
167
/**
 * Builds a mock `StressResult` for the stress-test matcher tests.
 *
 * `successRate`, `rps` and `p95LatencyMs` are written both at the top level
 * and into `manifest.metrics`. The request counts are derived from a fixed
 * total of 1500 requests.
 *
 * @param overrides - Optional overrides; defaults describe a passing run
 *   (99% success, 50 RPS, 200 ms P95 latency).
 * @returns The assembled mock result.
 */
function createMockStressResult(
  overrides: Partial<{
    success: boolean;
    successRate: number;
    rps: number;
    p95LatencyMs: number;
  }> = {}
): StressResult {
  const { success = true, successRate = 0.99, rps = 50, p95LatencyMs = 200 } = overrides;

  const totalRequests = 1500;
  // Round the successes once and take the failures as the remainder, so the
  // two counts always add up to `totalRequests`. Flooring each product
  // separately drops a request to floating-point error: 1500 * (1 - 0.8)
  // evaluates to 299.99999999999994, which floors to 299.
  const successfulRequests = Math.round(totalRequests * successRate);
  const failedRequests = totalRequests - successfulRequests;

  return {
    success,
    successRate,
    rps,
    p95LatencyMs,
    manifest: {
      version: '1.0',
      type: 'stress',
      run_id: 'test-run',
      project: 'test',
      start_time: new Date().toISOString(),
      end_time: new Date().toISOString(),
      duration_ms: 30000,
      config: {
        scenario: 'test',
        provider: 'mock',
        model: 'mock-model',
        concurrency: 10,
        duration_seconds: 30,
        ramp_up_seconds: 5,
      },
      metrics: {
        total_requests: totalRequests,
        successful_requests: successfulRequests,
        failed_requests: failedRequests,
        success_rate: successRate,
        requests_per_second: rps,
        min_latency_ms: 50,
        max_latency_ms: 500,
        avg_latency_ms: 150,
        p50_latency_ms: 140,
        p90_latency_ms: 180,
        p95_latency_ms: p95LatencyMs,
        p99_latency_ms: 300,
      },
      git: { commit: 'abc', branch: 'main', dirty: false },
      provenance: { run_by: 'test' },
      sample_results: [],
      environment: { node_version: 'v18', platform: 'linux', arch: 'x64' },
    },
  };
}
220
+
221
// Suite for the matchers that take a RunResult. Each matcher is called
// directly as a plain function and its returned `pass` flag (and, for some
// failures, its `message()` text) is checked.
describe('Run Result Matchers', () => {
  describe('toPassAllCases', () => {
    it('should pass when all cases pass', () => {
      const allGreen = createMockRunResult({ success: true, passedCases: 10, failedCases: 0 });
      const outcome = toPassAllCases(allGreen);
      expect(outcome.pass).toBe(true);
    });

    it('should fail when some cases fail', () => {
      const twoRed = createMockRunResult({ success: false, passedCases: 8, failedCases: 2 });
      const outcome = toPassAllCases(twoRed);
      expect(outcome.pass).toBe(false);
      expect(outcome.message()).toContain('2 out of 10 cases failed');
    });
  });

  describe('toHaveSuccessRate', () => {
    it('should pass when success rate meets threshold', () => {
      const outcome = toHaveSuccessRate(createMockRunResult({ successRate: 0.95 }), 0.9);
      expect(outcome.pass).toBe(true);
    });

    it('should fail when success rate is below threshold', () => {
      const outcome = toHaveSuccessRate(createMockRunResult({ successRate: 0.85 }), 0.9);
      expect(outcome.pass).toBe(false);
      // The failure text is expected to mention both the actual rate and the threshold.
      expect(outcome.message()).toContain('85.0%');
      expect(outcome.message()).toContain('90.0%');
    });
  });

  describe('toPassCasesWithTag', () => {
    it('should pass when all tagged cases pass', () => {
      // The only failing case carries a different tag than the one queried.
      const taggedCases = [
        { id: '1', name: 'Case 1', ok: true, tags: ['important'] },
        { id: '2', name: 'Case 2', ok: true, tags: ['important'] },
        { id: '3', name: 'Case 3', ok: false, tags: ['other'] },
      ];
      const outcome = toPassCasesWithTag(createMockRunResult({ cases: taggedCases }), 'important');
      expect(outcome.pass).toBe(true);
    });

    it('should fail when some tagged cases fail', () => {
      const taggedCases = [
        { id: '1', name: 'Case 1', ok: true, tags: ['important'] },
        { id: '2', name: 'Case 2', ok: false, tags: ['important'], reason: 'Failed assertion' },
      ];
      const outcome = toPassCasesWithTag(createMockRunResult({ cases: taggedCases }), 'important');
      expect(outcome.pass).toBe(false);
      expect(outcome.message()).toContain('1 out of 2 failed');
    });
  });

  describe('toHaveMedianLatencyBelow', () => {
    it('should pass when median latency is within threshold', () => {
      const fastRun = createMockRunResult({ medianLatency: 100 });
      expect(toHaveMedianLatencyBelow(fastRun, 200).pass).toBe(true);
    });

    it('should fail when median latency exceeds threshold', () => {
      const slowRun = createMockRunResult({ medianLatency: 300 });
      const outcome = toHaveMedianLatencyBelow(slowRun, 200);
      expect(outcome.pass).toBe(false);
      expect(outcome.message()).toContain('300ms');
    });
  });

  describe('toHaveP95LatencyBelow', () => {
    it('should pass when P95 latency is within threshold', () => {
      const fastRun = createMockRunResult({ p95Latency: 150 });
      expect(toHaveP95LatencyBelow(fastRun, 200).pass).toBe(true);
    });

    it('should fail when P95 latency exceeds threshold', () => {
      const slowRun = createMockRunResult({ p95Latency: 250 });
      expect(toHaveP95LatencyBelow(slowRun, 200).pass).toBe(false);
    });
  });
});
322
+
323
// Suite for the matchers that take a RedTeamResult. Fixtures come from
// createMockRedTeamResult; each matcher is invoked directly and its `pass`
// flag and failure message are inspected.
describe('Red Team Result Matchers', () => {
  describe('toHaveDefenseRate', () => {
    it('should pass when defense rate meets threshold', () => {
      const wellDefended = createMockRedTeamResult({ defenseRate: 0.98 });
      expect(toHaveDefenseRate(wellDefended, 0.95).pass).toBe(true);
    });

    it('should fail when defense rate is below threshold', () => {
      const weaklyDefended = createMockRedTeamResult({ defenseRate: 0.85 });
      const outcome = toHaveDefenseRate(weaklyDefended, 0.95);
      expect(outcome.pass).toBe(false);
      expect(outcome.message()).toContain('85.0%');
    });
  });

  describe('toHaveNoCriticalVulnerabilities', () => {
    it('should pass when no critical vulnerabilities', () => {
      const clean = createMockRedTeamResult({ criticalCount: 0 });
      expect(toHaveNoCriticalVulnerabilities(clean).pass).toBe(true);
    });

    it('should fail when critical vulnerabilities exist', () => {
      const compromised = createMockRedTeamResult({ criticalCount: 2 });
      const outcome = toHaveNoCriticalVulnerabilities(compromised);
      expect(outcome.pass).toBe(false);
      expect(outcome.message()).toContain('2');
    });
  });

  describe('toHaveNoHighSeverityVulnerabilities', () => {
    it('should pass when no high/critical vulnerabilities', () => {
      const clean = createMockRedTeamResult({ highCount: 0, criticalCount: 0 });
      expect(toHaveNoHighSeverityVulnerabilities(clean).pass).toBe(true);
    });

    it('should fail when high vulnerabilities exist', () => {
      const compromised = createMockRedTeamResult({ highCount: 3, criticalCount: 1 });
      const outcome = toHaveNoHighSeverityVulnerabilities(compromised);
      expect(outcome.pass).toBe(false);
      // 3 high + 1 critical: the message is expected to contain '4' and '3 high'.
      expect(outcome.message()).toContain('4');
      expect(outcome.message()).toContain('3 high');
    });
  });

  describe('toPassRedTeam', () => {
    it('should pass when red team test passes', () => {
      const passing = createMockRedTeamResult({ success: true });
      expect(toPassRedTeam(passing).pass).toBe(true);
    });

    it('should fail when red team test fails', () => {
      const failing = createMockRedTeamResult({ success: false, defenseRate: 0.7 });
      const outcome = toPassRedTeam(failing);
      expect(outcome.pass).toBe(false);
      expect(outcome.message()).toContain('70.0%');
    });
  });
});
391
+
392
// Suite for the matchers that take a StressResult. Fixtures come from
// createMockStressResult; each matcher is invoked directly and its `pass`
// flag (plus the failure message where asserted) is inspected.
describe('Stress Test Result Matchers', () => {
  describe('toHaveStressSuccessRate', () => {
    it('should pass when success rate meets threshold', () => {
      const healthy = createMockStressResult({ successRate: 0.99 });
      expect(toHaveStressSuccessRate(healthy, 0.95).pass).toBe(true);
    });

    it('should fail when success rate is below threshold', () => {
      const flaky = createMockStressResult({ successRate: 0.85 });
      expect(toHaveStressSuccessRate(flaky, 0.95).pass).toBe(false);
    });
  });

  describe('toAchieveRPS', () => {
    it('should pass when RPS meets target', () => {
      const highThroughput = createMockStressResult({ rps: 100 });
      expect(toAchieveRPS(highThroughput, 50).pass).toBe(true);
    });

    it('should fail when RPS is below target', () => {
      const lowThroughput = createMockStressResult({ rps: 30 });
      const outcome = toAchieveRPS(lowThroughput, 50);
      expect(outcome.pass).toBe(false);
      expect(outcome.message()).toContain('30.0 RPS');
    });
  });

  describe('toHaveStressP95LatencyBelow', () => {
    it('should pass when P95 latency is within threshold', () => {
      const quick = createMockStressResult({ p95LatencyMs: 150 });
      expect(toHaveStressP95LatencyBelow(quick, 200).pass).toBe(true);
    });

    it('should fail when P95 latency exceeds threshold', () => {
      const sluggish = createMockStressResult({ p95LatencyMs: 300 });
      expect(toHaveStressP95LatencyBelow(sluggish, 200).pass).toBe(false);
    });
  });

  describe('toPassStressTest', () => {
    it('should pass when stress test passes', () => {
      const passing = createMockStressResult({ success: true });
      expect(toPassStressTest(passing).pass).toBe(true);
    });

    it('should fail when stress test fails', () => {
      const failing = createMockStressResult({ success: false, successRate: 0.8 });
      const outcome = toPassStressTest(failing);
      expect(outcome.pass).toBe(false);
      expect(outcome.message()).toContain('80.0%');
    });
  });
});