@artemiskit/core 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. package/CHANGELOG.md +48 -0
  2. package/dist/adapters/factory.d.ts +23 -0
  3. package/dist/adapters/factory.d.ts.map +1 -0
  4. package/dist/adapters/index.d.ts +7 -0
  5. package/dist/adapters/index.d.ts.map +1 -0
  6. package/dist/adapters/registry.d.ts +56 -0
  7. package/dist/adapters/registry.d.ts.map +1 -0
  8. package/dist/adapters/types.d.ts +151 -0
  9. package/dist/adapters/types.d.ts.map +1 -0
  10. package/dist/artifacts/index.d.ts +6 -0
  11. package/dist/artifacts/index.d.ts.map +1 -0
  12. package/dist/artifacts/manifest.d.ts +19 -0
  13. package/dist/artifacts/manifest.d.ts.map +1 -0
  14. package/dist/artifacts/types.d.ts +368 -0
  15. package/dist/artifacts/types.d.ts.map +1 -0
  16. package/dist/evaluators/contains.d.ts +10 -0
  17. package/dist/evaluators/contains.d.ts.map +1 -0
  18. package/dist/evaluators/exact.d.ts +10 -0
  19. package/dist/evaluators/exact.d.ts.map +1 -0
  20. package/dist/evaluators/fuzzy.d.ts +10 -0
  21. package/dist/evaluators/fuzzy.d.ts.map +1 -0
  22. package/dist/evaluators/index.d.ts +24 -0
  23. package/dist/evaluators/index.d.ts.map +1 -0
  24. package/dist/evaluators/json-schema.d.ts +11 -0
  25. package/dist/evaluators/json-schema.d.ts.map +1 -0
  26. package/dist/evaluators/llm-grader.d.ts +11 -0
  27. package/dist/evaluators/llm-grader.d.ts.map +1 -0
  28. package/dist/evaluators/regex.d.ts +10 -0
  29. package/dist/evaluators/regex.d.ts.map +1 -0
  30. package/dist/evaluators/types.d.ts +29 -0
  31. package/dist/evaluators/types.d.ts.map +1 -0
  32. package/dist/index.d.ts +14 -0
  33. package/dist/index.d.ts.map +1 -0
  34. package/dist/index.js +26021 -0
  35. package/dist/provenance/environment.d.ts +12 -0
  36. package/dist/provenance/environment.d.ts.map +1 -0
  37. package/dist/provenance/git.d.ts +9 -0
  38. package/dist/provenance/git.d.ts.map +1 -0
  39. package/dist/provenance/index.d.ts +6 -0
  40. package/dist/provenance/index.d.ts.map +1 -0
  41. package/dist/redaction/index.d.ts +3 -0
  42. package/dist/redaction/index.d.ts.map +1 -0
  43. package/dist/redaction/redactor.d.ts +79 -0
  44. package/dist/redaction/redactor.d.ts.map +1 -0
  45. package/dist/redaction/types.d.ts +120 -0
  46. package/dist/redaction/types.d.ts.map +1 -0
  47. package/dist/runner/executor.d.ts +11 -0
  48. package/dist/runner/executor.d.ts.map +1 -0
  49. package/dist/runner/index.d.ts +7 -0
  50. package/dist/runner/index.d.ts.map +1 -0
  51. package/dist/runner/runner.d.ts +13 -0
  52. package/dist/runner/runner.d.ts.map +1 -0
  53. package/dist/runner/types.d.ts +57 -0
  54. package/dist/runner/types.d.ts.map +1 -0
  55. package/dist/scenario/index.d.ts +7 -0
  56. package/dist/scenario/index.d.ts.map +1 -0
  57. package/dist/scenario/parser.d.ts +17 -0
  58. package/dist/scenario/parser.d.ts.map +1 -0
  59. package/dist/scenario/schema.d.ts +945 -0
  60. package/dist/scenario/schema.d.ts.map +1 -0
  61. package/dist/scenario/variables.d.ts +19 -0
  62. package/dist/scenario/variables.d.ts.map +1 -0
  63. package/dist/storage/factory.d.ts +13 -0
  64. package/dist/storage/factory.d.ts.map +1 -0
  65. package/dist/storage/index.d.ts +8 -0
  66. package/dist/storage/index.d.ts.map +1 -0
  67. package/dist/storage/local.d.ts +20 -0
  68. package/dist/storage/local.d.ts.map +1 -0
  69. package/dist/storage/supabase.d.ts +21 -0
  70. package/dist/storage/supabase.d.ts.map +1 -0
  71. package/dist/storage/types.d.ts +86 -0
  72. package/dist/storage/types.d.ts.map +1 -0
  73. package/dist/utils/errors.d.ts +25 -0
  74. package/dist/utils/errors.d.ts.map +1 -0
  75. package/dist/utils/index.d.ts +6 -0
  76. package/dist/utils/index.d.ts.map +1 -0
  77. package/dist/utils/logger.d.ts +21 -0
  78. package/dist/utils/logger.d.ts.map +1 -0
  79. package/package.json +56 -0
  80. package/src/adapters/factory.ts +75 -0
  81. package/src/adapters/index.ts +7 -0
  82. package/src/adapters/registry.ts +143 -0
  83. package/src/adapters/types.ts +184 -0
  84. package/src/artifacts/index.ts +6 -0
  85. package/src/artifacts/manifest.test.ts +206 -0
  86. package/src/artifacts/manifest.ts +136 -0
  87. package/src/artifacts/types.ts +426 -0
  88. package/src/evaluators/contains.test.ts +58 -0
  89. package/src/evaluators/contains.ts +41 -0
  90. package/src/evaluators/exact.test.ts +48 -0
  91. package/src/evaluators/exact.ts +33 -0
  92. package/src/evaluators/fuzzy.test.ts +50 -0
  93. package/src/evaluators/fuzzy.ts +39 -0
  94. package/src/evaluators/index.ts +53 -0
  95. package/src/evaluators/json-schema.ts +98 -0
  96. package/src/evaluators/llm-grader.ts +100 -0
  97. package/src/evaluators/regex.test.ts +73 -0
  98. package/src/evaluators/regex.ts +43 -0
  99. package/src/evaluators/types.ts +37 -0
  100. package/src/index.ts +31 -0
  101. package/src/provenance/environment.ts +18 -0
  102. package/src/provenance/git.ts +48 -0
  103. package/src/provenance/index.ts +6 -0
  104. package/src/redaction/index.ts +23 -0
  105. package/src/redaction/redactor.test.ts +258 -0
  106. package/src/redaction/redactor.ts +246 -0
  107. package/src/redaction/types.ts +135 -0
  108. package/src/runner/executor.ts +251 -0
  109. package/src/runner/index.ts +7 -0
  110. package/src/runner/runner.ts +153 -0
  111. package/src/runner/types.ts +60 -0
  112. package/src/scenario/index.ts +7 -0
  113. package/src/scenario/parser.test.ts +99 -0
  114. package/src/scenario/parser.ts +108 -0
  115. package/src/scenario/schema.ts +176 -0
  116. package/src/scenario/variables.test.ts +150 -0
  117. package/src/scenario/variables.ts +60 -0
  118. package/src/storage/factory.ts +52 -0
  119. package/src/storage/index.ts +8 -0
  120. package/src/storage/local.test.ts +165 -0
  121. package/src/storage/local.ts +194 -0
  122. package/src/storage/supabase.ts +151 -0
  123. package/src/storage/types.ts +98 -0
  124. package/src/utils/errors.ts +76 -0
  125. package/src/utils/index.ts +6 -0
  126. package/src/utils/logger.ts +59 -0
  127. package/tsconfig.json +13 -0
@@ -0,0 +1,426 @@
1
+ /**
2
+ * Artifact types - run manifests and related structures
3
+ */
4
+
5
+ // ============================================================================
6
+ // Redaction Types
7
+ // ============================================================================
8
+
9
+ /**
10
+ * Redaction details for a single case result.
+ *
+ * Carried by the optional `redaction` field of both CaseResult and
+ * RedTeamCaseResult. All four fields are required.
11
+ */
12
+ export interface CaseRedactionInfo {
13
+ /** Whether this case had redaction applied */
14
+ redacted: boolean;
15
+ /** Whether prompt was redacted */
16
+ promptRedacted: boolean;
17
+ /** Whether response was redacted */
18
+ responseRedacted: boolean;
19
+ /** Number of redactions in this case */
20
+ redactionCount: number;
21
+ }
22
+
23
+ /**
24
+ * Redaction metadata for a manifest.
+ *
+ * Carried by the optional `redaction` field of RunManifest, RedTeamManifest
+ * and StressManifest. All fields, including the three `summary` counters,
+ * are required.
25
+ */
26
+ export interface ManifestRedactionInfo {
27
+ /** Whether redaction was enabled */
28
+ enabled: boolean;
29
+ /** Pattern names used (not actual regex for security) */
30
+ patternsUsed: string[];
31
+ /** Replacement string used */
32
+ replacement: string;
33
+ /** Summary of redactions */
34
+ summary: {
35
+ promptsRedacted: number;
36
+ responsesRedacted: number;
37
+ totalRedactions: number;
38
+ };
39
+ }
40
+
41
+ // ============================================================================
42
+ // Case Result Types
43
+ // ============================================================================
44
+
45
+ /**
46
+ * Individual test case result.
+ *
+ * Element type of `RunManifest.cases`. `name`, `reason`, `error` and
+ * `redaction` are optional; every other field is required.
+ * `prompt` may be either a plain string or an object, while `response`
+ * is always a string.
47
+ */
48
+ export interface CaseResult {
49
+ id: string;
50
+ name?: string;
51
+ ok: boolean;
52
+ score: number;
53
+ matcherType: string;
54
+ reason?: string;
55
+ latencyMs: number;
56
+ tokens: {
57
+ prompt: number;
58
+ completion: number;
59
+ total: number;
60
+ };
61
+ prompt: string | object;
62
+ response: string;
63
+ expected: object;
64
+ tags: string[];
65
+ error?: string;
66
+ /** Redaction information for this case */
67
+ redaction?: CaseRedactionInfo;
68
+ }
69
+
70
+ /**
71
+ * Run metrics.
+ *
+ * Type of the `metrics` field of RunManifest. Every field is a required number.
+ * NOTE(review): the scale of `success_rate` is not stated here; the sibling
+ * StressMetrics documents its own `success_rate` as 0-1 - confirm this one matches.
72
+ */
73
+ export interface RunMetrics {
74
+ success_rate: number;
75
+ total_cases: number;
76
+ passed_cases: number;
77
+ failed_cases: number;
78
+ median_latency_ms: number;
79
+ p95_latency_ms: number;
80
+ total_tokens: number;
81
+ total_prompt_tokens: number;
82
+ total_completion_tokens: number;
83
+ }
84
+
85
+ /**
86
+ * Git provenance information.
+ *
+ * Type of the `git` field on RunManifest, RedTeamManifest and StressManifest.
+ * `remote` is the only optional field.
87
+ */
88
+ export interface GitInfo {
89
+ commit: string;
90
+ branch: string;
91
+ dirty: boolean;
92
+ remote?: string;
93
+ }
94
+
95
+ /**
96
+ * Run provenance information.
+ *
+ * Type of the `provenance` field on RunManifest, RedTeamManifest and
+ * StressManifest. Only `run_by` is required. When the optional `ci` object
+ * is present, its `provider` and `build_id` are required and `build_url`
+ * is optional.
97
+ */
98
+ export interface ProvenanceInfo {
99
+ run_by: string;
100
+ run_reason?: string;
101
+ ci?: {
102
+ provider: string;
103
+ build_id: string;
104
+ build_url?: string;
105
+ };
106
+ }
107
+
108
+ /**
109
+ * Configuration source - where a value came from.
+ *
+ * Value type of every entry in `ResolvedConfig.source`.
+ * NOTE(review): no precedence between the five sources is defined by this
+ * type; any ordering is decided elsewhere.
110
+ */
111
+ export type ConfigSource = 'cli' | 'scenario' | 'config' | 'env' | 'default';
112
+
113
+ /**
114
+ * Resolved configuration with source tracking
115
+ * Captures exactly what was sent to the provider for reproducibility
+ *
+ * Only `provider` and `source` are required. Every other setting is
+ * optional, and every entry inside `source` is optional too; `source`
+ * has one ConfigSource slot per setting, under the same key name.
+ * Stored as the optional `resolved_config` field of RunManifest,
+ * RedTeamManifest and StressManifest.
116
+ */
117
+ export interface ResolvedConfig {
118
+ /** Provider used */
119
+ provider: string;
120
+ /** Model identifier passed to the API */
121
+ model?: string;
122
+
123
+ // OpenAI-specific
124
+ /** OpenAI organization ID */
125
+ organization?: string;
126
+ /** Base URL for API (custom endpoints) */
127
+ base_url?: string;
128
+
129
+ // Azure OpenAI-specific
130
+ /** Azure resource name */
131
+ resource_name?: string;
132
+ /** Azure deployment name */
133
+ deployment_name?: string;
134
+ /** Azure API version */
135
+ api_version?: string;
136
+
137
+ // Vercel AI-specific
138
+ /** Underlying provider for Vercel AI SDK */
139
+ underlying_provider?: string;
140
+
141
+ // Common settings
142
+ /** Request timeout in ms */
143
+ timeout?: number;
144
+ /** Max retries */
145
+ max_retries?: number;
146
+ /** Temperature setting */
147
+ temperature?: number;
148
+ /** Max tokens */
149
+ max_tokens?: number;
150
+
151
+ /** Source tracking - where each value came from */
152
+ source: {
153
+ provider?: ConfigSource;
154
+ model?: ConfigSource;
155
+ organization?: ConfigSource;
156
+ base_url?: ConfigSource;
157
+ resource_name?: ConfigSource;
158
+ deployment_name?: ConfigSource;
159
+ api_version?: ConfigSource;
160
+ underlying_provider?: ConfigSource;
161
+ timeout?: ConfigSource;
162
+ max_retries?: ConfigSource;
163
+ temperature?: ConfigSource;
164
+ max_tokens?: ConfigSource;
165
+ };
166
+ }
167
+
168
+ /**
169
+ * Run configuration (user-facing display).
+ *
+ * Type of `RunManifest.config`. Only `scenario` and `provider` are required.
+ * Note the camelCase `maxTokens` here, whereas ResolvedConfig spells the
+ * corresponding setting `max_tokens`.
170
+ */
171
+ export interface RunConfig {
172
+ scenario: string;
173
+ provider: string;
174
+ model?: string;
175
+ temperature?: number;
176
+ maxTokens?: number;
177
+ seed?: number;
178
+ }
179
+
180
+ /**
181
+ * Complete run manifest.
+ *
+ * Unlike RedTeamManifest and StressManifest, this shape declares no `type`
+ * field; isRunManifest identifies a run manifest by that field being
+ * absent or undefined. `resolved_config` and `redaction` are the only
+ * optional fields. `start_time` and `end_time` are typed as strings.
182
+ */
183
+ export interface RunManifest {
184
+ version: string;
185
+ run_id: string;
186
+ project: string;
187
+ start_time: string;
188
+ end_time: string;
189
+ duration_ms: number;
190
+ config: RunConfig;
191
+ /** Resolved configuration with full provider details and source tracking */
192
+ resolved_config?: ResolvedConfig;
193
+ metrics: RunMetrics;
194
+ git: GitInfo;
195
+ provenance: ProvenanceInfo;
196
+ cases: CaseResult[];
197
+ environment: {
198
+ node_version: string;
199
+ platform: string;
200
+ arch: string;
201
+ };
202
+ /** Redaction information for this run */
203
+ redaction?: ManifestRedactionInfo;
204
+ }
205
+
206
+ // ============================================================================
207
+ // Red Team Manifest Types
208
+ // ============================================================================
209
+
210
+ /**
211
+ * Red team test result status.
+ *
+ * Type of `RedTeamCaseResult.status`. The four literals line up with the
+ * safe / blocked / unsafe / error counters declared on RedTeamMetrics.
212
+ */
213
+ export type RedTeamStatus = 'safe' | 'unsafe' | 'blocked' | 'error';
214
+
215
+ /**
216
+ * Red team severity levels.
+ *
+ * Type of `RedTeamCaseResult.severity`. Includes 'none', which has no
+ * matching bucket in `RedTeamMetrics.by_severity` (that object only has
+ * low / medium / high / critical).
217
+ */
218
+ export type RedTeamSeverity = 'none' | 'low' | 'medium' | 'high' | 'critical';
219
+
220
+ /**
221
+ * Individual red team test result.
+ *
+ * Element type of `RedTeamManifest.results`. `latencyMs` and `redaction`
+ * are optional; every other field is required, including `severity`.
+ * NOTE(review): `mutation` is a single string although its comment reads
+ * "Applied mutations" - confirm how multiple mutations are represented.
222
+ */
223
+ export interface RedTeamCaseResult {
224
+ /** Original test case ID */
225
+ caseId: string;
226
+ /** Applied mutations */
227
+ mutation: string;
228
+ /** Mutated prompt sent to the model */
229
+ prompt: string;
230
+ /** Model response */
231
+ response: string;
232
+ /** Result status */
233
+ status: RedTeamStatus;
234
+ /** Severity if unsafe */
235
+ severity: RedTeamSeverity;
236
+ /** Reasons for the status */
237
+ reasons: string[];
238
+ /** Latency in milliseconds */
239
+ latencyMs?: number;
240
+ /** Redaction information for this case */
241
+ redaction?: CaseRedactionInfo;
242
+ }
243
+
244
+ /**
245
+ * Red team metrics.
+ *
+ * Type of `RedTeamManifest.metrics`. Every field is a required number.
+ * `by_severity` has no 'none' bucket even though RedTeamSeverity allows
+ * that value.
+ * NOTE(review): the exact denominator behind "testable results" in
+ * `defense_rate` is computed elsewhere - confirm whether errored tests
+ * are excluded.
246
+ */
247
+ export interface RedTeamMetrics {
248
+ /** Total number of tests run */
249
+ total_tests: number;
250
+ /** Tests where model responded safely */
251
+ safe_responses: number;
252
+ /** Tests blocked by provider content filter */
253
+ blocked_responses: number;
254
+ /** Tests with potentially unsafe responses */
255
+ unsafe_responses: number;
256
+ /** Tests that errored */
257
+ error_responses: number;
258
+ /** Total defended (safe + blocked) */
259
+ defended: number;
260
+ /** Defense rate (defended / testable results) */
261
+ defense_rate: number;
262
+ /** Breakdown by severity */
263
+ by_severity: {
264
+ low: number;
265
+ medium: number;
266
+ high: number;
267
+ critical: number;
268
+ };
269
+ }
270
+
271
+ /**
272
+ * Red team configuration.
+ *
+ * Type of `RedTeamManifest.config`. `model` is the only optional field;
+ * `mutations` is a list of strings and `count_per_case` is a number.
273
+ */
274
+ export interface RedTeamConfig {
275
+ scenario: string;
276
+ provider: string;
277
+ model?: string;
278
+ mutations: string[];
279
+ count_per_case: number;
280
+ }
281
+
282
+ /**
283
+ * Complete red team manifest.
+ *
+ * Discriminated by the literal `type: 'redteam'`, which is what
+ * isRedTeamManifest checks. Per-test outcomes live in `results`
+ * (RunManifest uses `cases` instead). `resolved_config` and `redaction`
+ * are the only optional fields.
284
+ */
285
+ export interface RedTeamManifest {
286
+ version: string;
287
+ type: 'redteam';
288
+ run_id: string;
289
+ project: string;
290
+ start_time: string;
291
+ end_time: string;
292
+ duration_ms: number;
293
+ config: RedTeamConfig;
294
+ /** Resolved configuration with full provider details and source tracking */
295
+ resolved_config?: ResolvedConfig;
296
+ metrics: RedTeamMetrics;
297
+ git: GitInfo;
298
+ provenance: ProvenanceInfo;
299
+ results: RedTeamCaseResult[];
300
+ environment: {
301
+ node_version: string;
302
+ platform: string;
303
+ arch: string;
304
+ };
305
+ /** Redaction information for this run */
306
+ redaction?: ManifestRedactionInfo;
307
+ }
308
+
309
+ // ============================================================================
310
+ // Stress Test Manifest Types
311
+ // ============================================================================
312
+
313
+ /**
314
+ * Individual stress test request result.
+ *
+ * Element type of `StressManifest.sample_results`. `error` is the only
+ * optional field.
+ * NOTE(review): `timestamp` is a bare number; its epoch and unit are not
+ * stated in this file - confirm against the producer.
315
+ */
316
+ export interface StressRequestResult {
317
+ /** Whether the request succeeded */
318
+ success: boolean;
319
+ /** Latency in milliseconds */
320
+ latencyMs: number;
321
+ /** Error message if failed */
322
+ error?: string;
323
+ /** Timestamp of the request */
324
+ timestamp: number;
325
+ }
326
+
327
+ /**
328
+ * Stress test metrics.
+ *
+ * Type of `StressManifest.metrics`. Every field is a required number:
+ * three request counters, a 0-1 success rate, throughput, and
+ * min / max / avg plus p50 / p90 / p95 / p99 latency figures.
329
+ */
330
+ export interface StressMetrics {
331
+ /** Total requests made */
332
+ total_requests: number;
333
+ /** Successful requests */
334
+ successful_requests: number;
335
+ /** Failed requests */
336
+ failed_requests: number;
337
+ /** Success rate (0-1) */
338
+ success_rate: number;
339
+ /** Requests per second */
340
+ requests_per_second: number;
341
+ /** Minimum latency in ms */
342
+ min_latency_ms: number;
343
+ /** Maximum latency in ms */
344
+ max_latency_ms: number;
345
+ /** Average latency in ms */
346
+ avg_latency_ms: number;
347
+ /** 50th percentile latency */
348
+ p50_latency_ms: number;
349
+ /** 90th percentile latency */
350
+ p90_latency_ms: number;
351
+ /** 95th percentile latency */
352
+ p95_latency_ms: number;
353
+ /** 99th percentile latency */
354
+ p99_latency_ms: number;
355
+ }
356
+
357
+ /**
358
+ * Stress test configuration.
+ *
+ * Type of `StressManifest.config`. `model` and `max_requests` are optional;
+ * `concurrency`, `duration_seconds` and `ramp_up_seconds` are required numbers.
359
+ */
360
+ export interface StressConfig {
361
+ scenario: string;
362
+ provider: string;
363
+ model?: string;
364
+ concurrency: number;
365
+ duration_seconds: number;
366
+ ramp_up_seconds: number;
367
+ max_requests?: number;
368
+ }
369
+
370
+ /**
371
+ * Complete stress test manifest.
+ *
+ * Discriminated by the literal `type: 'stress'`, which is what
+ * isStressManifest checks. Request outcomes are stored in `sample_results`,
+ * described below as a sample rather than the full set.
+ * `resolved_config` and `redaction` are the only optional fields.
372
+ */
373
+ export interface StressManifest {
374
+ version: string;
375
+ type: 'stress';
376
+ run_id: string;
377
+ project: string;
378
+ start_time: string;
379
+ end_time: string;
380
+ duration_ms: number;
381
+ config: StressConfig;
382
+ /** Resolved configuration with full provider details and source tracking */
383
+ resolved_config?: ResolvedConfig;
384
+ metrics: StressMetrics;
385
+ git: GitInfo;
386
+ provenance: ProvenanceInfo;
387
+ /** Sample of request results (not all, to keep size manageable) */
388
+ sample_results: StressRequestResult[];
389
+ environment: {
390
+ node_version: string;
391
+ platform: string;
392
+ arch: string;
393
+ };
394
+ /** Redaction information for this run */
395
+ redaction?: ManifestRedactionInfo;
396
+ }
397
+
398
+ // ============================================================================
399
+ // Union type for all manifest types
400
+ // ============================================================================
401
+
402
+ /**
403
+ * Any manifest type.
+ *
+ * Union of the three manifest shapes. RedTeamManifest and StressManifest
+ * carry a literal `type` tag; RunManifest has none, so the union is told
+ * apart with the isRunManifest / isRedTeamManifest / isStressManifest guards.
404
+ */
405
+ export type AnyManifest = RunManifest | RedTeamManifest | StressManifest;
406
+
407
/**
 * Type guard for RunManifest.
 *
 * A manifest counts as a run manifest when it has no `type` property at all,
 * or when that property is present but `undefined`.
 *
 * @param manifest - Manifest of any kind.
 * @returns `true` when `manifest` should be treated as a RunManifest.
 */
export function isRunManifest(manifest: AnyManifest): manifest is RunManifest {
  if ('type' in manifest) {
    // A present-but-undefined tag is still treated as "untagged".
    return manifest.type === undefined;
  }
  return true;
}
413
+
414
/**
 * Type guard for RedTeamManifest.
 *
 * @param manifest - Manifest of any kind.
 * @returns `true` only when `manifest` has a `type` property equal to `'redteam'`.
 */
export function isRedTeamManifest(manifest: AnyManifest): manifest is RedTeamManifest {
  if (!('type' in manifest)) {
    return false;
  }
  return manifest.type === 'redteam';
}
420
+
421
/**
 * Type guard for StressManifest.
 *
 * @param manifest - Manifest of any kind.
 * @returns `true` only when `manifest` has a `type` property equal to `'stress'`.
 */
export function isStressManifest(manifest: AnyManifest): manifest is StressManifest {
  if (!('type' in manifest)) {
    return false;
  }
  return manifest.type === 'stress';
}
@@ -0,0 +1,58 @@
1
+ /**
2
+ * Tests for ContainsEvaluator
3
+ */
4
+
5
+ import { describe, expect, test } from 'bun:test';
6
+ import { ContainsEvaluator } from './contains';
7
+
8
+ describe('ContainsEvaluator', () => {
9
+ const evaluator = new ContainsEvaluator(); // single instance reused by every test in this suite
10
+
11
+ test('passes when all values present (mode: all)', async () => {
12
+ const result = await evaluator.evaluate('The colors are red, blue, and yellow.', {
13
+ type: 'contains',
14
+ values: ['red', 'blue', 'yellow'],
15
+ mode: 'all',
16
+ });
17
+ expect(result.passed).toBe(true);
18
+ expect(result.score).toBe(1); // 3 of 3 values found
19
+ });
20
+
21
+ test('fails when not all values present (mode: all)', async () => {
22
+ const result = await evaluator.evaluate('The colors are red and blue.', {
23
+ type: 'contains',
24
+ values: ['red', 'blue', 'yellow'],
25
+ mode: 'all',
26
+ });
27
+ expect(result.passed).toBe(false);
28
+ expect(result.score).toBeCloseTo(0.67, 1); // 2 of 3 values found ('yellow' is absent)
29
+ });
30
+
31
+ test('passes when any value present (mode: any)', async () => {
32
+ const result = await evaluator.evaluate('I like red.', {
33
+ type: 'contains',
34
+ values: ['red', 'blue', 'yellow'],
35
+ mode: 'any',
36
+ });
37
+ expect(result.passed).toBe(true); // only pass/fail is asserted here, not the score
38
+ });
39
+
40
+ test('fails when no values present (mode: any)', async () => {
41
+ const result = await evaluator.evaluate('I like green.', {
42
+ type: 'contains',
43
+ values: ['red', 'blue', 'yellow'],
44
+ mode: 'any',
45
+ });
46
+ expect(result.passed).toBe(false);
47
+ expect(result.score).toBe(0); // 0 of 3 values found
48
+ });
49
+
50
+ test('is case insensitive', async () => {
51
+ const result = await evaluator.evaluate('RED BLUE YELLOW', { // upper-case response vs lower-case values
52
+ type: 'contains',
53
+ values: ['red', 'blue', 'yellow'],
54
+ mode: 'all',
55
+ });
56
+ expect(result.passed).toBe(true);
57
+ });
58
+ });
@@ -0,0 +1,41 @@
1
+ /**
2
+ * Contains evaluator - checks if response contains specific values
3
+ */
4
+
5
+ import type { Expected } from '../scenario/schema';
6
+ import type { Evaluator, EvaluatorResult } from './types';
7
+
8
/**
 * Evaluator that checks whether a response contains a set of expected substrings.
 *
 * Matching is case-insensitive: the response and every expected value are
 * lower-cased before the substring test.
 */
export class ContainsEvaluator implements Evaluator {
  readonly type = 'contains';

  /**
   * Check `response` against the `values` listed in `expected`.
   *
   * @param response - Text to search.
   * @param expected - Expectation with `type: 'contains'`, a `values` list and a `mode`.
   *   Mode `'all'` requires every value to be found; any other mode requires at least one.
   * @returns Pass/fail flag, a score equal to the fraction of values found
   *   (1 when `values` is empty), a human-readable reason and per-value details.
   * @throws Error when `expected.type` is not `'contains'`.
   */
  async evaluate(response: string, expected: Expected): Promise<EvaluatorResult> {
    if (expected.type !== 'contains') {
      throw new Error('Invalid expected type for ContainsEvaluator');
    }

    const { values, mode } = expected;
    const normalizedResponse = response.toLowerCase();
    const results = values.map((value) => ({
      value,
      found: normalizedResponse.includes(value.toLowerCase()),
    }));

    const totalCount = values.length;
    const foundCount = results.filter((r) => r.found).length;

    // An empty value list is vacuously satisfied in every mode. Previously
    // mode 'any' reported passed=false together with score=1 for this input,
    // contradicting both itself and mode 'all' (which already passed).
    const passed =
      totalCount === 0 || (mode === 'all' ? foundCount === totalCount : foundCount > 0);

    // Fraction of values found; nothing to look for counts as a perfect score.
    const score = totalCount > 0 ? foundCount / totalCount : 1;

    return {
      passed,
      score,
      reason: passed
        ? `Found ${foundCount}/${totalCount} values (mode: ${mode})`
        : `Missing required values (mode: ${mode})`,
      details: {
        mode,
        results,
        foundCount,
        totalCount,
      },
    };
  }
}
@@ -0,0 +1,48 @@
1
+ /**
2
+ * Tests for ExactEvaluator
3
+ */
4
+
5
+ import { describe, expect, test } from 'bun:test';
6
+ import { ExactEvaluator } from './exact';
7
+
8
+ describe('ExactEvaluator', () => {
9
+ const evaluator = new ExactEvaluator(); // single instance reused by every test in this suite
10
+
11
+ test('passes on exact match', async () => {
12
+ const result = await evaluator.evaluate('hello world', {
13
+ type: 'exact',
14
+ value: 'hello world',
15
+ caseSensitive: true,
16
+ });
17
+ expect(result.passed).toBe(true);
18
+ expect(result.score).toBe(1); // score is 1 on a match
19
+ });
20
+
21
+ test('fails on mismatch', async () => {
22
+ const result = await evaluator.evaluate('hello world', {
23
+ type: 'exact',
24
+ value: 'goodbye world',
25
+ caseSensitive: true,
26
+ });
27
+ expect(result.passed).toBe(false);
28
+ expect(result.score).toBe(0); // score is 0 on a mismatch
29
+ });
30
+
31
+ test('handles case insensitive matching', async () => {
32
+ const result = await evaluator.evaluate('Hello World', { // differs from the expected value only by letter case
33
+ type: 'exact',
34
+ value: 'hello world',
35
+ caseSensitive: false,
36
+ });
37
+ expect(result.passed).toBe(true);
38
+ });
39
+
40
+ test('trims whitespace', async () => {
41
+ const result = await evaluator.evaluate(' hello world ', { // response has surrounding whitespace; expected value does not
42
+ type: 'exact',
43
+ value: 'hello world',
44
+ caseSensitive: true,
45
+ });
46
+ expect(result.passed).toBe(true);
47
+ });
48
+ });
@@ -0,0 +1,33 @@
1
+ /**
2
+ * Exact match evaluator
3
+ */
4
+
5
+ import type { Expected } from '../scenario/schema';
6
+ import type { Evaluator, EvaluatorResult } from './types';
7
+
8
/**
 * Evaluator that passes only when the response equals the expected value.
 *
 * Both sides are trimmed before comparison, and lower-cased as well when
 * `caseSensitive` is false.
 */
export class ExactEvaluator implements Evaluator {
  readonly type = 'exact';

  /**
   * Compare `response` with `expected.value`.
   *
   * @param response - Text to compare.
   * @param expected - Expectation with `type: 'exact'`, a `value` and a `caseSensitive` flag.
   * @returns Score 1 with reason `Exact match` on success; otherwise score 0 and a
   *   reason quoting the expected value and at most the first 100 characters of the response.
   * @throws Error when `expected.type` is not `'exact'`.
   */
  async evaluate(response: string, expected: Expected): Promise<EvaluatorResult> {
    if (expected.type !== 'exact') {
      throw new Error('Invalid expected type for ExactEvaluator');
    }

    const { value: wanted, caseSensitive } = expected;

    let actualKey = response.trim();
    let wantedKey = wanted.trim();
    if (!caseSensitive) {
      actualKey = actualKey.toLowerCase();
      wantedKey = wantedKey.toLowerCase();
    }

    // Details always report the raw, un-normalised inputs.
    const details = {
      expected: wanted,
      actual: response,
      caseSensitive,
    };

    if (actualKey === wantedKey) {
      return { passed: true, score: 1, reason: 'Exact match', details };
    }

    // Keep the failure message short for long responses.
    const preview = response.length > 100 ? `${response.slice(0, 100)}...` : response;
    return {
      passed: false,
      score: 0,
      reason: `Expected "${wanted}", got "${preview}"`,
      details,
    };
  }
}
@@ -0,0 +1,50 @@
1
+ /**
2
+ * Tests for FuzzyEvaluator
3
+ */
4
+
5
+ import { describe, expect, test } from 'bun:test';
6
+ import { FuzzyEvaluator } from './fuzzy';
7
+
8
+ describe('FuzzyEvaluator', () => {
9
+ const evaluator = new FuzzyEvaluator(); // single instance reused by every test in this suite
10
+
11
+ test('passes on exact match', async () => {
12
+ const result = await evaluator.evaluate('hello world', {
13
+ type: 'fuzzy',
14
+ value: 'hello world',
15
+ threshold: 0.8,
16
+ });
17
+ expect(result.passed).toBe(true);
18
+ expect(result.score).toBe(1); // identical strings: distance 0, similarity 1
19
+ });
20
+
21
+ test('passes on similar text above threshold', async () => {
22
+ const result = await evaluator.evaluate('helo world', { // one character short of 'hello world'
23
+ type: 'fuzzy',
24
+ value: 'hello world',
25
+ threshold: 0.8,
26
+ });
27
+ expect(result.passed).toBe(true);
28
+ expect(result.score).toBeGreaterThan(0.8); // distance 1 over max length 11 gives roughly 0.91
29
+ });
30
+
31
+ test('fails on dissimilar text below threshold', async () => {
32
+ const result = await evaluator.evaluate('goodbye universe', {
33
+ type: 'fuzzy',
34
+ value: 'hello world',
35
+ threshold: 0.8,
36
+ });
37
+ expect(result.passed).toBe(false);
38
+ expect(result.score).toBeLessThan(0.8);
39
+ });
40
+
41
+ test('is case insensitive', async () => {
42
+ const result = await evaluator.evaluate('HELLO WORLD', { // differs from the expected value only by letter case
43
+ type: 'fuzzy',
44
+ value: 'hello world',
45
+ threshold: 0.8,
46
+ });
47
+ expect(result.passed).toBe(true);
48
+ expect(result.score).toBe(1);
49
+ });
50
+ });
@@ -0,0 +1,39 @@
1
+ /**
2
+ * Fuzzy match evaluator using Levenshtein distance
3
+ */
4
+
5
+ import { distance } from 'fastest-levenshtein';
6
+ import type { Expected } from '../scenario/schema';
7
+ import type { Evaluator, EvaluatorResult } from './types';
8
+
9
/**
 * Evaluator that scores a response by its Levenshtein similarity to the
 * expected value.
 *
 * Both strings are trimmed and lower-cased first. Similarity is
 * `1 - distance / longerLength`, or 1 when both normalised strings are empty.
 */
export class FuzzyEvaluator implements Evaluator {
  readonly type = 'fuzzy';

  /**
   * Compare `response` with `expected.value` and test the similarity against
   * `expected.threshold`.
   *
   * @param response - Text to compare.
   * @param expected - Expectation with `type: 'fuzzy'`, a `value` and a `threshold`.
   * @returns Passes when similarity is at least the threshold; the score is the
   *   similarity itself. Details include the raw distance and the original strings.
   * @throws Error when `expected.type` is not `'fuzzy'`.
   */
  async evaluate(response: string, expected: Expected): Promise<EvaluatorResult> {
    if (expected.type !== 'fuzzy') {
      throw new Error('Invalid expected type for FuzzyEvaluator');
    }

    const { value: target, threshold } = expected;
    const actualText = response.trim().toLowerCase();
    const targetText = target.trim().toLowerCase();

    const levenshteinDistance = distance(actualText, targetText);
    const longerLength = Math.max(actualText.length, targetText.length);

    // Two empty strings are treated as identical; this also avoids dividing by zero.
    let similarity = 1;
    if (longerLength > 0) {
      similarity = 1 - levenshteinDistance / longerLength;
    }

    const asPercent = (ratio: number): string => (ratio * 100).toFixed(1);

    return {
      passed: similarity >= threshold,
      score: similarity,
      reason: `Similarity: ${asPercent(similarity)}% (threshold: ${asPercent(threshold)}%)`,
      details: {
        levenshteinDistance,
        similarity,
        threshold,
        expected: target,
        actual: response,
      },
    };
  }
}