@artemiskit/core 0.2.3 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -173,3 +173,199 @@ export interface BaselineStorageAdapter extends StorageAdapter {
173
173
  regressionThreshold: number;
174
174
  } | null>;
175
175
  }
176
+
177
+ // ============================================================================
178
+ // Case Results Types (for granular analytics)
179
+ // ============================================================================
180
+
181
/**
 * Outcome recorded for a single case result.
 * One of the three string literals below.
 */
export type CaseResultStatus =
  | 'passed'
  | 'failed'
  | 'error';
185
+
186
/**
 * Storage record describing the result of one test case within a run.
 */
export interface CaseResultRecord {
  /** Unique record ID; auto-generated when omitted. */
  id?: string;

  /** ID of the run this case result belongs to. */
  runId: string;

  /** Case ID as declared in the test. */
  caseId: string;

  /** Human-readable case name, when one exists. */
  caseName?: string;

  /** Outcome of the case. */
  status: CaseResultStatus;

  /** Score in the range 0.0 to 1.0. */
  score: number;

  /** Type of matcher that evaluated the case. */
  matcherType: string;

  /** Explanation for the recorded status. */
  reason?: string;

  /** Response produced by the model. */
  response: string;

  /** Latency in milliseconds. */
  latencyMs: number;

  /** Number of prompt tokens used. */
  promptTokens: number;

  /** Number of completion tokens used. */
  completionTokens: number;

  /** Total number of tokens used. */
  totalTokens: number;

  /** Error message; set when `status` is 'error'. */
  error?: string;

  /** Tags used for categorization. */
  tags?: string[];

  /** Creation time as an ISO timestamp. */
  createdAt?: string;
}
223
+
224
/**
 * Filters and paging controls accepted when querying case results.
 * Every field is optional.
 */
export interface CaseResultQueryOptions {
  /** Restrict results to this run ID. */
  runId?: string;

  /** Restrict results to this case ID. */
  caseId?: string;

  /** Restrict results to this status. */
  status?: CaseResultStatus;

  /** Restrict results to records carrying any of these tags. */
  tags?: string[];

  /** Maximum number of results to return. */
  limit?: number;

  /** Pagination offset. */
  offset?: number;
}
241
+
242
+ // ============================================================================
243
+ // Metrics History Types (for trending)
244
+ // ============================================================================
245
+
246
/**
 * Daily metrics snapshot for a project, optionally narrowed to one scenario.
 */
export interface MetricsSnapshot {
  /** Unique ID (auto-generated if not provided) */
  id?: string;
  /** Date of the snapshot (YYYY-MM-DD) */
  date: string;
  /** Project name */
  project: string;
  /**
   * Optional scenario name. Leave it out (undefined) for a project-wide
   * snapshot; the field is typed `string`, so `null` is not assignable here.
   */
  scenario?: string;
  /** Total runs on this date */
  totalRuns: number;
  /** Total cases across all runs */
  totalCases: number;
  /** Total passed cases */
  passedCases: number;
  /** Total failed cases */
  failedCases: number;
  /** Average success rate */
  avgSuccessRate: number;
  /** Average latency in ms */
  avgLatencyMs: number;
  /** Average tokens per run */
  avgTokensPerRun: number;
  /** Minimum success rate */
  minSuccessRate?: number;
  /** Maximum success rate */
  maxSuccessRate?: number;
  /** Minimum latency in ms */
  minLatencyMs?: number;
  /** Maximum latency in ms */
  maxLatencyMs?: number;
  /** Total tokens consumed */
  totalTokens: number;
  /** ISO timestamp when created */
  createdAt?: string;
  /** ISO timestamp when last updated */
  updatedAt?: string;
}
287
+
288
/**
 * Parameters accepted when querying metrics history.
 * Only `project` is required.
 */
export interface MetricsTrendOptions {
  /** Project whose history is queried. */
  project: string;

  /** Restrict the query to this scenario. */
  scenario?: string;

  /** Start date, formatted YYYY-MM-DD. */
  startDate?: string;

  /** End date, formatted YYYY-MM-DD. */
  endDate?: string;

  /** Maximum number of results to return. */
  limit?: number;
}
303
+
304
/**
 * One point of trend data, intended for visualization.
 */
export interface TrendDataPoint {
  /** Date this point refers to (presumably YYYY-MM-DD, like MetricsSnapshot.date — confirm). */
  date: string;

  /** Success rate for this point. */
  successRate: number;

  /** Latency for this point, in milliseconds. */
  latencyMs: number;

  /** Number of runs for this point. */
  totalRuns: number;

  /** Number of tokens for this point. */
  totalTokens: number;
}
314
+
315
+ // ============================================================================
316
+ // Enhanced Storage Adapter with Analytics
317
+ // ============================================================================
318
+
319
/**
 * Storage adapter contract that adds per-case results and metrics history
 * on top of BaselineStorageAdapter.
 */
export interface AnalyticsStorageAdapter extends BaselineStorageAdapter {
  /**
   * Persist one case result.
   * @returns A string for the saved record (presumably its ID — confirm with implementations).
   */
  saveCaseResult(result: CaseResultRecord): Promise<string>;

  /**
   * Persist several case results as a batch.
   * @returns One string per saved record (presumably the IDs — confirm with implementations).
   */
  saveCaseResults(results: CaseResultRecord[]): Promise<string[]>;

  /** Fetch the case results that belong to the given run. */
  getCaseResults(runId: string): Promise<CaseResultRecord[]>;

  /** Fetch case results matching the supplied filters. */
  queryCaseResults(options: CaseResultQueryOptions): Promise<CaseResultRecord[]>;

  /**
   * Persist a metrics snapshot.
   * @returns A string for the saved snapshot (presumably its ID — confirm with implementations).
   */
  saveMetricsSnapshot(snapshot: MetricsSnapshot): Promise<string>;

  /** Fetch trend data points for the given project/scenario/date options. */
  getMetricsTrend(options: MetricsTrendOptions): Promise<TrendDataPoint[]>;

  /**
   * Fetch one specific metrics snapshot.
   * @returns The snapshot, or null (presumably when none is stored — confirm).
   */
  getMetricsSnapshot(date: string, project: string, scenario?: string): Promise<MetricsSnapshot | null>;

  /**
   * Aggregate daily metrics from runs and save them.
   * Optional: adapters may leave it unimplemented, so callers must check for it.
   * It can be called to build or update metrics_history from existing runs.
   */
  aggregateDailyMetrics?(date: string, project: string, scenario?: string): Promise<MetricsSnapshot>;
}
@@ -0,0 +1,6 @@
1
+ /**
2
+ * Validator module exports
3
+ */
4
+
5
+ export * from './types';
6
+ export { ScenarioValidator } from './validator';
@@ -0,0 +1,62 @@
1
+ /**
2
+ * Validator types
3
+ */
4
+
5
/**
 * Severity attached to a validation issue.
 */
export type ValidationSeverity =
  | 'error'
  | 'warning';
9
+
10
/**
 * A single error or warning produced by validation.
 */
export interface ValidationIssue {
  /** 1-indexed line number in the file. */
  line: number;

  /** Column number, when known. */
  column?: number;

  /** Text of the error or warning. */
  message: string;

  /** Name of the rule that raised the issue. */
  rule: string;

  /** Whether this is an error or a warning. */
  severity: ValidationSeverity;

  /** Suggested fix, when one is available. */
  suggestion?: string;
}
27
+
28
/**
 * Outcome of validating one file.
 */
export interface ValidationResult {
  /** Path of the validated file. */
  file: string;

  /** True when the file produced no errors. */
  valid: boolean;

  /** Errors found in the file. */
  errors: ValidationIssue[];

  /** Warnings found in the file. */
  warnings: ValidationIssue[];
}
41
+
42
/**
 * Aggregate counts for a validation pass over several files.
 */
export interface ValidationSummary {
  /** Number of files validated. */
  total: number;

  /** Number of files that passed validation. */
  passed: number;

  /** Number of files that failed validation. */
  failed: number;

  /** Number of files that had warnings only. */
  withWarnings: number;
}
55
+
56
/**
 * Settings accepted by the validator.
 */
export interface ValidatorOptions {
  /** When set, warnings are to be treated as errors. */
  strict?: boolean;
}
@@ -0,0 +1,345 @@
1
+ /**
2
+ * Scenario Validator
3
+ *
4
+ * Validates scenario files for:
5
+ * 1. YAML syntax errors
6
+ * 2. Schema violations (required fields, types)
7
+ * 3. Semantic errors (duplicate IDs, undefined variables)
8
+ * 4. Warnings (deprecated patterns)
9
+ */
10
+
11
+ import { readFileSync } from 'node:fs';
12
+ import yaml from 'yaml';
13
+ import type { ZodError } from 'zod';
14
+ import { ScenarioSchema } from '../scenario/schema';
15
+ import type { ValidationIssue, ValidationResult, ValidatorOptions } from './types';
16
+
17
/**
 * Scenario validator class.
 *
 * `validate` checks one scenario file in four stages:
 *   1. YAML syntax,
 *   2. schema conformance (via `ScenarioSchema`),
 *   3. semantic rules (duplicate case IDs, undefined `{{variable}}` references),
 *   4. non-blocking warnings (deprecated fields, hints).
 *
 * Line numbers attached to schema/semantic issues are best-effort: they are
 * recovered by scanning the raw text, not from the YAML parser, so they are
 * approximate for unusual layouts (flow style, anchors/aliases).
 */
export class ScenarioValidator {
  private readonly _options: ValidatorOptions;

  /**
   * @param options Validator settings. They are stored and exposed through
   *   the `options` getter.
   */
  constructor(options: ValidatorOptions = {}) {
    this._options = options;
  }

  /** Options this validator was constructed with. */
  get options(): ValidatorOptions {
    return this._options;
  }

  /**
   * Validate a scenario file.
   *
   * Never throws for unreadable or malformed files: those conditions are
   * reported as issues in the returned result.
   *
   * @param filePath Path of the scenario file to read and validate.
   * @returns The collected errors and warnings; `valid` is true only when no
   *   errors were found.
   */
  validate(filePath: string): ValidationResult {
    const errors: ValidationIssue[] = [];
    const warnings: ValidationIssue[] = [];

    // Read file content
    let content: string;
    try {
      content = readFileSync(filePath, 'utf-8');
    } catch (err) {
      errors.push({
        line: 1,
        message: `Failed to read file: ${ScenarioValidator.errorMessage(err)}`,
        rule: 'file-read',
        severity: 'error',
      });
      return { file: filePath, valid: false, errors, warnings };
    }

    // Level 1: YAML Syntax validation
    let parsed: unknown;
    try {
      parsed = yaml.parse(content, {
        prettyErrors: true,
        strict: true,
      });
    } catch (err) {
      if (err instanceof yaml.YAMLError) {
        const linePos = err.linePos?.[0];
        errors.push({
          // `||` on purpose: a missing or 0 line both fall back to line 1.
          line: linePos?.line || 1,
          column: linePos?.col,
          message: `Invalid YAML syntax: ${err.message}`,
          rule: 'yaml-syntax',
          severity: 'error',
        });
      } else {
        errors.push({
          line: 1,
          message: `YAML parse error: ${ScenarioValidator.errorMessage(err)}`,
          rule: 'yaml-syntax',
          severity: 'error',
        });
      }
      return { file: filePath, valid: false, errors, warnings };
    }

    // Check if parsed result is null or not an object
    if (parsed === null || typeof parsed !== 'object') {
      errors.push({
        line: 1,
        message: 'Scenario must be a YAML object',
        rule: 'schema-type',
        severity: 'error',
      });
      return { file: filePath, valid: false, errors, warnings };
    }

    // Level 2: Schema validation using Zod.
    // Level 3: Semantic validation runs only when the schema passed, because
    // it relies on the parsed, schema-shaped data.
    const schemaResult = ScenarioSchema.safeParse(parsed);
    if (schemaResult.success) {
      errors.push(...this.validateSemantics(schemaResult.data, content));
    } else {
      errors.push(...this.formatZodErrors(schemaResult.error, content));
    }

    // Level 4: Warnings detection (runs even when the schema failed)
    warnings.push(...this.detectWarnings(parsed, content));

    // NOTE(review): `options.strict` ("treat warnings as errors") is not
    // applied here; `valid` reflects errors only. Presumably the caller
    // applies strict mode via the `options` getter — confirm.
    return {
      file: filePath,
      valid: errors.length === 0,
      errors,
      warnings,
    };
  }

  /** Text to report for a caught value, which may not be an Error instance. */
  private static errorMessage(err: unknown): string {
    return err instanceof Error ? err.message : String(err);
  }

  /** Split file content into lines, accepting both LF and CRLF endings. */
  private static splitLines(content: string): string[] {
    return content.split(/\r?\n/);
  }

  /** Escape a string so it can be embedded literally in a RegExp source. */
  private static escapeRegExp(text: string): string {
    return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  }

  /**
   * Pattern matching `key:` in key position: at the start of a line (after
   * indentation and any `- ` list markers) or right after `{` / `,` in flow
   * style. Occurrences inside ordinary string values do not match.
   */
  private static keyPattern(key: string): RegExp {
    const escaped = ScenarioValidator.escapeRegExp(key);
    return new RegExp(`(?:^\\s*(?:-\\s+)*|[{,]\\s*)["']?${escaped}["']?\\s*:`);
  }

  /**
   * Format Zod errors into ValidationIssues
   *
   * @param error Error returned by `ScenarioSchema.safeParse`.
   * @param content Raw file text, used to estimate line numbers.
   * @returns One issue per Zod issue, with rule `schema-<zod code>`.
   */
  private formatZodErrors(error: ZodError, content: string): ValidationIssue[] {
    const issues: ValidationIssue[] = [];
    const lines = ScenarioValidator.splitLines(content);

    for (const issue of error.issues) {
      const path = issue.path.join('.');
      const line = this.findLineForPath(lines, issue.path);

      let message: string;
      switch (issue.code) {
        case 'invalid_type':
          message = `'${path}' expected ${issue.expected}, received ${issue.received}`;
          break;
        case 'invalid_enum_value':
          message = `'${path}' must be one of: ${(issue as { options: string[] }).options.join(', ')}`;
          break;
        case 'too_small':
          if ((issue as { type: string }).type === 'array') {
            message = `'${path}' must have at least ${(issue as { minimum: number }).minimum} item(s)`;
          } else {
            message = `'${path}' is too small`;
          }
          break;
        case 'unrecognized_keys':
          message = `Unrecognized field(s): ${(issue as { keys: string[] }).keys.join(', ')}`;
          break;
        default:
          message = issue.message;
      }

      issues.push({
        line,
        message,
        rule: `schema-${issue.code}`,
        severity: 'error',
      });
    }

    return issues;
  }

  /**
   * Locate the list items under the `cases:` key by scanning the raw text.
   *
   * @returns `starts`: 0-based line index of each case item, in order;
   *   `end`: 0-based index of the first line after the cases block. `starts`
   *   is empty when no block-style `cases:` list is found.
   */
  private findCaseBlock(lines: string[]): { starts: number[]; end: number } {
    const starts: number[] = [];
    const keyIndex = lines.findIndex((text) => /^\s*cases\s*:/.test(text));
    if (keyIndex === -1) return { starts, end: lines.length };

    const keyLine = lines[keyIndex] ?? '';
    const keyIndent = keyLine.length - keyLine.trimStart().length;
    let itemIndent = -1;

    for (let i = keyIndex + 1; i < lines.length; i++) {
      const text = lines[i] ?? '';
      const trimmed = text.trim();
      if (trimmed === '' || trimmed.startsWith('#')) continue;

      const indent = text.length - text.trimStart().length;
      const isItem = trimmed === '-' || trimmed.startsWith('- ');

      if (itemIndent === -1) {
        // First content line after `cases:` must open the list; otherwise
        // the layout is not block style and this heuristic does not apply.
        if (!isItem) return { starts, end: lines.length };
        itemIndent = indent;
      }

      if (isItem && indent === itemIndent) {
        starts.push(i);
        continue;
      }
      // Dedented back to (or past) the `cases:` key: the block is over.
      if (indent <= keyIndent) return { starts, end: i };
    }

    return { starts, end: lines.length };
  }

  /**
   * Find approximate line number for a YAML path
   *
   * When the path goes through `cases[n]`, the search is limited to the lines
   * of the n-th case item, so the reported line belongs to the right case. The
   * key searched for is the nearest named (non-index) path segment.
   *
   * @param lines File content split into lines.
   * @param path Zod issue path (keys and array indices).
   * @returns 1-indexed line; the case's first line when the key is absent
   *   inside a located case, otherwise 1 when nothing matches.
   */
  private findLineForPath(lines: string[], path: (string | number)[]): number {
    if (path.length === 0) return 1;

    let start = 0;
    let end = lines.length;
    let fallback = 1; // Default to first line
    let keyFrom = 0; // first path segment eligible as the search key

    const casesAt = path.indexOf('cases');
    const caseIndex = casesAt === -1 ? undefined : path[casesAt + 1];
    if (typeof caseIndex === 'number') {
      const block = this.findCaseBlock(lines);
      const first = block.starts[caseIndex];
      if (first !== undefined) {
        start = first;
        end = block.starts[caseIndex + 1] ?? block.end;
        fallback = first + 1;
        keyFrom = casesAt + 2;
      }
    }

    // Array indices never appear as text, so use the nearest named segment.
    let searchKey: string | undefined;
    for (let i = path.length - 1; i >= keyFrom; i--) {
      const segment = path[i];
      if (typeof segment === 'string') {
        searchKey = segment;
        break;
      }
    }
    // Path ends at the case item itself (e.g. ['cases', 2]).
    if (searchKey === undefined) return fallback;

    const pattern = ScenarioValidator.keyPattern(searchKey);
    for (let i = start; i < end; i++) {
      if (pattern.test(lines[i] ?? '')) return i + 1; // 1-indexed
    }

    return fallback;
  }

  /**
   * Validate semantic rules
   *
   * Reports duplicate case IDs (on the line of the repeated occurrence) and
   * `{{variable}}` references in prompts that are defined neither in the
   * scenario-level nor in the case-level variables. All duplicate-ID errors
   * come first, followed by undefined-variable errors in case order.
   *
   * @param scenario Schema-validated scenario data.
   * @param content Raw file text, used to estimate line numbers.
   */
  private validateSemantics(
    scenario: {
      cases: Array<{ id: string; prompt: string | unknown; variables?: Record<string, unknown> }>;
      variables?: Record<string, unknown>;
    },
    content: string
  ): ValidationIssue[] {
    const errors: ValidationIssue[] = [];
    const lines = ScenarioValidator.splitLines(content);

    // Check for duplicate case IDs, remembering each case's own line so that
    // later errors for a repeated ID point at the right occurrence.
    const seenCounts = new Map<string, number>();
    const caseLines: number[] = [];
    for (const testCase of scenario.cases) {
      const occurrence = seenCounts.get(testCase.id) ?? 0;
      seenCounts.set(testCase.id, occurrence + 1);

      const line = this.findLineForCaseId(lines, testCase.id, occurrence);
      caseLines.push(line);

      if (occurrence > 0) {
        errors.push({
          line,
          message: `Duplicate case ID: '${testCase.id}'`,
          rule: 'duplicate-case-id',
          severity: 'error',
        });
      }
    }

    // Check variable references
    const globalVars = scenario.variables || {};
    scenario.cases.forEach((testCase, index) => {
      const caseVars = testCase.variables || {};
      const allVars = { ...globalVars, ...caseVars };

      // Non-string prompts are serialized so that references anywhere inside
      // them are still found.
      const prompt =
        typeof testCase.prompt === 'string'
          ? testCase.prompt
          : JSON.stringify(testCase.prompt) ?? '';

      for (const ref of this.extractVariableRefs(prompt)) {
        // Own-property check: `in` would accept inherited names such as
        // `toString` or `constructor` as if they were defined variables.
        if (Object.prototype.hasOwnProperty.call(allVars, ref)) continue;
        errors.push({
          line: caseLines[index] ?? 1,
          message: `Undefined variable '{{${ref}}}' in case '${testCase.id}'`,
          rule: 'undefined-variable',
          severity: 'error',
          suggestion: `Define '${ref}' in scenario.variables or case.variables`,
        });
      }
    });

    return errors;
  }

  /**
   * Find line number for a case ID
   *
   * Prefers lines where `id:` is in key position and the value is exactly
   * `caseId` (bare or quoted), so `test-1` does not match `id: test-10`.
   * Falls back to a plain substring search when no such line exists.
   *
   * @param lines File content split into lines.
   * @param caseId Case ID to look for.
   * @param occurrence 0-based index of the occurrence wanted; when there are
   *   fewer exact matches, the last exact match is returned.
   * @returns 1-indexed line number, or 1 when the ID is not found.
   */
  private findLineForCaseId(lines: string[], caseId: string, occurrence = 0): number {
    const id = ScenarioValidator.escapeRegExp(caseId);
    const exact = new RegExp(
      `(?:^\\s*(?:-\\s+)*|[{,]\\s*)["']?id["']?\\s*:\\s*(?:"${id}"|'${id}'|${id})(?=\\s*$|\\s*[,}]|\\s+#)`
    );

    let seen = 0;
    let lastExact = 0;
    for (let i = 0; i < lines.length; i++) {
      if (!exact.test(lines[i] ?? '')) continue;
      lastExact = i + 1;
      if (seen === occurrence) return lastExact;
      seen++;
    }
    if (lastExact > 0) return lastExact;

    // Loose fallback for layouts the exact pattern does not recognise.
    const loose = lines.findIndex(
      (text) =>
        text.includes(`id: ${caseId}`) ||
        text.includes(`id: "${caseId}"`) ||
        text.includes(`id: '${caseId}'`)
    );
    return loose === -1 ? 1 : loose + 1;
  }

  /**
   * Extract variable references from a string ({{varName}} format)
   *
   * @returns Distinct variable names in order of first appearance, so a
   *   variable used several times is reported once.
   */
  private extractVariableRefs(text: string): string[] {
    const names = new Set<string>();
    for (const match of text.matchAll(/\{\{(\w+)\}\}/g)) {
      const name = match[1];
      if (name !== undefined) names.add(name);
    }
    return [...names];
  }

  /**
   * Detect warnings (non-blocking issues)
   *
   * @param parsed Parsed YAML value (checked only when it is an object).
   * @param content Raw file text, used to estimate line numbers.
   */
  private detectWarnings(parsed: unknown, content: string): ValidationIssue[] {
    const warnings: ValidationIssue[] = [];
    const lines = ScenarioValidator.splitLines(content);

    if (parsed && typeof parsed === 'object') {
      const obj = parsed as Record<string, unknown>;

      // Check for deprecated 'criteria' field (should be 'rubric' for llm_grader)
      if (this.hasDeepKey(obj, 'criteria')) {
        const line = this.findLineForKey(lines, 'criteria');
        warnings.push({
          line,
          message: "'criteria' is deprecated, use 'rubric' instead (llm_grader)",
          rule: 'deprecated-field',
          severity: 'warning',
          suggestion: "Replace 'criteria' with 'rubric'",
        });
      }

      // Check for very large number of cases without parallel recommendation
      const cases = obj.cases as unknown[] | undefined;
      if (Array.isArray(cases) && cases.length > 20) {
        warnings.push({
          line: 1,
          message: `Scenario has ${cases.length} cases. Consider using --parallel for faster execution.`,
          rule: 'performance-hint',
          severity: 'warning',
        });
      }

      // Check for missing description
      if (!obj.description) {
        warnings.push({
          line: 1,
          message:
            "Scenario is missing 'description' field. Adding a description improves documentation.",
          rule: 'missing-description',
          severity: 'warning',
        });
      }
    }

    return warnings;
  }

  /**
   * Check if object has a key at any depth
   *
   * Only own properties count, so inherited names (e.g. `constructor`) are
   * never reported as present. Arrays are searched through their elements.
   */
  private hasDeepKey(obj: unknown, key: string): boolean {
    if (obj === null || typeof obj !== 'object') return false;

    if (Object.prototype.hasOwnProperty.call(obj, key)) return true;

    return Object.values(obj as Record<string, unknown>).some((value) =>
      this.hasDeepKey(value, key)
    );
  }

  /**
   * Find line number for a key
   *
   * Prefers a line where `key:` is in key position; falls back to the first
   * line containing `key:` anywhere.
   *
   * @returns 1-indexed line number, or 1 when the key is not found.
   */
  private findLineForKey(lines: string[], key: string): number {
    const pattern = ScenarioValidator.keyPattern(key);
    const exact = lines.findIndex((text) => pattern.test(text));
    if (exact !== -1) return exact + 1;

    const loose = lines.findIndex((text) => text.includes(`${key}:`));
    return loose === -1 ? 1 : loose + 1;
  }
}