@learning-commons/evaluators 0.3.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,464 @@
1
+ import { z } from 'zod';
2
+
3
+ /**
4
+ * Logging interface for the Evaluators SDK
5
+ *
6
+ * Provides structured logging with verbosity levels.
7
+ * Users can inject custom loggers or use the default console logger.
8
+ */
9
+ /**
10
+ * Log levels, ordered from most verbose (DEBUG = 0) to fully silent (SILENT = 4)
11
+ */
12
+ declare enum LogLevel {
13
+ /** Debug messages - very verbose, for development */
14
+ DEBUG = 0,
15
+ /** Informational messages - normal operations */
16
+ INFO = 1,
17
+ /** Warning messages - potentially problematic situations */
18
+ WARN = 2,
19
+ /** Error messages - errors that need attention */
20
+ ERROR = 3,
21
+ /** Silent - no logging; the highest numeric value in this enum */
22
+ SILENT = 4
23
+ }
24
+ /**
25
+ * Optional structured context passed alongside a log message; every named field is optional and extra keys are allowed
26
+ */
27
+ interface LogContext {
28
+ /** Evaluator type (vocabulary, sentence-structure, etc.) */
29
+ evaluator?: string;
30
+ /** Current operation or stage */
31
+ operation?: string;
32
+ /** Error object if applicable */
33
+ error?: Error;
34
+ /** Additional metadata: any other string key is accepted, typed as unknown so consumers must narrow before use */
35
+ [key: string]: unknown;
36
+ }
37
+ /**
38
+ * Logger interface: one method per severity, each receiving a message and an optional LogContext
39
+ *
40
+ * Implement this interface to provide custom logging behavior, then pass it as the `logger` config field.
41
+ *
42
+ * @example
43
+ * ```typescript
44
+ * const customLogger: Logger = {
45
+ * debug: (msg, ctx) => myLogger.debug(msg, ctx),
46
+ * info: (msg, ctx) => myLogger.info(msg, ctx),
47
+ * warn: (msg, ctx) => myLogger.warn(msg, ctx),
48
+ * error: (msg, ctx) => myLogger.error(msg, ctx),
49
+ * };
50
+ *
51
+ * const evaluator = new VocabularyEvaluator({
52
+ * googleApiKey: '...',
53
+ * openaiApiKey: '...',
54
+ * logger: customLogger,
55
+ * logLevel: LogLevel.INFO, // NOTE(review): logLevel is documented as used only when no custom logger is given - confirm it has any effect here
56
+ * });
57
+ * ```
58
+ */
59
+ interface Logger {
60
+ /**
61
+ * Log a debug message
62
+ * Used for detailed debugging information
63
+ */
64
+ debug(message: string, context?: LogContext): void;
65
+ /**
66
+ * Log an informational message
67
+ * Used for normal operations
68
+ */
69
+ info(message: string, context?: LogContext): void;
70
+ /**
71
+ * Log a warning message
72
+ * Used for potentially problematic situations
73
+ */
74
+ warn(message: string, context?: LogContext): void;
75
+ /**
76
+ * Log an error message
77
+ * Used for errors that need attention
78
+ */
79
+ error(message: string, context?: LogContext): void;
80
+ }
81
+
82
+ /**
83
+ * A single message in an LLM conversation: who is speaking (`role`) and the text that was said (`content`)
84
+ */
85
+ interface Message {
86
+ role: 'system' | 'user' | 'assistant';
87
+ content: string;
88
+ }
89
+ /**
90
+ * Request for structured LLM generation: the conversation plus a Zod schema whose output type T is also the type of the response data
91
+ */
92
+ interface LLMRequest<T> {
93
+ messages: Message[]; // conversation messages to send
94
+ schema: z.ZodSchema<T>; // Zod schema typed to the expected structured output T
95
+ temperature?: number; // optional; NOTE(review): the fallback used when omitted is not visible here
96
+ maxTokens?: number; // optional; NOTE(review): presumably caps output tokens - confirm
97
+ }
98
+ /**
99
+ * Response from a structured LLM call: the typed payload together with the model name, token usage and latency
100
+ */
101
+ interface LLMResponse<T> {
102
+ data: T; // payload typed by the same T as the request schema
103
+ model: string;
104
+ usage: {
105
+ inputTokens: number;
106
+ outputTokens: number;
107
+ };
108
+ latencyMs: number; // NOTE(review): milliseconds, going by the name - confirm what span is measured
109
+ }
110
+ /**
111
+ * Response from plain text generation: the generated text plus token usage and latency (no `model` field, unlike LLMResponse)
112
+ */
113
+ interface TextGenerationResponse {
114
+ text: string;
115
+ usage: {
116
+ inputTokens: number;
117
+ outputTokens: number;
118
+ };
119
+ latencyMs: number; // NOTE(review): milliseconds, going by the name - confirm what span is measured
120
+ }
121
+ /**
122
+ * Contract for LLM provider implementations; a custom implementation can be supplied through `ProviderConfig.customProvider`
123
+ */
124
+ interface LLMProvider {
125
+ /** Canonical label for the provider and model in use (e.g. "openai:gpt-4o") */
126
+ readonly label: string;
127
+ /**
128
+ * Generate structured output from the LLM using the Zod schema carried by the request; resolves with data of the same type T
129
+ */
130
+ generateStructured<T>(request: LLMRequest<T>): Promise<LLMResponse<T>>;
131
+ /**
132
+ * Generate plain text from the LLM for the given messages; `temperature` is optional
133
+ */
134
+ generateText(messages: Message[], temperature?: number): Promise<TextGenerationResponse>;
135
+ }
136
+ /**
137
+ * Named constants for LLM provider types — use instead of raw string literals. Each key maps to the identical string value.
138
+ */
139
+ declare const Providers: {
140
+ readonly google: "google";
141
+ readonly openai: "openai";
142
+ readonly anthropic: "anthropic";
143
+ readonly custom: "custom"; // has no counterpart in the `Provider` enum, which lists only openai, google and anthropic
144
+ };
145
+ /**
146
+ * Configuration for an LLM provider. `type` is derived from the `Providers` constant so the two cannot drift apart; it resolves to the same union as before ('openai' | 'anthropic' | 'google' | 'custom').
147
+ */
148
+ interface ProviderConfig {
149
+ type: (typeof Providers)[keyof typeof Providers];
150
+ apiKey?: string;
151
+ model?: string;
152
+ temperature?: number;
153
+ baseURL?: string;
154
+ customProvider?: LLMProvider; // NOTE(review): presumably required when type is "custom" - confirm
155
+ maxRetries?: number; // NOTE(review): BaseEvaluatorConfig documents a default of 2 - confirm the same default applies here
156
+ }
157
+
158
+ /**
159
+ * Outcome of an evaluation, as carried in `TelemetryEvent.status`
160
+ */
161
+ type EvaluationStatus = 'success' | 'error';
162
+ /**
163
+ * Token usage metrics from LLM providers, in the snake_case form used by telemetry types (LLMResponse.usage carries the camelCase equivalent)
164
+ */
165
+ interface TokenUsage {
166
+ input_tokens: number;
167
+ output_tokens: number;
168
+ }
169
+ /**
170
+ * Per-stage details for multi-stage evaluations; reported as entries of `TelemetryMetadata.stage_details`
171
+ */
172
+ interface StageDetail {
173
+ /** Stage name (e.g., "background_knowledge", "complexity_evaluation") */
174
+ stage: string;
175
+ /** Provider used for this stage (e.g., "openai:gpt-4o") */
176
+ provider: string;
177
+ /** Total latency including all retries (ms) */
178
+ latency_ms: number;
179
+ /** Token usage aggregated across all attempts (optional) */
180
+ token_usage?: TokenUsage;
181
+ /**
182
+ * Whether schema validation failed (indicates prompt needs clearer instructions)
183
+ *
184
+ * TODO: Not currently tracked, so do not rely on this field being populated. Vercel AI SDK abstracts validation away.
185
+ * To implement: Add custom retry wrapper that catches validation errors.
186
+ */
187
+ schema_validation_failed?: boolean;
188
+ }
189
+ /**
190
+ * Extensible metadata for telemetry events; at present the only declared field is `stage_details`
191
+ */
192
+ interface TelemetryMetadata {
193
+ /** Detailed breakdown by stage (for multi-stage evaluations) */
194
+ stage_details?: StageDetail[];
195
+ }
196
+ /**
197
+ * Telemetry event payload accepted by `TelemetryClient.send`; field names are snake_case
198
+ */
199
+ interface TelemetryEvent {
200
+ timestamp: string; // NOTE(review): string format (e.g. ISO 8601) is not visible here - confirm
201
+ sdk_version: string;
202
+ evaluator_type: string;
203
+ grade?: string;
204
+ status: EvaluationStatus; // 'success' | 'error'
205
+ error_code?: string; // NOTE(review): presumably set only when status is 'error' - confirm
206
+ latency_ms: number;
207
+ text_length_chars: number;
208
+ provider: string;
209
+ token_usage?: TokenUsage;
210
+ metadata?: TelemetryMetadata;
211
+ model_override?: boolean; // NOTE(review): presumably flags that a ModelOverride was in effect - confirm
212
+ input_text?: string; // NOTE(review): presumably only sent when TelemetryOptions.recordInputs is true - confirm
213
+ }
214
+ /**
215
+ * Configuration passed to the TelemetryClient constructor; every field except `partnerKey` is required
216
+ */
217
+ interface TelemetryConfig {
218
+ /** Analytics service endpoint URL */
219
+ endpoint: string;
220
+ /** Learning Commons partner key (optional, sent as X-API-Key header) */
221
+ partnerKey?: string;
222
+ /** Client ID for anonymous tracking (persistent UUID from ~/.config/learning-commons/config.json) */
223
+ clientId: string;
224
+ /** Whether telemetry is enabled. Required on this interface; the "default: true" is documented on the optional TelemetryOptions.enabled */
225
+ enabled: boolean;
226
+ /** Logger instance (respects the SDK's configured log level and custom logger) */
227
+ logger: Logger;
228
+ }
229
+
230
+ /**
231
+ * Telemetry client for sending analytics events, configured through a TelemetryConfig
232
+ *
233
+ * Fire-and-forget implementation that never blocks SDK operations.
234
+ * Errors are logged but don't fail evaluations.
235
+ */
236
+ declare class TelemetryClient {
237
+ private config;
238
+ private logger;
239
+ constructor(config: TelemetryConfig);
240
+ /**
241
+ * Send telemetry event to analytics service
242
+ *
243
+ * Fire-and-forget: Errors are logged but don't throw, so callers need no error handling around this call.
244
+ */
245
+ send(event: TelemetryEvent): Promise<void>;
246
+ }
247
+
248
+ /**
249
+ * Supported LLM providers (string enum). Unlike the `Providers` constant, there is no `custom` member.
250
+ */
251
+ declare enum Provider {
252
+ OpenAI = "openai",
253
+ Google = "google",
254
+ Anthropic = "anthropic"
255
+ }
256
+ /**
257
+ * Granular telemetry configuration options; usable as the object form of `BaseEvaluatorConfig.telemetry`
258
+ */
259
+ interface TelemetryOptions {
260
+ /** Enable telemetry (default: true) */
261
+ enabled?: boolean;
262
+ /** Record input text in telemetry (default: false) */
263
+ recordInputs?: boolean;
264
+ }
265
+ /**
266
+ * Override the provider and model used by an evaluator (set via `BaseEvaluatorConfig.modelOverride`).
267
+ *
268
+ * When set, all LLM calls use this provider and model instead of the defaults.
269
+ * The evaluator's normal key requirements are bypassed — provide the key for
270
+ * the chosen provider via the matching top-level config field
271
+ * (e.g. `anthropicApiKey` for `Provider.Anthropic`).
272
+ *
273
+ * Both `provider` and `model` are required. An empty or missing `model` throws
274
+ * `ConfigurationError` at construction time. An unrecognised model ID throws
275
+ * `ConfigurationError` only later, at evaluation time, when the provider rejects it.
276
+ *
277
+ * Results may vary; evaluators are validated against their recommended models.
278
+ */
279
+ interface ModelOverride {
280
+ provider: Provider;
281
+ model: string;
282
+ }
283
+ /**
284
+ * Base configuration for all evaluators; every field is optional at the type level
285
+ */
286
+ interface BaseEvaluatorConfig {
287
+ /** Google API key (for evaluators using Gemini) */
288
+ googleApiKey?: string;
289
+ /** OpenAI API key (for evaluators using GPT) */
290
+ openaiApiKey?: string;
291
+ /** Anthropic API key (for evaluators using Claude) */
292
+ anthropicApiKey?: string;
293
+ /** Learning Commons partner key for authenticated telemetry (optional) */
294
+ partnerKey?: string;
295
+ /**
296
+ * Override the provider and model used by this evaluator.
297
+ * When set, all LLM calls use this provider and model instead of the defaults.
298
+ * See {@link ModelOverride} for details.
299
+ */
300
+ modelOverride?: ModelOverride;
301
+ /**
302
+ * Maximum number of retries for failed API calls (default: 2)
303
+ * Set to 0 to disable retries.
304
+ *
305
+ * Note: With maxRetries=2, a failed call will be attempted up to 3 times total
306
+ * (1 initial attempt + 2 retries)
307
+ */
308
+ maxRetries?: number;
309
+ /**
310
+ * Telemetry configuration (default: all enabled; NOTE(review): presumably recordInputs still stays at its documented default of false - confirm)
311
+ *
312
+ * Can be:
313
+ * - `true`: Enable with defaults (recordInputs: false)
314
+ * - `false`: Disable completely
315
+ * - `TelemetryOptions`: Granular control
316
+ */
317
+ telemetry?: boolean | TelemetryOptions;
318
+ /**
319
+ * Custom logger implementation (optional)
320
+ * If not provided, uses console logger with specified logLevel
321
+ */
322
+ logger?: Logger;
323
+ /**
324
+ * Log level for default console logger (default: WARN)
325
+ * Only used if custom logger is not provided
326
+ *
327
+ * - DEBUG: Very verbose, shows all operations
328
+ * - INFO: Normal operations
329
+ * - WARN: Warnings only (default)
330
+ * - ERROR: Errors only
331
+ * - SILENT: No logging
332
+ */
333
+ logLevel?: LogLevel;
334
+ }
335
+ /**
336
+ * Evaluator metadata interface
337
+ * Each evaluator must provide this as its static `metadata` property; all fields are readonly
338
+ */
339
+ interface EvaluatorMetadata {
340
+ /** Unique identifier for the evaluator (e.g., 'vocabulary', 'sentence-structure') */
341
+ readonly id: string;
342
+ /** Human-readable name (e.g., 'Vocabulary', 'Sentence Structure') */
343
+ readonly name: string;
344
+ /** Brief description of what the evaluator does */
345
+ readonly description: string;
346
+ /** Supported grade levels, as strings (e.g., ['3', '4', '5', ...]) */
347
+ readonly supportedGrades: readonly string[];
348
+ /** Providers required by this evaluator's default configuration */
349
+ readonly defaultProviders: readonly Provider[];
350
+ }
351
+ /**
352
+ * Abstract base class for all evaluators
353
+ *
354
+ * Provides common functionality:
355
+ * - Telemetry setup and event sending
356
+ * - Text validation
357
+ * - Grade validation (with overridable default)
358
+ * - Metadata creation
359
+ *
360
+ * Concrete evaluators must implement:
361
+ * - static metadata: Provide evaluator metadata (see EvaluatorMetadata interface)
362
+ */
363
+ declare abstract class BaseEvaluator {
364
+ protected telemetryClient?: TelemetryClient; // NOTE(review): presumably left undefined when telemetry is disabled - confirm
365
+ protected logger: Logger;
366
+ protected config: Required<Pick<BaseEvaluatorConfig, 'maxRetries'>> & {
367
+ telemetry: Required<TelemetryOptions>; // normalized form: both `enabled` and `recordInputs` are always present
368
+ modelOverride?: ModelOverride;
369
+ googleApiKey?: string;
370
+ openaiApiKey?: string;
371
+ anthropicApiKey?: string;
372
+ };
373
+ /**
374
+ * Static metadata for the evaluator
375
+ *
376
+ * Concrete evaluators MUST define this property.
377
+ *
378
+ * @example
379
+ * ```typescript
380
+ * class MyEvaluator extends BaseEvaluator {
381
+ * static readonly metadata = {
382
+ * id: 'my-evaluator',
383
+ * name: 'My Evaluator',
384
+ * description: 'Does something useful',
385
+ * supportedGrades: ['3', '4', '5'],
386
+ * defaultProviders: [Provider.Google],
387
+ * };
388
+ * }
389
+ * ```
390
+ */
391
+ static readonly metadata: EvaluatorMetadata;
392
+ /**
393
+ * @throws {ConfigurationError} If the subclass has not defined static metadata
394
+ * @throws {ConfigurationError} If modelOverride has an invalid provider or empty model
395
+ * @throws {ConfigurationError} If a required API key is missing
396
+ */
397
+ constructor(config: BaseEvaluatorConfig);
398
+ /**
399
+ * Get metadata for this evaluator instance (instance-level accessor; the data itself is declared as the static `metadata`)
400
+ * @throws {ConfigurationError} If the subclass has not defined static metadata
401
+ */
402
+ protected get metadata(): EvaluatorMetadata;
403
+ /**
404
+ * Validate modelOverride shape: provider must be a known Provider value and
405
+ * model must be a non-empty string.
406
+ * @throws {ConfigurationError} If the override is malformed
407
+ */
408
+ private validateModelOverride;
409
+ /**
410
+ * Validate that the required API key is present.
411
+ * When modelOverride is set, checks the override provider's key.
412
+ * Otherwise checks the keys required by the evaluator's default providers.
413
+ * @throws {ConfigurationError} If a required key is missing
414
+ */
415
+ private validateApiKeys;
416
+ /**
417
+ * Normalize telemetry config (boolean or TelemetryOptions on input) to standard format
418
+ */
419
+ private normalizeTelemetryConfig;
420
+ /**
421
+ * Get the evaluator type identifier from metadata
422
+ * @returns The evaluator type ID (e.g., "vocabulary", "sentence-structure")
423
+ */
424
+ protected getEvaluatorType(): string;
425
+ /**
426
+ * Validate text meets requirements
427
+ * Default implementation - can be overridden by concrete evaluators
428
+ *
429
+ * @throws {ValidationError} If text is invalid
430
+ */
431
+ protected validateText(text: string): void;
432
+ /**
433
+ * Validate grade is in supported range
434
+ * Default implementation - can be overridden by concrete evaluators
435
+ *
436
+ * @param grade - Grade level to validate
437
+ * @param validGrades - Set of valid grades for this evaluator
438
+ * @throws {ValidationError} If grade is invalid
439
+ */
440
+ protected validateGrade(grade: string, validGrades: Set<string>): void;
441
+ /**
442
+ * Create an LLM provider, honouring modelOverride if set (NOTE(review): presumably the default* arguments apply only when no override is set - confirm).
443
+ * When override is active, the key for the override provider is resolved
444
+ * from the matching top-level config field (e.g. anthropicApiKey for Anthropic).
445
+ */
446
+ protected createConfiguredProvider(defaultType: Provider, defaultModel: string, defaultApiKey: string | undefined): LLMProvider;
447
+ /**
448
+ * Send telemetry event to analytics service
449
+ * Common helper for all evaluators; takes camelCase params, whereas TelemetryEvent uses snake_case fields
450
+ */
451
+ protected sendTelemetry(params: {
452
+ status: 'success' | 'error'; // same union as EvaluationStatus
453
+ latencyMs: number;
454
+ textLength: number;
455
+ grade?: string;
456
+ provider: string;
457
+ errorCode?: string;
458
+ tokenUsage?: TokenUsage;
459
+ metadata?: TelemetryMetadata;
460
+ inputText?: string;
461
+ }): Promise<void>;
462
+ }
463
+
464
+ export { BaseEvaluator as B, type EvaluatorMetadata as E, type LLMProvider as L, type Message as M, Provider as P, type TelemetryOptions as T, type BaseEvaluatorConfig as a, type LLMRequest as b, type LLMResponse as c, type LogContext as d, LogLevel as e, type Logger as f, type ModelOverride as g, type ProviderConfig as h, Providers as i, type TextGenerationResponse as j };