@learning-commons/evaluators 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md ADDED
@@ -0,0 +1,44 @@
1
+ # Changelog
2
+
3
+ All notable changes to the `@learning-commons/evaluators` TypeScript SDK will be documented in this file.
4
+
5
+ ## [0.4.0] — 2026-03-23
6
+
7
+ ### Added
8
+
9
+ - **Batch CSV Evaluator** — CLI tool and programmatic API for evaluating multiple texts from a CSV file in parallel. Runs the `text-complexity` group (GLA, SMK, Vocabulary, Sentence Structure, and Conventionality) across up to 50 rows and produces CSV and HTML reports.
10
+
11
+ ---
12
+
13
+ ## [0.3.0] — 2026-03-20
14
+
15
+ ### Added
16
+
17
+ - **Conventionality Evaluator** — evaluates how explicit, literal, and straightforward a text's meaning is versus how abstract, ironic, figurative, or archaic it is, relative to grades 3–12.
18
+ - **Conventionality added to TextComplexityEvaluator** — composite evaluator now runs vocabulary, sentence structure, SMK, and conventionality in parallel; result includes `conventionality` key.
19
+
20
+ ---
21
+
22
+ ## [0.2.0] — 2026-03-18
23
+
24
+ ### Added
25
+
26
+ - **Subject Matter Knowledge (SMK) Evaluator** — evaluates background knowledge demands of educational texts relative to grades 3–12.
27
+ - **SMK added to TextComplexityEvaluator** — composite evaluator now runs vocabulary, sentence structure, and SMK in parallel; result includes `subjectMatterKnowledge` key.
28
+ - **Prompt versioning** — prompts updated to v1.3.0 (`evals/prompts/subject-matter-knowledge/`).
29
+
30
+ ---
31
+
32
+ ## [0.1.0] — Early Release
33
+
34
+ Initial early release of the TypeScript SDK for Learning Commons educational evaluators.
35
+
36
+ ### Added
37
+
38
+ - **Vocabulary Evaluator** — grades 3–12 vocabulary difficulty assessment.
39
+ - **Sentence Structure Evaluator** — syntactic complexity analysis by grade level.
40
+ - **Grade Level Appropriateness (GLA) Evaluator** — overall grade-level suitability scoring.
41
+ - **Text Complexity Evaluator** — composite evaluation combining Vocabulary, Sentence Structure, and GLA.
42
+ - **Provider abstraction** — model-agnostic via Vercel AI SDK; OpenAI, Google, and Anthropic supported.
43
+ - **Telemetry** — configurable, with `partnerKey` support; recording of input text is opt-in via `recordInputs` (defaults to `false`).
44
+ - **Prompt versioning** — prompts versioned in `evals/prompts/` (v1.2.0), shared with Python notebooks.
package/README.md CHANGED
@@ -180,9 +180,71 @@ console.log(result._internal.identified_topics); // ["hydraulics", "propulsion",
180
180
 
181
181
  ---
182
182
 
183
- ### 4. Text Complexity Evaluator
183
+ ### 4. Conventionality Evaluator
184
184
 
185
- Composite evaluator that analyzes vocabulary, sentence structure, and subject matter knowledge complexity in parallel.
185
+ Evaluates how explicit, literal, and straightforward a text's meaning is versus how abstract, ironic, figurative, or archaic it is for the target grade level. Based on the Common Core Qualitative Text Complexity Rubric.
186
+
187
+ **Supported Grades:** 3-12
188
+
189
+ **Uses:** Google Gemini 3 Flash Preview
190
+
191
+ **Constructor:**
192
+ ```typescript
193
+ const evaluator = new ConventionalityEvaluator({
194
+ googleApiKey?: string; // Required - Google API key (typed as optional on the shared base config, but this evaluator needs it)
195
+ maxRetries?: number; // Optional - Max retry attempts (default: 2)
196
+ telemetry?: boolean | TelemetryOptions; // Optional (default: true)
197
+ logger?: Logger; // Optional - Custom logger
198
+ logLevel?: LogLevel; // Optional - Logging verbosity (default: WARN)
199
+ });
200
+ ```
201
+
202
+ **API:**
203
+ ```typescript
204
+ await evaluator.evaluate(text: string, grade: string)
205
+ ```
206
+
207
+ **Returns:**
208
+ ```typescript
209
+ {
210
+ score: 'Slightly complex' | 'Moderately complex' | 'Very complex' | 'Exceedingly complex';
211
+ reasoning: string;
212
+ metadata: {
213
+ model: string;
214
+ processingTimeMs: number;
215
+ };
216
+ _internal: {
217
+ conventionality_features: string[];
218
+ grade_context: string;
219
+ instructional_insights: string;
220
+ complexity_score: 'Slightly complex' | 'Moderately complex' | 'Very complex' | 'Exceedingly complex';
221
+ reasoning: string;
222
+ };
223
+ }
224
+ ```
225
+
226
+ **Example:**
227
+ ```typescript
228
+ import { ConventionalityEvaluator } from '@learning-commons/evaluators';
229
+
230
+ const evaluator = new ConventionalityEvaluator({
231
+ googleApiKey: process.env.GOOGLE_API_KEY,
232
+ });
233
+
234
+ const result = await evaluator.evaluate(
235
+ "The author uses sustained irony to critique societal norms throughout the passage.",
236
+ "10"
237
+ );
238
+ console.log(result.score); // "Very complex"
239
+ console.log(result.reasoning);
240
+ console.log(result._internal.conventionality_features); // ["sustained irony", ...]
241
+ ```
242
+
243
+ ---
244
+
245
+ ### 5. Text Complexity Evaluator
246
+
247
+ Composite evaluator that analyzes vocabulary, sentence structure, subject matter knowledge, and conventionality complexity in parallel.
186
248
 
187
249
  **Supported Grades:** 3-12
188
250
 
@@ -211,10 +273,11 @@ await evaluator.evaluate(text: string, grade: string)
211
273
  vocabulary: EvaluationResult<TextComplexityLevel> | { error: Error };
212
274
  sentenceStructure: EvaluationResult<TextComplexityLevel> | { error: Error };
213
275
  subjectMatterKnowledge: EvaluationResult<TextComplexityLevel> | { error: Error };
276
+ conventionality: EvaluationResult<TextComplexityLevel> | { error: Error };
214
277
  }
215
278
  ```
216
279
 
217
- Each sub-evaluator result is either a full `EvaluationResult` or `{ error: Error }` if that evaluator failed. An error is only thrown if all three fail.
280
+ Each sub-evaluator result is either a full `EvaluationResult` or `{ error: Error }` if that evaluator failed. The call itself throws only if all four sub-evaluators fail.
218
281
 
219
282
  **Example:**
220
283
  ```typescript
@@ -236,11 +299,14 @@ if (!('error' in result.sentenceStructure)) {
236
299
  if (!('error' in result.subjectMatterKnowledge)) {
237
300
  console.log('Subject matter knowledge:', result.subjectMatterKnowledge.score);
238
301
  }
302
+ if (!('error' in result.conventionality)) {
303
+ console.log('Conventionality:', result.conventionality.score);
304
+ }
239
305
  ```
240
306
 
241
307
  ---
242
308
 
243
- ### 5. Grade Level Appropriateness Evaluator
309
+ ### 6. Grade Level Appropriateness Evaluator
244
310
 
245
311
  Determines appropriate grade level for text.
246
312
 
@@ -284,6 +350,21 @@ await evaluator.evaluate(text: string)
284
350
 
285
351
  ---
286
352
 
353
+ ## Batch CSV Evaluation
354
+
355
+ For evaluating many texts at once, the SDK ships a CLI tool that reads a CSV file, runs all evaluators in a group, and produces CSV and HTML reports.
356
+
357
+ ```bash
358
+ # Run from the directory containing your CSV
359
+ npx evaluators-batch
360
+ ```
361
+
362
+ The CLI will prompt for your CSV path, API keys, and output directory, then process all rows in parallel with real-time progress.
363
+
364
+ See [`src/batch/README.md`](./src/batch/README.md) for full documentation.
365
+
366
+ ---
367
+
287
368
  ## Error Handling
288
369
 
289
370
  The SDK provides specific error types to help you handle different scenarios:
@@ -388,6 +469,7 @@ interface BaseEvaluatorConfig {
388
469
  - **Vocabulary**: Requires both `googleApiKey` and `openaiApiKey`
389
470
  - **Sentence Structure**: Requires `openaiApiKey` only
390
471
  - **Subject Matter Knowledge**: Requires `googleApiKey` only
472
+ - **Conventionality**: Requires `googleApiKey` only
391
473
  - **Text Complexity**: Requires both `googleApiKey` and `openaiApiKey`
392
474
  - **Grade Level Appropriateness**: Requires `googleApiKey` only
393
475
 
@@ -0,0 +1,331 @@
1
+ /**
2
+ * Logging interface for the Evaluators SDK
3
+ *
4
+ * Provides structured logging with verbosity levels.
5
+ * Users can inject custom loggers or use the default console logger.
6
+ */
7
+ /**
8
+ * Log levels, ordered from most verbose (DEBUG = 0) to least verbose (SILENT = 4)
9
+ */
10
+ declare enum LogLevel {
11
+ /** Debug messages - very verbose, for development */
12
+ DEBUG = 0,
13
+ /** Informational messages - normal operations */
14
+ INFO = 1,
15
+ /** Warning messages - potentially problematic situations */
16
+ WARN = 2,
17
+ /** Error messages - errors that need attention */
18
+ ERROR = 3,
19
+ /** Silent - no logging */
20
+ SILENT = 4
21
+ }
22
+ /**
23
+ * Optional structured context passed as the second argument to each Logger method
24
+ */
25
+ interface LogContext {
26
+ /** Evaluator type (vocabulary, sentence-structure, etc.) */
27
+ evaluator?: string;
28
+ /** Current operation or stage */
29
+ operation?: string;
30
+ /** Error object if applicable */
31
+ error?: Error;
32
+ /** Additional metadata: any other string key is accepted, with a value typed as unknown */
33
+ [key: string]: unknown;
34
+ }
35
+ /**
36
+ * Logger interface: one method per level (debug, info, warn, error), each taking a message and an optional LogContext
37
+ *
38
+ * Implement this interface to provide custom logging behavior.
39
+ *
40
+ * @example
41
+ * ```typescript
42
+ * const customLogger: Logger = {
43
+ * debug: (msg, ctx) => myLogger.debug(msg, ctx),
44
+ * info: (msg, ctx) => myLogger.info(msg, ctx),
45
+ * warn: (msg, ctx) => myLogger.warn(msg, ctx),
46
+ * error: (msg, ctx) => myLogger.error(msg, ctx),
47
+ * };
48
+ *
49
+ * const evaluator = new VocabularyEvaluator({
50
+ * googleApiKey: '...',
51
+ * openaiApiKey: '...',
52
+ * logger: customLogger,
53
+ * logLevel: LogLevel.INFO, // NOTE(review): BaseEvaluatorConfig documents logLevel as only used when no custom logger is provided - confirm it has any effect alongside logger
54
+ * });
55
+ * ```
56
+ */
57
+ interface Logger {
58
+ /**
59
+ * Log debug message
60
+ * Used for detailed debugging information
61
+ */
62
+ debug(message: string, context?: LogContext): void;
63
+ /**
64
+ * Log informational message
65
+ * Used for normal operations
66
+ */
67
+ info(message: string, context?: LogContext): void;
68
+ /**
69
+ * Log warning message
70
+ * Used for potentially problematic situations
71
+ */
72
+ warn(message: string, context?: LogContext): void;
73
+ /**
74
+ * Log error message
75
+ * Used for errors that need attention
76
+ */
77
+ error(message: string, context?: LogContext): void;
78
+ }
79
+
80
+ /**
81
+ * Outcome of an evaluation; used as the type of TelemetryEvent.status
82
+ */
83
+ type EvaluationStatus = 'success' | 'error';
84
+ /**
85
+ * Token usage metrics from LLM providers, split into input and output token counts
86
+ */
87
+ interface TokenUsage {
88
+ input_tokens: number;
89
+ output_tokens: number;
90
+ }
91
+ /**
92
+ * Per-stage details for multi-stage evaluations; reported as entries of TelemetryMetadata.stage_details
93
+ */
94
+ interface StageDetail {
95
+ /** Stage name (e.g., "background_knowledge", "complexity_evaluation") */
96
+ stage: string;
97
+ /** Provider used for this stage (e.g., "openai:gpt-4o") */
98
+ provider: string;
99
+ /** Total latency including all retries (ms) */
100
+ latency_ms: number;
101
+ /** Token usage aggregated across all attempts (optional) */
102
+ token_usage?: TokenUsage;
103
+ /**
104
+ * Whether schema validation failed (indicates prompt needs clearer instructions)
105
+ *
106
+ * TODO: Not currently tracked. Vercel AI SDK abstracts validation away.
107
+ * To implement: Add custom retry wrapper that catches validation errors.
108
+ */
109
+ schema_validation_failed?: boolean;
110
+ }
111
+ /**
112
+ * Extensible metadata for telemetry events; attached through the optional TelemetryEvent.metadata field
113
+ */
114
+ interface TelemetryMetadata {
115
+ /** Detailed breakdown by stage (for multi-stage evaluations); currently the only declared field */
116
+ stage_details?: StageDetail[];
117
+ }
118
+ /**
119
+ * Telemetry event payload, as accepted by TelemetryClient.send
120
+ */
121
+ interface TelemetryEvent {
122
+ timestamp: string; // string format is not specified here - presumably ISO 8601; TODO confirm
123
+ sdk_version: string;
124
+ evaluator_type: string;
125
+ grade?: string;
126
+ status: EvaluationStatus;
127
+ error_code?: string; // optional; presumably only set when status is error - TODO confirm
128
+ latency_ms: number; // milliseconds, per the _ms suffix
129
+ text_length_chars: number; // length of the evaluated text in characters, per the field name
130
+ provider: string;
131
+ token_usage?: TokenUsage;
132
+ metadata?: TelemetryMetadata;
133
+ input_text?: string; // optional; presumably only sent when recordInputs is enabled - TODO confirm against the sender
134
+ }
135
+ /**
136
+ * Configuration for telemetry client (the constructor argument of TelemetryClient)
137
+ */
138
+ interface TelemetryConfig {
139
+ /** Analytics service endpoint URL */
140
+ endpoint: string;
141
+ /** Learning Commons partner key (optional, sent as X-API-Key header) */
142
+ partnerKey?: string;
143
+ /** Client ID for anonymous tracking (persistent UUID from ~/.config/learning-commons/config.json) */
144
+ clientId: string;
145
+ /** Whether telemetry is enabled. Required on this interface; the documented default of true belongs to the optional TelemetryOptions.enabled */
146
+ enabled: boolean;
147
+ /** Logger instance (respects the SDK's configured log level and custom logger) */
148
+ logger: Logger;
149
+ }
150
+
151
+ /**
152
+ * Telemetry client for sending analytics events
153
+ *
154
+ * Fire-and-forget implementation that never blocks SDK operations.
155
+ * Errors are logged but don't fail evaluations.
156
+ */
157
+ declare class TelemetryClient {
158
+ private config; // declaration output omits the types of private members
159
+ private logger;
160
+ constructor(config: TelemetryConfig);
161
+ /**
162
+ * Send one telemetry event to the analytics service
163
+ *
164
+ * Fire-and-forget: Errors are logged but don't throw.
165
+ */
166
+ send(event: TelemetryEvent): Promise<void>;
167
+ }
168
+
169
+ /**
170
+ * Granular telemetry configuration options (the object form accepted by BaseEvaluatorConfig.telemetry)
171
+ */
172
+ interface TelemetryOptions {
173
+ /** Enable telemetry (default: true) */
174
+ enabled?: boolean;
175
+ /** Record input text in telemetry (default: false) */
176
+ recordInputs?: boolean;
177
+ }
178
+ /**
179
+ * Base configuration for all evaluators
180
+ */
181
+ interface BaseEvaluatorConfig {
182
+ /** Google API key (for evaluators using Gemini); optional in the type, needed by evaluators whose metadata sets requiresGoogleKey */
183
+ googleApiKey?: string;
184
+ /** OpenAI API key (for evaluators using GPT); optional in the type, needed by evaluators whose metadata sets requiresOpenAIKey */
185
+ openaiApiKey?: string;
186
+ /** Learning Commons partner key for authenticated telemetry (optional) */
187
+ partnerKey?: string;
188
+ /**
189
+ * Maximum number of retries for failed API calls (default: 2)
190
+ * Set to 0 to disable retries.
191
+ *
192
+ * Note: With maxRetries=2, a failed call will be attempted up to 3 times total
193
+ * (1 initial attempt + 2 retries)
194
+ */
195
+ maxRetries?: number;
196
+ /**
197
+ * Telemetry configuration (default: enabled, with input recording off)
198
+ *
199
+ * Can be:
200
+ * - `true`: Enable with defaults (recordInputs: false)
201
+ * - `false`: Disable completely
202
+ * - `TelemetryOptions`: Granular control
203
+ */
204
+ telemetry?: boolean | TelemetryOptions;
205
+ /**
206
+ * Custom logger implementation (optional)
207
+ * If not provided, uses console logger with specified logLevel
208
+ */
209
+ logger?: Logger;
210
+ /**
211
+ * Log level for default console logger (default: WARN)
212
+ * Only used if custom logger is not provided
213
+ *
214
+ * - DEBUG: Very verbose, shows all operations
215
+ * - INFO: Normal operations
216
+ * - WARN: Warnings only (default)
217
+ * - ERROR: Errors only
218
+ * - SILENT: No logging
219
+ */
220
+ logLevel?: LogLevel;
221
+ }
222
+ /**
223
+ * Evaluator metadata interface
224
+ * Each evaluator must provide an object of this shape as its static metadata property (see BaseEvaluator.metadata)
225
+ */
226
+ interface EvaluatorMetadata {
227
+ /** Unique identifier for the evaluator (e.g., 'vocabulary', 'sentence-structure') */
228
+ readonly id: string;
229
+ /** Human-readable name (e.g., 'Vocabulary', 'Sentence Structure') */
230
+ readonly name: string;
231
+ /** Brief description of what the evaluator does */
232
+ readonly description: string;
233
+ /** Supported grade levels (e.g., ['3', '4', '5', ...]) */
234
+ readonly supportedGrades: readonly string[];
235
+ /** Whether this evaluator requires a Google API key */
236
+ readonly requiresGoogleKey: boolean;
237
+ /** Whether this evaluator requires an OpenAI API key */
238
+ readonly requiresOpenAIKey: boolean;
239
+ }
240
+ /**
241
+ * Abstract base class for all evaluators
242
+ *
243
+ * Provides common functionality:
244
+ * - Telemetry setup and event sending
245
+ * - Text validation
246
+ * - Grade validation (with overridable default)
247
+ * - Metadata creation
248
+ *
249
+ * Concrete evaluators must define:
250
+ * - static metadata: Provide evaluator metadata (see EvaluatorMetadata interface)
251
+ */
252
+ declare abstract class BaseEvaluator {
253
+ protected telemetryClient?: TelemetryClient; // optional: may be undefined (presumably when telemetry is disabled - TODO confirm)
254
+ protected logger: Logger;
255
+ protected config: Required<Pick<BaseEvaluatorConfig, 'maxRetries'>> & { // resolved config: maxRetries and every TelemetryOptions field are non-optional here
256
+ telemetry: Required<TelemetryOptions>;
257
+ };
258
+ /**
259
+ * Static metadata for the evaluator
260
+ *
261
+ * Concrete evaluators MUST define this property.
262
+ *
263
+ * @example
264
+ * ```typescript
265
+ * class MyEvaluator extends BaseEvaluator {
266
+ * static readonly metadata = {
267
+ * id: 'my-evaluator',
268
+ * name: 'My Evaluator',
269
+ * description: 'Does something useful',
270
+ * supportedGrades: ['3', '4', '5'],
271
+ * requiresGoogleKey: true,
272
+ * requiresOpenAIKey: false,
273
+ * };
274
+ * }
275
+ * ```
276
+ */
277
+ static readonly metadata: EvaluatorMetadata;
278
+ constructor(config: BaseEvaluatorConfig);
279
+ /**
280
+ * Get metadata for this evaluator instance (instance-level accessor for the static metadata)
281
+ * @throws {ConfigurationError} If the subclass has not defined static metadata
282
+ */
283
+ protected get metadata(): EvaluatorMetadata;
284
+ /**
285
+ * Validate that required API keys are provided based on metadata
286
+ * (private member: its signature is omitted from this declaration output)
287
+ * @throws {ConfigurationError} If required API keys are missing
288
+ */
289
+ private validateApiKeys;
290
+ /**
291
+ * Normalize telemetry config to standard format (private member: signature omitted from this declaration output)
292
+ */
293
+ private normalizeTelemetryConfig;
294
+ /**
295
+ * Get the evaluator type identifier from metadata
296
+ * @returns The evaluator type ID (e.g., "vocabulary", "sentence-structure")
297
+ */
298
+ protected getEvaluatorType(): string;
299
+ /**
300
+ * Validate text meets requirements
301
+ * Default implementation - can be overridden by concrete evaluators
302
+ *
303
+ * @param text - Text to validate
304
+ * @throws {ValidationError} If text is invalid
305
+ */
306
+ protected validateText(text: string): void;
307
+ /**
308
+ * Validate grade is in supported range
309
+ * Default implementation - can be overridden by concrete evaluators
310
+ *
311
+ * @param grade - Grade level to validate
312
+ * @param validGrades - Set of valid grades for this evaluator
313
+ * @throws {ValidationError} If grade is invalid
314
+ */
315
+ protected validateGrade(grade: string, validGrades: Set<string>): void;
316
+ /**
317
+ * Send telemetry event to analytics service
318
+ * Common helper for all evaluators; params appear to mirror TelemetryEvent fields in camelCase - TODO confirm the mapping
319
+ */
320
+ protected sendTelemetry(params: {
321
+ status: 'success' | 'error';
322
+ latencyMs: number;
323
+ textLength: number;
324
+ grade?: string;
325
+ provider: string;
326
+ errorCode?: string;
327
+ tokenUsage?: TokenUsage;
328
+ metadata?: TelemetryMetadata;
329
+ inputText?: string;
330
+ }): Promise<void>;
331
+ }
330
+
331
+ export { BaseEvaluator as B, type EvaluatorMetadata as E, type Logger as L, type TelemetryOptions as T, type BaseEvaluatorConfig as a, type LogContext as b, LogLevel as c };