@learning-commons/evaluators 0.3.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md ADDED
@@ -0,0 +1,52 @@
1
+ # Changelog
2
+
3
+ All notable changes to the `@learning-commons/evaluators` TypeScript SDK will be documented in this file.
4
+
5
+ ## [0.5.0](https://github.com/learning-commons-org/evaluators/compare/sdks-typescript-v0.4.0...sdks-typescript-v0.5.0) (2026-05-07)
6
+
7
+
8
+ ### Features
9
+
10
+ * **ts-sdk:** add modelOverride option to all evaluators ([#34](https://github.com/learning-commons-org/evaluators/issues/34)) ([c57c4fc](https://github.com/learning-commons-org/evaluators/commit/c57c4fc86bc56846afe92e6d451705642e399309))
11
+ * **ts-sdk:** Add Purpose evaluator ([#57](https://github.com/learning-commons-org/evaluators/issues/57)) ([8b6d715](https://github.com/learning-commons-org/evaluators/commit/8b6d715b49ba1911de35ccc1b6aeaef888289a1d))
12
+
13
+ ## [0.4.0] — 2026-03-23
14
+
15
+ ### Added
16
+
17
+ - **Batch CSV Evaluator** — CLI tool and programmatic API for evaluating multiple texts from a CSV file in parallel. Runs the `text-complexity` group (GLA, SMK, Vocabulary, Sentence Structure, and Conventionality) across up to 50 rows and produces CSV and HTML reports.
18
+
19
+ ---
20
+
21
+ ## [0.3.0] — 2026-03-20
22
+
23
+ ### Added
24
+
25
+ - **Conventionality Evaluator** — evaluates how explicit, literal, and straightforward a text's meaning is versus how abstract, ironic, figurative, or archaic it is, relative to grades 3–12.
26
+ - **Conventionality added to TextComplexityEvaluator** — composite evaluator now runs vocabulary, sentence structure, SMK, and conventionality in parallel; result includes `conventionality` key.
27
+
28
+ ---
29
+
30
+ ## [0.2.0] — 2026-03-18
31
+
32
+ ### Added
33
+
34
+ - **Subject Matter Knowledge (SMK) Evaluator** — evaluates background knowledge demands of educational texts relative to grades 3–12.
35
+ - **SMK added to TextComplexityEvaluator** — composite evaluator now runs vocabulary, sentence structure, and SMK in parallel; result includes `subjectMatterKnowledge` key.
36
+ - **Prompt versioning** — prompts updated to v1.3.0 (`evals/prompts/subject-matter-knowledge/`).
37
+
38
+ ---
39
+
40
+ ## [0.1.0] — Early Release
41
+
42
+ Initial early release of the TypeScript SDK for Learning Commons educational evaluators.
43
+
44
+ ### Added
45
+
46
+ - **Vocabulary Evaluator** — grades 3–12 vocabulary difficulty assessment.
47
+ - **Sentence Structure Evaluator** — syntactic complexity analysis by grade level.
48
+ - **Grade Level Appropriateness (GLA) Evaluator** — overall grade-level suitability scoring.
49
+ - **Text Complexity Evaluator** — composite evaluation combining Vocabulary, Sentence Structure, and GLA.
50
+ - **Provider abstraction** — model-agnostic via Vercel AI SDK; OpenAI, Google, and Anthropic supported.
51
+ - **Telemetry** — opt-in, with `partnerKey` and `recordInputs` (defaults to `false`).
52
+ - **Prompt versioning** — prompts versioned in `evals/prompts/` (v1.2.0), shared with Python notebooks.
package/README.md CHANGED
@@ -1,5 +1,7 @@
1
1
  # @learning-commons/evaluators
2
2
 
3
+ [![npm version](https://img.shields.io/npm/v/@learning-commons/evaluators)](https://www.npmjs.com/package/@learning-commons/evaluators)
4
+
3
5
  TypeScript SDK for Learning Commons educational text complexity evaluators.
4
6
 
5
7
  ## Installation
@@ -27,7 +29,7 @@ const evaluator = new VocabularyEvaluator({
27
29
  });
28
30
 
29
31
  const result = await evaluator.evaluate("Your text here", "5");
30
- console.log(result.score); // "moderately complex"
32
+ console.log(result.score); // "Moderately complex"
31
33
  ```
32
34
 
33
35
  ---
@@ -36,7 +38,7 @@ console.log(result.score); // "moderately complex"
36
38
 
37
39
  ### 1. Vocabulary Evaluator
38
40
 
39
- Evaluates vocabulary complexity using the Qual Text Complexity rubric (SAP).
41
+ Evaluates vocabulary complexity using the Qualitative Text Complexity rubric (SAP).
40
42
 
41
43
  **Supported Grades:** 3-12
42
44
 
@@ -45,12 +47,13 @@ Evaluates vocabulary complexity using the Qual Text Complexity rubric (SAP).
45
47
  **Constructor:**
46
48
  ```typescript
47
49
  const evaluator = new VocabularyEvaluator({
48
- googleApiKey?: string; // Google API key (required by this evaluator)
49
- openaiApiKey?: string; // OpenAI API key (required by this evaluator)
50
- maxRetries?: number; // Optional - Max retry attempts (default: 2)
51
- telemetry?: boolean | TelemetryOptions; // Optional (default: true)
52
- logger?: Logger; // Optional - Custom logger
53
- logLevel?: LogLevel; // Optional - SILENT | ERROR | WARN | INFO | DEBUG (default: WARN)
50
+ googleApiKey: string; // Google API key
51
+ openaiApiKey: string; // OpenAI API key
52
+ modelOverride?: ModelOverride; // Override the default provider and model
53
+ maxRetries?: number; // Max retry attempts (default: 2)
54
+ telemetry?: boolean | TelemetryOptions; // Telemetry settings (default: enabled)
55
+ logger?: Logger; // Custom logger
56
+ logLevel?: LogLevel; // Log verbosity (default: WARN)
54
57
  });
55
58
  ```
56
59
 
@@ -62,13 +65,13 @@ await evaluator.evaluate(text: string, grade: string)
62
65
  **Returns:**
63
66
  ```typescript
64
67
  {
65
- score: 'slightly complex' | 'moderately complex' | 'very complex' | 'exceedingly complex';
68
+ score: 'Slightly complex' | 'Moderately complex' | 'Very complex' | 'Exceedingly complex';
66
69
  reasoning: string;
67
70
  metadata: {
68
71
  model: string;
69
72
  processingTimeMs: number;
70
73
  };
71
- _internal: VocabularyComplexity; // Detailed analysis
74
+ _internal: VocabularyInternal; // Detailed analysis
72
75
  }
73
76
  ```
74
77
 
@@ -85,11 +88,12 @@ Evaluates sentence structure complexity based on grammatical features.
85
88
  **Constructor:**
86
89
  ```typescript
87
90
  const evaluator = new SentenceStructureEvaluator({
88
- openaiApiKey?: string; // OpenAI API key (required by this evaluator)
89
- maxRetries?: number; // Optional - Max retry attempts (default: 2)
90
- telemetry?: boolean | TelemetryOptions; // Optional (default: true)
91
- logger?: Logger; // Optional - Custom logger
92
- logLevel?: LogLevel; // Optional - Logging verbosity (default: WARN)
91
+ openaiApiKey: string; // OpenAI API key
92
+ modelOverride?: ModelOverride; // Override the default provider and model
93
+ maxRetries?: number; // Max retry attempts (default: 2)
94
+ telemetry?: boolean | TelemetryOptions; // Telemetry settings (default: enabled)
95
+ logger?: Logger; // Custom logger
96
+ logLevel?: LogLevel; // Log verbosity (default: WARN)
93
97
  });
94
98
  ```
95
99
 
@@ -101,7 +105,7 @@ await evaluator.evaluate(text: string, grade: string)
101
105
  **Returns:**
102
106
  ```typescript
103
107
  {
104
- score: 'Slightly Complex' | 'Moderately Complex' | 'Very Complex' | 'Exceedingly Complex';
108
+ score: 'Slightly complex' | 'Moderately complex' | 'Very complex' | 'Exceedingly complex';
105
109
  reasoning: string;
106
110
  metadata: {
107
111
  model: string;
@@ -128,11 +132,12 @@ Evaluates the background knowledge demands of educational texts relative to grad
128
132
  **Constructor:**
129
133
  ```typescript
130
134
  const evaluator = new SmkEvaluator({
131
- googleApiKey?: string; // Google API key (required by this evaluator)
132
- maxRetries?: number; // Optional - Max retry attempts (default: 2)
133
- telemetry?: boolean | TelemetryOptions; // Optional (default: true)
134
- logger?: Logger; // Optional - Custom logger
135
- logLevel?: LogLevel; // Optional - Logging verbosity (default: WARN)
135
+ googleApiKey: string; // Google API key
136
+ modelOverride?: ModelOverride; // Override the default provider and model
137
+ maxRetries?: number; // Max retry attempts (default: 2)
138
+ telemetry?: boolean | TelemetryOptions; // Telemetry settings (default: enabled)
139
+ logger?: Logger; // Custom logger
140
+ logLevel?: LogLevel; // Log verbosity (default: WARN)
136
141
  });
137
142
  ```
138
143
 
@@ -191,11 +196,12 @@ Evaluates how explicit, literal, and straightforward a text's meaning is versus
191
196
  **Constructor:**
192
197
  ```typescript
193
198
  const evaluator = new ConventionalityEvaluator({
194
- googleApiKey?: string; // Google API key (required by this evaluator)
195
- maxRetries?: number; // Optional - Max retry attempts (default: 2)
196
- telemetry?: boolean | TelemetryOptions; // Optional (default: true)
197
- logger?: Logger; // Optional - Custom logger
198
- logLevel?: LogLevel; // Optional - Logging verbosity (default: WARN)
199
+ googleApiKey: string; // Google API key
200
+ modelOverride?: ModelOverride; // Override the default provider and model
201
+ maxRetries?: number; // Max retry attempts (default: 2)
202
+ telemetry?: boolean | TelemetryOptions; // Telemetry settings (default: enabled)
203
+ logger?: Logger; // Custom logger
204
+ logLevel?: LogLevel; // Log verbosity (default: WARN)
199
205
  });
200
206
  ```
201
207
 
@@ -248,17 +254,18 @@ Composite evaluator that analyzes vocabulary, sentence structure, subject matter
248
254
 
249
255
  **Supported Grades:** 3-12
250
256
 
251
- **Uses:** Google Gemini 2.5 Pro + Google Gemini 3 Flash Preview + OpenAI GPT-4o (composite)
257
+ **Uses:** Google Gemini 2.5 Pro + Google Gemini 3 Flash Preview + OpenAI GPT-4o + OpenAI GPT-4.1 (composite)
252
258
 
253
259
  **Constructor:**
254
260
  ```typescript
255
261
  const evaluator = new TextComplexityEvaluator({
256
- googleApiKey?: string; // Google API key (required by this evaluator)
257
- openaiApiKey?: string; // OpenAI API key (required by this evaluator)
258
- maxRetries?: number; // Optional - Max retry attempts (default: 2)
259
- telemetry?: boolean | TelemetryOptions; // Optional (default: true)
260
- logger?: Logger; // Optional - Custom logger
261
- logLevel?: LogLevel; // Optional - Logging verbosity (default: WARN)
262
+ googleApiKey: string; // Google API key
263
+ openaiApiKey: string; // OpenAI API key
264
+ modelOverride?: ModelOverride; // Override the default provider and model for all sub-evaluators
265
+ maxRetries?: number; // Max retry attempts (default: 2)
266
+ telemetry?: boolean | TelemetryOptions; // Telemetry settings (default: enabled)
267
+ logger?: Logger; // Custom logger
268
+ logLevel?: LogLevel; // Log verbosity (default: WARN)
262
269
  });
263
270
  ```
264
271
 
@@ -317,11 +324,12 @@ Determines appropriate grade level for text.
317
324
  **Constructor:**
318
325
  ```typescript
319
326
  const evaluator = new GradeLevelAppropriatenessEvaluator({
320
- googleApiKey?: string; // Google API key (required by this evaluator)
321
- maxRetries?: number; // Optional - Max retry attempts (default: 2)
322
- telemetry?: boolean | TelemetryOptions; // Optional (default: true)
323
- logger?: Logger; // Optional - Custom logger
324
- logLevel?: LogLevel; // Optional - Logging verbosity (default: WARN)
327
+ googleApiKey: string; // Google API key
328
+ modelOverride?: ModelOverride; // Override the default provider and model
329
+ maxRetries?: number; // Max retry attempts (default: 2)
330
+ telemetry?: boolean | TelemetryOptions; // Telemetry settings (default: enabled)
331
+ logger?: Logger; // Custom logger
332
+ logLevel?: LogLevel; // Log verbosity (default: WARN)
325
333
  });
326
334
  ```
327
335
 
@@ -350,6 +358,98 @@ await evaluator.evaluate(text: string)
350
358
 
351
359
  ---
352
360
 
361
+ ### 7. Purpose Evaluator
362
+
363
+ Evaluates the Purpose dimension of qualitative text complexity — how explicitly the text's purpose is stated versus implied, and how that affects comprehension demands for the target grade level.
364
+
365
+ **Supported Grades:** 3-12
366
+
367
+ **Uses:** Google Gemini 3 Flash Preview
368
+
369
+ **Constructor:**
370
+ ```typescript
371
+ const evaluator = new PurposeEvaluator({
372
+ googleApiKey: string; // Google API key (required by this evaluator)
373
+ modelOverride?: ModelOverride; // Override the default provider and model
374
+ maxRetries?: number; // Max retry attempts (default: 2)
375
+ telemetry?: boolean | TelemetryOptions; // Telemetry settings (default: enabled)
376
+ logger?: Logger; // Custom logger
377
+ logLevel?: LogLevel; // Log verbosity (default: WARN)
378
+ });
379
+ ```
380
+
381
+ **API:**
382
+ ```typescript
383
+ await evaluator.evaluate(text: string, grade: string)
384
+ ```
385
+
386
+ **Returns:**
387
+ ```typescript
388
+ {
389
+ score: 'Slightly complex' | 'Moderately complex' | 'Very complex' | 'Exceedingly complex' | 'More context needed';
390
+ reasoning: string;
391
+ metadata: {
392
+ model: string;
393
+ processingTimeMs: number;
394
+ };
395
+ _internal: {
396
+ complexity_score: 'slightly_complex' | 'moderately_complex' | 'very_complex' | 'exceedingly_complex' | 'more_context_needed';
397
+ reasoning: string;
398
+ details: {
399
+ detailed_summary: Array<{
400
+ factor: string;
401
+ description: string;
402
+ effect_on_complexity_dimension: string;
403
+ }>;
404
+ adjustment_and_scaffolding: Array<{
405
+ scaffolding_need: string;
406
+ suggestion: string;
407
+ }>;
408
+ recommended_use_cases: Array<{
409
+ opportunity: string;
410
+ suggestion: string;
411
+ }>;
412
+ };
413
+ };
414
+ }
415
+ ```
416
+
417
+ > **Note:** The `'More context needed'` score is returned when the text on its own is insufficient to determine complexity.
418
+
419
+ **Example:**
420
+ ```typescript
421
+ import { PurposeEvaluator } from '@learning-commons/evaluators';
422
+
423
+ const evaluator = new PurposeEvaluator({
424
+ googleApiKey: process.env.GOOGLE_API_KEY,
425
+ });
426
+
427
+ const result = await evaluator.evaluate(
428
+ "The author argues that renewable energy is the only viable solution to climate change.",
429
+ "9"
430
+ );
431
+ console.log(result.score); // "Moderately complex"
432
+ console.log(result.reasoning);
433
+ console.log(result._internal.details.adjustment_and_scaffolding);
434
+ ```
435
+
436
+ ---
437
+
438
+ ## Batch CSV Evaluation
439
+
440
+ For evaluating many texts at once, the SDK ships a CLI tool that reads a CSV file, runs all evaluators in a group, and produces CSV and HTML reports.
441
+
442
+ ```bash
443
+ # Run from the directory containing your CSV
444
+ npx evaluators-batch
445
+ ```
446
+
447
+ The CLI prompts for your CSV path, API keys, and output directory, then processes all rows in parallel and displays real-time progress.
448
+
449
+ See [`src/batch/README.md`](./src/batch/README.md) for full documentation.
450
+
451
+ ---
452
+
353
453
  ## Error Handling
354
454
 
355
455
  The SDK provides specific error types to help you handle different scenarios:
@@ -384,6 +484,9 @@ try {
384
484
  } else if (error instanceof NetworkError) {
385
485
  // Network connectivity issues
386
486
  console.error('Network error:', error.message);
487
+ } else if (error instanceof TimeoutError) {
488
+ // Request timed out
489
+ console.error('Timeout:', error.message);
387
490
  } else if (error instanceof APIError) {
388
491
  // Other API errors
389
492
  console.error('API error:', error.message, 'Status:', error.statusCode);
@@ -428,6 +531,44 @@ const evaluator = new VocabularyEvaluator({
428
531
 
429
532
  ---
430
533
 
534
+ ## Model Override
535
+
536
+ By default, each evaluator uses a recommended provider and model tuned for that task. You can override this with any supported provider — OpenAI, Google, or Anthropic — using the `modelOverride` option.
537
+
538
+ When `modelOverride` is set:
539
+ - All LLM calls within the evaluator use the specified provider and model
540
+ - Only the API key for the override provider is required (e.g. `anthropicApiKey` when using `Provider.Anthropic`); default provider keys are not validated
541
+ - A warning is logged to indicate results may differ from the defaults
542
+ - Telemetry records `model_override: true` so override usage is tracked separately
543
+
544
+ **Validation:** The SDK validates `modelOverride` at construction time and throws `ConfigurationError` if:
545
+ - `provider` is not one of the supported `Provider` values (`openai`, `google`, `anthropic`)
546
+ - `model` is empty or blank — no default is assumed; you must always specify the model ID explicitly
547
+ - The API key for the chosen provider is missing
548
+
549
+ If the model ID passes construction-time validation (that is, it is non-empty) but doesn't exist on the provider's API, `ConfigurationError` is thrown when `evaluate()` is called.
550
+
551
+ ```typescript
552
+ import { VocabularyEvaluator, Provider } from '@learning-commons/evaluators';
553
+
554
+ const evaluator = new VocabularyEvaluator({
555
+ anthropicApiKey: process.env.ANTHROPIC_API_KEY,
556
+ modelOverride: {
557
+ provider: Provider.Anthropic,
558
+ model: 'claude-sonnet-4-6',
559
+ },
560
+ });
561
+
562
+ const result = await evaluator.evaluate("Your text here", "5");
563
+ console.log(result.metadata.model); // "anthropic:claude-sonnet-4-6"
564
+ ```
565
+
566
+ If you haven't set up the provider adapter yet, see the [Installation](#installation) section.
567
+
568
+ > **Note:** Evaluators are validated and quality-tested against their default models. Results with other models may vary. Check `result.metadata.model` to confirm which model was used.
569
+
570
+ ---
571
+
431
572
  ## Telemetry & Privacy
432
573
 
433
574
  See [docs/telemetry.md](./docs/telemetry.md) for telemetry configuration and privacy information.
@@ -440,13 +581,15 @@ All evaluators use the same `BaseEvaluatorConfig` interface:
440
581
 
441
582
  ```typescript
442
583
  interface BaseEvaluatorConfig {
443
- googleApiKey?: string; // Google API key (required by some evaluators)
444
- openaiApiKey?: string; // OpenAI API key (required by some evaluators)
445
- maxRetries?: number; // Max API retry attempts (default: 2)
446
- telemetry?: boolean | TelemetryOptions; // Telemetry config (default: true)
447
- logger?: Logger; // Custom logger (optional)
448
- logLevel?: LogLevel; // Console log level (default: WARN)
449
- partnerKey?: string; // Learning Commons partner key for authenticated telemetry (optional)
584
+ googleApiKey?: string; // Google API key (required by some evaluators)
585
+ openaiApiKey?: string; // OpenAI API key (required by some evaluators)
586
+ anthropicApiKey?: string; // Anthropic API key (required if an evaluator defaults to Claude or when `modelOverride` uses `Provider.Anthropic`)
587
+ modelOverride?: ModelOverride; // Override the provider and model (see Model Override section)
588
+ maxRetries?: number; // Max retry attempts (default: 2)
589
+ telemetry?: boolean | TelemetryOptions; // Telemetry settings (default: enabled)
590
+ logger?: Logger; // Custom logger
591
+ logLevel?: LogLevel; // Log verbosity (default: WARN)
592
+ partnerKey?: string; // Learning Commons partner key for authenticated telemetry
450
593
  }
451
594
  ```
452
595
 
@@ -457,6 +600,9 @@ interface BaseEvaluatorConfig {
457
600
  - **Conventionality**: Requires `googleApiKey` only
458
601
  - **Text Complexity**: Requires both `googleApiKey` and `openaiApiKey`
459
602
  - **Grade Level Appropriateness**: Requires `googleApiKey` only
603
+ - **Purpose**: Requires `googleApiKey` only
604
+
605
+ When `modelOverride` is set, the default key requirements are bypassed — only the key for the override provider is required (e.g. `anthropicApiKey` when using `Provider.Anthropic`).
460
606
 
461
607
  ---
462
608