@learning-commons/evaluators 0.4.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +16 -0
- package/README.md +188 -45
- package/dist/{base-Ced9oKKa.d.cts → base-DKcAYXfb.d.cts} +142 -9
- package/dist/{base-Ced9oKKa.d.ts → base-DKcAYXfb.d.ts} +142 -9
- package/dist/batch/cli.js +635 -227
- package/dist/batch/cli.js.map +1 -1
- package/dist/batch/index.cjs +618 -218
- package/dist/batch/index.cjs.map +1 -1
- package/dist/batch/index.d.cts +3 -1
- package/dist/batch/index.d.ts +3 -1
- package/dist/batch/index.js +617 -218
- package/dist/batch/index.js.map +1 -1
- package/dist/index.cjs +626 -217
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +155 -86
- package/dist/index.d.ts +155 -86
- package/dist/index.js +622 -218
- package/dist/index.js.map +1 -1
- package/package.json +13 -4
- package/src/batch/README.md +14 -1
package/CHANGELOG.md
CHANGED
|
@@ -2,6 +2,22 @@
|
|
|
2
2
|
|
|
3
3
|
All notable changes to the `@learning-commons/evaluators` TypeScript SDK will be documented in this file.
|
|
4
4
|
|
|
5
|
+
## [0.6.0](https://github.com/learning-commons-org/evaluators/compare/sdks-typescript-v0.5.0...sdks-typescript-v0.6.0) (2026-05-22)
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
### Features
|
|
9
|
+
|
|
10
|
+
* **ts-sdk:** add bypassRowLimit option for batch evaluator ([#77](https://github.com/learning-commons-org/evaluators/issues/77)) ([902a60f](https://github.com/learning-commons-org/evaluators/commit/902a60fc934372a151f1d40c0b49ef3313d12609))
|
|
11
|
+
* **ts-sdk:** expose per-call token usage on EvaluationMetadata ([#59](https://github.com/learning-commons-org/evaluators/issues/59)) ([3c8fa0f](https://github.com/learning-commons-org/evaluators/commit/3c8fa0fd8e2389fc902c9cf1f63985b40d2e4b2c))
|
|
12
|
+
|
|
13
|
+
## [0.5.0](https://github.com/learning-commons-org/evaluators/compare/sdks-typescript-v0.4.0...sdks-typescript-v0.5.0) (2026-05-07)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
### Features
|
|
17
|
+
|
|
18
|
+
* **ts-sdk:** add modelOverride option to all evaluators ([#34](https://github.com/learning-commons-org/evaluators/issues/34)) ([c57c4fc](https://github.com/learning-commons-org/evaluators/commit/c57c4fc86bc56846afe92e6d451705642e399309))
|
|
19
|
+
* **ts-sdk:** Add Purpose evaluator ([#57](https://github.com/learning-commons-org/evaluators/issues/57)) ([8b6d715](https://github.com/learning-commons-org/evaluators/commit/8b6d715b49ba1911de35ccc1b6aeaef888289a1d))
|
|
20
|
+
|
|
5
21
|
## [0.4.0] — 2026-03-23
|
|
6
22
|
|
|
7
23
|
### Added
|
package/README.md
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
# @learning-commons/evaluators
|
|
2
2
|
|
|
3
|
+
[npm package](https://www.npmjs.com/package/@learning-commons/evaluators)
|
|
4
|
+
|
|
3
5
|
TypeScript SDK for Learning Commons educational text complexity evaluators.
|
|
4
6
|
|
|
5
7
|
## Installation
|
|
@@ -27,7 +29,7 @@ const evaluator = new VocabularyEvaluator({
|
|
|
27
29
|
});
|
|
28
30
|
|
|
29
31
|
const result = await evaluator.evaluate("Your text here", "5");
|
|
30
|
-
console.log(result.score); // "moderately complex"
|
|
32
|
+
console.log(result.score); // "Moderately complex"
|
|
31
33
|
```
|
|
32
34
|
|
|
33
35
|
---
|
|
@@ -36,7 +38,7 @@ console.log(result.score); // "moderately complex"
|
|
|
36
38
|
|
|
37
39
|
### 1. Vocabulary Evaluator
|
|
38
40
|
|
|
39
|
-
Evaluates vocabulary complexity using the Qual Text Complexity rubric (SAP).
|
|
41
|
+
Evaluates vocabulary complexity using the Qualitative Text Complexity rubric (SAP).
|
|
40
42
|
|
|
41
43
|
**Supported Grades:** 3-12
|
|
42
44
|
|
|
@@ -45,12 +47,13 @@ Evaluates vocabulary complexity using the Qual Text Complexity rubric (SAP).
|
|
|
45
47
|
**Constructor:**
|
|
46
48
|
```typescript
|
|
47
49
|
const evaluator = new VocabularyEvaluator({
|
|
48
|
-
googleApiKey
|
|
49
|
-
openaiApiKey
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
50
|
+
googleApiKey: string; // Google API key
|
|
51
|
+
openaiApiKey: string; // OpenAI API key
|
|
52
|
+
modelOverride?: ModelOverride; // Override the default provider and model
|
|
53
|
+
maxRetries?: number; // Max retry attempts (default: 2)
|
|
54
|
+
telemetry?: boolean | TelemetryOptions; // Telemetry settings (default: enabled)
|
|
55
|
+
logger?: Logger; // Custom logger
|
|
56
|
+
logLevel?: LogLevel; // Log verbosity (default: WARN)
|
|
54
57
|
});
|
|
55
58
|
```
|
|
56
59
|
|
|
@@ -62,13 +65,15 @@ await evaluator.evaluate(text: string, grade: string)
|
|
|
62
65
|
**Returns:**
|
|
63
66
|
```typescript
|
|
64
67
|
{
|
|
65
|
-
score: '
|
|
68
|
+
score: 'Slightly complex' | 'Moderately complex' | 'Very complex' | 'Exceedingly complex';
|
|
66
69
|
reasoning: string;
|
|
67
70
|
metadata: {
|
|
68
71
|
model: string;
|
|
69
72
|
processingTimeMs: number;
|
|
73
|
+
inputTokens: number;
|
|
74
|
+
outputTokens: number;
|
|
70
75
|
};
|
|
71
|
-
_internal:
|
|
76
|
+
_internal: VocabularyInternal; // Detailed analysis
|
|
72
77
|
}
|
|
73
78
|
```
|
|
74
79
|
|
|
@@ -85,11 +90,12 @@ Evaluates sentence structure complexity based on grammatical features.
|
|
|
85
90
|
**Constructor:**
|
|
86
91
|
```typescript
|
|
87
92
|
const evaluator = new SentenceStructureEvaluator({
|
|
88
|
-
openaiApiKey
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
+
openaiApiKey: string; // OpenAI API key
|
|
94
|
+
modelOverride?: ModelOverride; // Override the default provider and model
|
|
95
|
+
maxRetries?: number; // Max retry attempts (default: 2)
|
|
96
|
+
telemetry?: boolean | TelemetryOptions; // Telemetry settings (default: enabled)
|
|
97
|
+
logger?: Logger; // Custom logger
|
|
98
|
+
logLevel?: LogLevel; // Log verbosity (default: WARN)
|
|
93
99
|
});
|
|
94
100
|
```
|
|
95
101
|
|
|
@@ -101,11 +107,13 @@ await evaluator.evaluate(text: string, grade: string)
|
|
|
101
107
|
**Returns:**
|
|
102
108
|
```typescript
|
|
103
109
|
{
|
|
104
|
-
score: 'Slightly
|
|
110
|
+
score: 'Slightly complex' | 'Moderately complex' | 'Very complex' | 'Exceedingly complex';
|
|
105
111
|
reasoning: string;
|
|
106
112
|
metadata: {
|
|
107
113
|
model: string;
|
|
108
114
|
processingTimeMs: number;
|
|
115
|
+
inputTokens: number;
|
|
116
|
+
outputTokens: number;
|
|
109
117
|
};
|
|
110
118
|
_internal: {
|
|
111
119
|
sentenceAnalysis: SentenceAnalysis;
|
|
@@ -128,11 +136,12 @@ Evaluates the background knowledge demands of educational texts relative to grad
|
|
|
128
136
|
**Constructor:**
|
|
129
137
|
```typescript
|
|
130
138
|
const evaluator = new SmkEvaluator({
|
|
131
|
-
googleApiKey
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
139
|
+
googleApiKey: string; // Google API key
|
|
140
|
+
modelOverride?: ModelOverride; // Override the default provider and model
|
|
141
|
+
maxRetries?: number; // Max retry attempts (default: 2)
|
|
142
|
+
telemetry?: boolean | TelemetryOptions; // Telemetry settings (default: enabled)
|
|
143
|
+
logger?: Logger; // Custom logger
|
|
144
|
+
logLevel?: LogLevel; // Log verbosity (default: WARN)
|
|
136
145
|
});
|
|
137
146
|
```
|
|
138
147
|
|
|
@@ -149,6 +158,8 @@ await evaluator.evaluate(text: string, grade: string)
|
|
|
149
158
|
metadata: {
|
|
150
159
|
model: string;
|
|
151
160
|
processingTimeMs: number;
|
|
161
|
+
inputTokens: number;
|
|
162
|
+
outputTokens: number;
|
|
152
163
|
};
|
|
153
164
|
_internal: {
|
|
154
165
|
identified_topics: string[];
|
|
@@ -191,11 +202,12 @@ Evaluates how explicit, literal, and straightforward a text's meaning is versus
|
|
|
191
202
|
**Constructor:**
|
|
192
203
|
```typescript
|
|
193
204
|
const evaluator = new ConventionalityEvaluator({
|
|
194
|
-
googleApiKey
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
205
|
+
googleApiKey: string; // Google API key
|
|
206
|
+
modelOverride?: ModelOverride; // Override the default provider and model
|
|
207
|
+
maxRetries?: number; // Max retry attempts (default: 2)
|
|
208
|
+
telemetry?: boolean | TelemetryOptions; // Telemetry settings (default: enabled)
|
|
209
|
+
logger?: Logger; // Custom logger
|
|
210
|
+
logLevel?: LogLevel; // Log verbosity (default: WARN)
|
|
199
211
|
});
|
|
200
212
|
```
|
|
201
213
|
|
|
@@ -212,6 +224,8 @@ await evaluator.evaluate(text: string, grade: string)
|
|
|
212
224
|
metadata: {
|
|
213
225
|
model: string;
|
|
214
226
|
processingTimeMs: number;
|
|
227
|
+
inputTokens: number;
|
|
228
|
+
outputTokens: number;
|
|
215
229
|
};
|
|
216
230
|
_internal: {
|
|
217
231
|
conventionality_features: string[];
|
|
@@ -248,17 +262,18 @@ Composite evaluator that analyzes vocabulary, sentence structure, subject matter
|
|
|
248
262
|
|
|
249
263
|
**Supported Grades:** 3-12
|
|
250
264
|
|
|
251
|
-
**Uses:** Google Gemini 2.5 Pro + Google Gemini 3 Flash Preview + OpenAI GPT-4o (composite)
|
|
265
|
+
**Uses:** Google Gemini 2.5 Pro + Google Gemini 3 Flash Preview + OpenAI GPT-4o + OpenAI GPT-4.1 (composite)
|
|
252
266
|
|
|
253
267
|
**Constructor:**
|
|
254
268
|
```typescript
|
|
255
269
|
const evaluator = new TextComplexityEvaluator({
|
|
256
|
-
googleApiKey
|
|
257
|
-
openaiApiKey
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
270
|
+
googleApiKey: string; // Google API key
|
|
271
|
+
openaiApiKey: string; // OpenAI API key
|
|
272
|
+
modelOverride?: ModelOverride; // Override the default provider and model for all sub-evaluators
|
|
273
|
+
maxRetries?: number; // Max retry attempts (default: 2)
|
|
274
|
+
telemetry?: boolean | TelemetryOptions; // Telemetry settings (default: enabled)
|
|
275
|
+
logger?: Logger; // Custom logger
|
|
276
|
+
logLevel?: LogLevel; // Log verbosity (default: WARN)
|
|
262
277
|
});
|
|
263
278
|
```
|
|
264
279
|
|
|
@@ -317,11 +332,12 @@ Determines appropriate grade level for text.
|
|
|
317
332
|
**Constructor:**
|
|
318
333
|
```typescript
|
|
319
334
|
const evaluator = new GradeLevelAppropriatenessEvaluator({
|
|
320
|
-
googleApiKey
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
335
|
+
googleApiKey: string; // Google API key
|
|
336
|
+
modelOverride?: ModelOverride; // Override the default provider and model
|
|
337
|
+
maxRetries?: number; // Max retry attempts (default: 2)
|
|
338
|
+
telemetry?: boolean | TelemetryOptions; // Telemetry settings (default: enabled)
|
|
339
|
+
logger?: Logger; // Custom logger
|
|
340
|
+
logLevel?: LogLevel; // Log verbosity (default: WARN)
|
|
325
341
|
});
|
|
326
342
|
```
|
|
327
343
|
|
|
@@ -338,6 +354,8 @@ await evaluator.evaluate(text: string)
|
|
|
338
354
|
metadata: {
|
|
339
355
|
model: string;
|
|
340
356
|
processingTimeMs: number;
|
|
357
|
+
inputTokens: number;
|
|
358
|
+
outputTokens: number;
|
|
341
359
|
};
|
|
342
360
|
_internal: {
|
|
343
361
|
grade: string;
|
|
@@ -350,6 +368,85 @@ await evaluator.evaluate(text: string)
|
|
|
350
368
|
|
|
351
369
|
---
|
|
352
370
|
|
|
371
|
+
### 7. Purpose Evaluator
|
|
372
|
+
|
|
373
|
+
Evaluates the Purpose dimension of qualitative text complexity — how explicitly the text's purpose is stated versus implied, and how that affects comprehension demands for the target grade level.
|
|
374
|
+
|
|
375
|
+
**Supported Grades:** 3-12
|
|
376
|
+
|
|
377
|
+
**Uses:** Google Gemini 3 Flash Preview
|
|
378
|
+
|
|
379
|
+
**Constructor:**
|
|
380
|
+
```typescript
|
|
381
|
+
const evaluator = new PurposeEvaluator({
|
|
382
|
+
googleApiKey: string; // Google API key (required by this evaluator)
|
|
383
|
+
modelOverride?: ModelOverride; // Override the default provider and model
|
|
384
|
+
maxRetries?: number; // Max retry attempts (default: 2)
|
|
385
|
+
telemetry?: boolean | TelemetryOptions; // Telemetry settings (default: enabled)
|
|
386
|
+
logger?: Logger; // Custom logger
|
|
387
|
+
logLevel?: LogLevel; // Log verbosity (default: WARN)
|
|
388
|
+
});
|
|
389
|
+
```
|
|
390
|
+
|
|
391
|
+
**API:**
|
|
392
|
+
```typescript
|
|
393
|
+
await evaluator.evaluate(text: string, grade: string)
|
|
394
|
+
```
|
|
395
|
+
|
|
396
|
+
**Returns:**
|
|
397
|
+
```typescript
|
|
398
|
+
{
|
|
399
|
+
score: 'Slightly complex' | 'Moderately complex' | 'Very complex' | 'Exceedingly complex' | 'More context needed';
|
|
400
|
+
reasoning: string;
|
|
401
|
+
metadata: {
|
|
402
|
+
model: string;
|
|
403
|
+
processingTimeMs: number;
|
|
404
|
+
inputTokens: number;
|
|
405
|
+
outputTokens: number;
|
|
406
|
+
};
|
|
407
|
+
_internal: {
|
|
408
|
+
complexity_score: 'slightly_complex' | 'moderately_complex' | 'very_complex' | 'exceedingly_complex' | 'more_context_needed';
|
|
409
|
+
reasoning: string;
|
|
410
|
+
details: {
|
|
411
|
+
detailed_summary: Array<{
|
|
412
|
+
factor: string;
|
|
413
|
+
description: string;
|
|
414
|
+
effect_on_complexity_dimension: string;
|
|
415
|
+
}>;
|
|
416
|
+
adjustment_and_scaffolding: Array<{
|
|
417
|
+
scaffolding_need: string;
|
|
418
|
+
suggestion: string;
|
|
419
|
+
}>;
|
|
420
|
+
recommended_use_cases: Array<{
|
|
421
|
+
opportunity: string;
|
|
422
|
+
suggestion: string;
|
|
423
|
+
}>;
|
|
424
|
+
};
|
|
425
|
+
};
|
|
426
|
+
}
|
|
427
|
+
```
|
|
428
|
+
|
|
429
|
+
> **Note:** The `'More context needed'` score is used for cases where the text alone is insufficient to determine complexity.
|
|
430
|
+
|
|
431
|
+
**Example:**
|
|
432
|
+
```typescript
|
|
433
|
+
import { PurposeEvaluator } from '@learning-commons/evaluators';
|
|
434
|
+
|
|
435
|
+
const evaluator = new PurposeEvaluator({
|
|
436
|
+
googleApiKey: process.env.GOOGLE_API_KEY,
|
|
437
|
+
});
|
|
438
|
+
|
|
439
|
+
const result = await evaluator.evaluate(
|
|
440
|
+
"The author argues that renewable energy is the only viable solution to climate change.",
|
|
441
|
+
"9"
|
|
442
|
+
);
|
|
443
|
+
console.log(result.score); // "Moderately complex"
|
|
444
|
+
console.log(result.reasoning);
|
|
445
|
+
console.log(result._internal.details.adjustment_and_scaffolding);
|
|
446
|
+
```
|
|
447
|
+
|
|
448
|
+
---
|
|
449
|
+
|
|
353
450
|
## Batch CSV Evaluation
|
|
354
451
|
|
|
355
452
|
For evaluating many texts at once, the SDK ships a CLI tool that reads a CSV file, runs all evaluators in a group, and produces CSV and HTML reports.
|
|
@@ -399,6 +496,9 @@ try {
|
|
|
399
496
|
} else if (error instanceof NetworkError) {
|
|
400
497
|
// Network connectivity issues
|
|
401
498
|
console.error('Network error:', error.message);
|
|
499
|
+
} else if (error instanceof TimeoutError) {
|
|
500
|
+
// Request timed out
|
|
501
|
+
console.error('Timeout:', error.message);
|
|
402
502
|
} else if (error instanceof APIError) {
|
|
403
503
|
// Other API errors
|
|
404
504
|
console.error('API error:', error.message, 'Status:', error.statusCode);
|
|
@@ -443,6 +543,44 @@ const evaluator = new VocabularyEvaluator({
|
|
|
443
543
|
|
|
444
544
|
---
|
|
445
545
|
|
|
546
|
+
## Model Override
|
|
547
|
+
|
|
548
|
+
By default each evaluator uses a recommended provider and model tuned for that task. You can override this with any supported provider — OpenAI, Google, or Anthropic — using the `modelOverride` option.
|
|
549
|
+
|
|
550
|
+
When `modelOverride` is set:
|
|
551
|
+
- All LLM calls within the evaluator use the specified provider and model
|
|
552
|
+
- Only the API key for the override provider is required (e.g. `anthropicApiKey` when using `Provider.Anthropic`); default provider keys are not validated
|
|
553
|
+
- A warning is logged to indicate results may differ from the defaults
|
|
554
|
+
- Telemetry records `model_override: true` so override usage is tracked separately
|
|
555
|
+
|
|
556
|
+
**Validation:** The SDK validates `modelOverride` at construction time and throws `ConfigurationError` if:
|
|
557
|
+
- `provider` is not one of the supported `Provider` values (`openai`, `google`, `anthropic`)
|
|
558
|
+
- `model` is empty or blank — no default is assumed; you must always specify the model ID explicitly
|
|
559
|
+
- The API key for the chosen provider is missing
|
|
560
|
+
|
|
561
|
+
If the model ID is valid at construction but doesn't exist on the provider's API, `ConfigurationError` is thrown when `evaluate()` is called.
|
|
562
|
+
|
|
563
|
+
```typescript
|
|
564
|
+
import { VocabularyEvaluator, Provider } from '@learning-commons/evaluators';
|
|
565
|
+
|
|
566
|
+
const evaluator = new VocabularyEvaluator({
|
|
567
|
+
anthropicApiKey: process.env.ANTHROPIC_API_KEY,
|
|
568
|
+
modelOverride: {
|
|
569
|
+
provider: Provider.Anthropic,
|
|
570
|
+
model: 'claude-sonnet-4-6',
|
|
571
|
+
},
|
|
572
|
+
});
|
|
573
|
+
|
|
574
|
+
const result = await evaluator.evaluate("Your text here", "5");
|
|
575
|
+
console.log(result.metadata.model); // "anthropic:claude-sonnet-4-6"
|
|
576
|
+
```
|
|
577
|
+
|
|
578
|
+
See the [Installation](#installation) section for provider adapter setup if you haven't already.
|
|
579
|
+
|
|
580
|
+
> **Note:** Evaluators are validated and quality-tested against their default models. Results with other models may vary. Check `result.metadata.model` to confirm which model was used.
|
|
581
|
+
|
|
582
|
+
---
|
|
583
|
+
|
|
446
584
|
## Telemetry & Privacy
|
|
447
585
|
|
|
448
586
|
See [docs/telemetry.md](./docs/telemetry.md) for telemetry configuration and privacy information.
|
|
@@ -455,13 +593,15 @@ All evaluators use the same `BaseEvaluatorConfig` interface:
|
|
|
455
593
|
|
|
456
594
|
```typescript
|
|
457
595
|
interface BaseEvaluatorConfig {
|
|
458
|
-
googleApiKey?: string;
|
|
459
|
-
openaiApiKey?: string;
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
596
|
+
googleApiKey?: string; // Google API key (required by some evaluators)
|
|
597
|
+
openaiApiKey?: string; // OpenAI API key (required by some evaluators)
|
|
598
|
+
anthropicApiKey?: string; // Anthropic API key (required if an evaluator defaults to Claude or when `modelOverride` uses `Provider.Anthropic`)
|
|
599
|
+
modelOverride?: ModelOverride; // Override the provider and model (see Model Override section)
|
|
600
|
+
maxRetries?: number; // Max retry attempts (default: 2)
|
|
601
|
+
telemetry?: boolean | TelemetryOptions; // Telemetry settings (default: enabled)
|
|
602
|
+
logger?: Logger; // Custom logger
|
|
603
|
+
logLevel?: LogLevel; // Log verbosity (default: WARN)
|
|
604
|
+
partnerKey?: string; // Learning Commons partner key for authenticated telemetry
|
|
465
605
|
}
|
|
466
606
|
```
|
|
467
607
|
|
|
@@ -472,6 +612,9 @@ interface BaseEvaluatorConfig {
|
|
|
472
612
|
- **Conventionality**: Requires `googleApiKey` only
|
|
473
613
|
- **Text Complexity**: Requires both `googleApiKey` and `openaiApiKey`
|
|
474
614
|
- **Grade Level Appropriateness**: Requires `googleApiKey` only
|
|
615
|
+
- **Purpose**: Requires `googleApiKey` only
|
|
616
|
+
|
|
617
|
+
When `modelOverride` is set, the default key requirements are bypassed — only the key for the override provider is required (e.g. `anthropicApiKey` when using `Provider.Anthropic`).
|
|
475
618
|
|
|
476
619
|
---
|
|
477
620
|
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import { z } from 'zod';
|
|
2
|
+
|
|
1
3
|
/**
|
|
2
4
|
* Logging interface for the Evaluators SDK
|
|
3
5
|
*
|
|
@@ -77,6 +79,82 @@ interface Logger {
|
|
|
77
79
|
error(message: string, context?: LogContext): void;
|
|
78
80
|
}
|
|
79
81
|
|
|
82
|
+
/**
|
|
83
|
+
* Message format for LLM conversations
|
|
84
|
+
*/
|
|
85
|
+
interface Message {
|
|
86
|
+
role: 'system' | 'user' | 'assistant';
|
|
87
|
+
content: string;
|
|
88
|
+
}
|
|
89
|
+
/**
|
|
90
|
+
* Request configuration for structured LLM generation
|
|
91
|
+
*/
|
|
92
|
+
interface LLMRequest<T> {
|
|
93
|
+
messages: Message[];
|
|
94
|
+
schema: z.ZodSchema<T>;
|
|
95
|
+
temperature?: number;
|
|
96
|
+
maxTokens?: number;
|
|
97
|
+
}
|
|
98
|
+
/**
|
|
99
|
+
* Response from LLM with usage metadata
|
|
100
|
+
*/
|
|
101
|
+
interface LLMResponse<T> {
|
|
102
|
+
data: T;
|
|
103
|
+
model: string;
|
|
104
|
+
usage: {
|
|
105
|
+
inputTokens: number;
|
|
106
|
+
outputTokens: number;
|
|
107
|
+
};
|
|
108
|
+
latencyMs: number;
|
|
109
|
+
}
|
|
110
|
+
/**
|
|
111
|
+
* Response from plain text generation
|
|
112
|
+
*/
|
|
113
|
+
interface TextGenerationResponse {
|
|
114
|
+
text: string;
|
|
115
|
+
usage: {
|
|
116
|
+
inputTokens: number;
|
|
117
|
+
outputTokens: number;
|
|
118
|
+
};
|
|
119
|
+
latencyMs: number;
|
|
120
|
+
}
|
|
121
|
+
/**
|
|
122
|
+
* Base interface for LLM provider implementations
|
|
123
|
+
*/
|
|
124
|
+
interface LLMProvider {
|
|
125
|
+
/** Canonical label for the provider and model in use (e.g. "openai:gpt-4o") */
|
|
126
|
+
readonly label: string;
|
|
127
|
+
/**
|
|
128
|
+
* Generate structured output from LLM using Zod schema
|
|
129
|
+
*/
|
|
130
|
+
generateStructured<T>(request: LLMRequest<T>): Promise<LLMResponse<T>>;
|
|
131
|
+
/**
|
|
132
|
+
* Generate plain text from LLM
|
|
133
|
+
*/
|
|
134
|
+
generateText(messages: Message[], temperature?: number): Promise<TextGenerationResponse>;
|
|
135
|
+
}
|
|
136
|
+
/**
|
|
137
|
+
* Named constants for LLM provider types — use instead of raw string literals.
|
|
138
|
+
*/
|
|
139
|
+
declare const Providers: {
|
|
140
|
+
readonly google: "google";
|
|
141
|
+
readonly openai: "openai";
|
|
142
|
+
readonly anthropic: "anthropic";
|
|
143
|
+
readonly custom: "custom";
|
|
144
|
+
};
|
|
145
|
+
/**
|
|
146
|
+
* Configuration for LLM provider
|
|
147
|
+
*/
|
|
148
|
+
interface ProviderConfig {
|
|
149
|
+
type: 'openai' | 'anthropic' | 'google' | 'custom';
|
|
150
|
+
apiKey?: string;
|
|
151
|
+
model?: string;
|
|
152
|
+
temperature?: number;
|
|
153
|
+
baseURL?: string;
|
|
154
|
+
customProvider?: LLMProvider;
|
|
155
|
+
maxRetries?: number;
|
|
156
|
+
}
|
|
157
|
+
|
|
80
158
|
/**
|
|
81
159
|
* Evaluation status
|
|
82
160
|
*/
|
|
@@ -130,6 +208,7 @@ interface TelemetryEvent {
|
|
|
130
208
|
provider: string;
|
|
131
209
|
token_usage?: TokenUsage;
|
|
132
210
|
metadata?: TelemetryMetadata;
|
|
211
|
+
model_override?: boolean;
|
|
133
212
|
input_text?: string;
|
|
134
213
|
}
|
|
135
214
|
/**
|
|
@@ -166,6 +245,14 @@ declare class TelemetryClient {
|
|
|
166
245
|
send(event: TelemetryEvent): Promise<void>;
|
|
167
246
|
}
|
|
168
247
|
|
|
248
|
+
/**
|
|
249
|
+
* Supported LLM providers
|
|
250
|
+
*/
|
|
251
|
+
declare enum Provider {
|
|
252
|
+
OpenAI = "openai",
|
|
253
|
+
Google = "google",
|
|
254
|
+
Anthropic = "anthropic"
|
|
255
|
+
}
|
|
169
256
|
/**
|
|
170
257
|
* Granular telemetry configuration options
|
|
171
258
|
*/
|
|
@@ -175,6 +262,24 @@ interface TelemetryOptions {
|
|
|
175
262
|
/** Record input text in telemetry (default: false) */
|
|
176
263
|
recordInputs?: boolean;
|
|
177
264
|
}
|
|
265
|
+
/**
|
|
266
|
+
* Override the provider and model used by an evaluator.
|
|
267
|
+
*
|
|
268
|
+
* When set, all LLM calls use this provider and model instead of the defaults.
|
|
269
|
+
* The evaluator's normal key requirements are bypassed — provide the key for
|
|
270
|
+
* the chosen provider via the matching top-level config field
|
|
271
|
+
* (e.g. `anthropicApiKey` for `Provider.Anthropic`).
|
|
272
|
+
*
|
|
273
|
+
* Both `provider` and `model` are required. An empty or missing `model` throws
|
|
274
|
+
* `ConfigurationError` at construction time. An unrecognised model ID throws
|
|
275
|
+
* `ConfigurationError` at evaluation time when the provider rejects it.
|
|
276
|
+
*
|
|
277
|
+
* Results may vary; evaluators are validated against their recommended models.
|
|
278
|
+
*/
|
|
279
|
+
interface ModelOverride {
|
|
280
|
+
provider: Provider;
|
|
281
|
+
model: string;
|
|
282
|
+
}
|
|
178
283
|
/**
|
|
179
284
|
* Base configuration for all evaluators
|
|
180
285
|
*/
|
|
@@ -183,8 +288,16 @@ interface BaseEvaluatorConfig {
|
|
|
183
288
|
googleApiKey?: string;
|
|
184
289
|
/** OpenAI API key (for evaluators using GPT) */
|
|
185
290
|
openaiApiKey?: string;
|
|
291
|
+
/** Anthropic API key (for evaluators using Claude) */
|
|
292
|
+
anthropicApiKey?: string;
|
|
186
293
|
/** Learning Commons partner key for authenticated telemetry (optional) */
|
|
187
294
|
partnerKey?: string;
|
|
295
|
+
/**
|
|
296
|
+
* Override the provider and model used by this evaluator.
|
|
297
|
+
* When set, all LLM calls use this provider and model instead of the defaults.
|
|
298
|
+
* See {@link ModelOverride} for details.
|
|
299
|
+
*/
|
|
300
|
+
modelOverride?: ModelOverride;
|
|
188
301
|
/**
|
|
189
302
|
* Maximum number of retries for failed API calls (default: 2)
|
|
190
303
|
* Set to 0 to disable retries.
|
|
@@ -232,10 +345,8 @@ interface EvaluatorMetadata {
|
|
|
232
345
|
readonly description: string;
|
|
233
346
|
/** Supported grade levels (e.g., ['3', '4', '5', ...]) */
|
|
234
347
|
readonly supportedGrades: readonly string[];
|
|
235
|
-
/**
|
|
236
|
-
readonly
|
|
237
|
-
/** Whether this evaluator requires an OpenAI API key */
|
|
238
|
-
readonly requiresOpenAIKey: boolean;
|
|
348
|
+
/** Providers required by this evaluator's default configuration */
|
|
349
|
+
readonly defaultProviders: readonly Provider[];
|
|
239
350
|
}
|
|
240
351
|
/**
|
|
241
352
|
* Abstract base class for all evaluators
|
|
@@ -254,6 +365,10 @@ declare abstract class BaseEvaluator {
|
|
|
254
365
|
protected logger: Logger;
|
|
255
366
|
protected config: Required<Pick<BaseEvaluatorConfig, 'maxRetries'>> & {
|
|
256
367
|
telemetry: Required<TelemetryOptions>;
|
|
368
|
+
modelOverride?: ModelOverride;
|
|
369
|
+
googleApiKey?: string;
|
|
370
|
+
openaiApiKey?: string;
|
|
371
|
+
anthropicApiKey?: string;
|
|
257
372
|
};
|
|
258
373
|
/**
|
|
259
374
|
* Static metadata for the evaluator
|
|
@@ -268,13 +383,17 @@ declare abstract class BaseEvaluator {
|
|
|
268
383
|
* name: 'My Evaluator',
|
|
269
384
|
* description: 'Does something useful',
|
|
270
385
|
* supportedGrades: ['3', '4', '5'],
|
|
271
|
-
*
|
|
272
|
-
* requiresOpenAIKey: false,
|
|
386
|
+
* defaultProviders: [Provider.Google],
|
|
273
387
|
* };
|
|
274
388
|
* }
|
|
275
389
|
* ```
|
|
276
390
|
*/
|
|
277
391
|
static readonly metadata: EvaluatorMetadata;
|
|
392
|
+
/**
|
|
393
|
+
* @throws {ConfigurationError} If the subclass has not defined static metadata
|
|
394
|
+
* @throws {ConfigurationError} If modelOverride has an invalid provider or empty model
|
|
395
|
+
* @throws {ConfigurationError} If a required API key is missing
|
|
396
|
+
*/
|
|
278
397
|
constructor(config: BaseEvaluatorConfig);
|
|
279
398
|
/**
|
|
280
399
|
* Get metadata for this evaluator instance
|
|
@@ -282,8 +401,16 @@ declare abstract class BaseEvaluator {
|
|
|
282
401
|
*/
|
|
283
402
|
protected get metadata(): EvaluatorMetadata;
|
|
284
403
|
/**
|
|
285
|
-
* Validate
|
|
286
|
-
*
|
|
404
|
+
* Validate modelOverride shape: provider must be a known Provider value and
|
|
405
|
+
* model must be a non-empty string.
|
|
406
|
+
* @throws {ConfigurationError} If the override is malformed
|
|
407
|
+
*/
|
|
408
|
+
private validateModelOverride;
|
|
409
|
+
/**
|
|
410
|
+
* Validate that the required API key is present.
|
|
411
|
+
* When modelOverride is set, checks the override provider's key.
|
|
412
|
+
* Otherwise checks the keys required by the evaluator's default providers.
|
|
413
|
+
* @throws {ConfigurationError} If a required key is missing
|
|
287
414
|
*/
|
|
288
415
|
private validateApiKeys;
|
|
289
416
|
/**
|
|
@@ -311,6 +438,12 @@ declare abstract class BaseEvaluator {
|
|
|
311
438
|
* @throws {ValidationError} If grade is invalid
|
|
312
439
|
*/
|
|
313
440
|
protected validateGrade(grade: string, validGrades: Set<string>): void;
|
|
441
|
+
/**
|
|
442
|
+
* Create an LLM provider, honouring modelOverride if set.
|
|
443
|
+
* When override is active, the key for the override provider is resolved
|
|
444
|
+
* from the matching top-level config field (e.g. anthropicApiKey for Anthropic).
|
|
445
|
+
*/
|
|
446
|
+
protected createConfiguredProvider(defaultType: Provider, defaultModel: string, defaultApiKey: string | undefined): LLMProvider;
|
|
314
447
|
/**
|
|
315
448
|
* Send telemetry event to analytics service
|
|
316
449
|
* Common helper for all evaluators
|
|
@@ -328,4 +461,4 @@ declare abstract class BaseEvaluator {
|
|
|
328
461
|
}): Promise<void>;
|
|
329
462
|
}
|
|
330
463
|
|
|
331
|
-
export { BaseEvaluator as B, type EvaluatorMetadata as E, type
|
|
464
|
+
export { BaseEvaluator as B, type EvaluatorMetadata as E, type LLMProvider as L, type Message as M, Provider as P, type TelemetryOptions as T, type BaseEvaluatorConfig as a, type LLMRequest as b, type LLMResponse as c, type LogContext as d, LogLevel as e, type Logger as f, type ModelOverride as g, type ProviderConfig as h, Providers as i, type TextGenerationResponse as j };
|