sensei-eval 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +201 -0
- package/dist/criteria/challenge.d.ts +7 -0
- package/dist/criteria/challenge.d.ts.map +1 -0
- package/dist/criteria/challenge.js +142 -0
- package/dist/criteria/challenge.js.map +1 -0
- package/dist/criteria/index.d.ts +5 -0
- package/dist/criteria/index.d.ts.map +1 -0
- package/dist/criteria/index.js +5 -0
- package/dist/criteria/index.js.map +1 -0
- package/dist/criteria/lesson.d.ts +7 -0
- package/dist/criteria/lesson.d.ts.map +1 -0
- package/dist/criteria/lesson.js +143 -0
- package/dist/criteria/lesson.js.map +1 -0
- package/dist/criteria/review.d.ts +6 -0
- package/dist/criteria/review.d.ts.map +1 -0
- package/dist/criteria/review.js +93 -0
- package/dist/criteria/review.js.map +1 -0
- package/dist/criteria/universal.d.ts +9 -0
- package/dist/criteria/universal.d.ts.map +1 -0
- package/dist/criteria/universal.js +204 -0
- package/dist/criteria/universal.js.map +1 -0
- package/dist/index.d.ts +5 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +4 -0
- package/dist/index.js.map +1 -0
- package/dist/judge.d.ts +7 -0
- package/dist/judge.d.ts.map +1 -0
- package/dist/judge.js +55 -0
- package/dist/judge.js.map +1 -0
- package/dist/runner.d.ts +17 -0
- package/dist/runner.d.ts.map +1 -0
- package/dist/runner.js +59 -0
- package/dist/runner.js.map +1 -0
- package/dist/types.d.ts +56 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +2 -0
- package/dist/types.js.map +1 -0
- package/dist/utils/markdown.d.ts +17 -0
- package/dist/utils/markdown.d.ts.map +1 -0
- package/dist/utils/markdown.js +69 -0
- package/dist/utils/markdown.js.map +1 -0
- package/package.json +56 -0
package/README.md
ADDED
@@ -0,0 +1,201 @@
# edu-eval

TypeScript library for evaluating AI-generated educational content using deterministic checks and LLM-as-judge scoring.

## Install

```bash
npm install edu-eval
```

## Quick Start

```typescript
import { EvalRunner, createJudge, criteria } from 'edu-eval';

// Full evaluation (deterministic + LLM judge)
const judge = createJudge({ apiKey: process.env.ANTHROPIC_API_KEY! });
const runner = new EvalRunner({
  criteria: [
    ...criteria.universal,
    ...criteria.lesson,
    ...criteria.challenge,
    ...criteria.review,
  ],
  judge,
});

const result = await runner.evaluate({
  content: lessonMarkdown,
  contentType: 'lesson',
  topic: 'Array methods in JavaScript',
  difficulty: 'beginner',
});

console.log(result.passed); // true if all criteria met their thresholds
console.log(result.overallScore); // weighted average (0-1)
console.log(result.scores); // per-criterion breakdown

// Quick check — deterministic only, no API calls
const quick = await runner.quickCheck({
  content: lessonMarkdown,
  contentType: 'lesson',
});
```
## How It Works

Content is evaluated through a two-tier system:

1. **Deterministic criteria** — fast, pure functions that check markdown formatting, length, and structure
2. **LLM-judge criteria** — Claude API calls that score content against rubrics on a 1-5 scale, normalized to 0-1

`EvalRunner.evaluate()` filters criteria by `contentType`, runs deterministic checks in parallel, then runs LLM checks in parallel, computes a weighted overall score, and returns an `EvalResult`.

`EvalRunner.quickCheck()` does the same but skips all LLM criteria (no API calls).
### Scoring

- **Overall score**: weighted average of all criterion scores (0-1)
- **Pass/fail**: every individual criterion must meet its threshold for `passed` to be `true`
- **LLM normalization**: raw 1-5 scores are mapped to 0-1 via `(score - 1) / 4` (see the sketch below)
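A minimal sketch of that math (hypothetical standalone helpers for illustration; in the package this logic lives inside `EvalRunner` and the criteria):

```typescript
// Sketch only: illustrates the documented scoring math, not the library's API.

// LLM normalization: a raw 1-5 judge score maps onto 0-1.
function normalizeJudgeScore(raw: number): number {
  return (raw - 1) / 4; // 1 -> 0.0, 3 -> 0.5, 5 -> 1.0
}

// Overall score: weighted average of normalized per-criterion scores.
function overallScore(scores: { score: number; weight: number }[]): number {
  const totalWeight = scores.reduce((sum, s) => sum + s.weight, 0);
  const weightedSum = scores.reduce((sum, s) => sum + s.score * s.weight, 0);
  return totalWeight > 0 ? weightedSum / totalWeight : 0;
}

// Example: a weight-1.5 criterion at 0.75 and a weight-0.5 criterion at 1.0
// average to (1.125 + 0.5) / 2.0 = 0.8125.
console.log(overallScore([
  { score: 0.75, weight: 1.5 },
  { score: 1.0, weight: 0.5 },
]));
```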
### Weights

| Weight | Criteria |
|--------|----------|
| 1.5 | `topic_accuracy`, `pedagogical_structure`, `problem_clarity` |
| 1.0 | Most criteria (engagement, repetition_avoidance, code_quality, etc.) |
| 0.5 | `has_code_block`, `has_structure`, `brevity` |
## Criteria

### Universal (all content types)

| Criterion | Method | Description |
|-----------|--------|-------------|
| `format_compliance` | deterministic | No unclosed code blocks, bold, or italic markers |
| `length_compliance` | deterministic | Content within min/max character limits per type |
| `has_code_block` | deterministic | At least one fenced code block (lesson, challenge only) |
| `has_structure` | deterministic | Headings and multiple sections (lesson, challenge only) |
| `engagement` | LLM judge | Hook, pacing, and closure quality |
| `repetition_avoidance` | LLM judge | Avoids repeating previous content |
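The deterministic checks are plain string inspection. For illustration, checks in the spirit of `has_code_block` and `format_compliance` could look like this (a sketch; the shipped implementations live in `src/criteria/universal.ts` and `src/utils/markdown.ts` and may differ):

```typescript
// Sketch of the kind of pure-function checks the table above describes.

// has_code_block: at least one fenced code block (opening and closing fence).
function hasCodeBlock(content: string): boolean {
  const fences = content.match(/^```/gm) ?? [];
  return fences.length >= 2;
}

// format_compliance (partial): an odd number of fence markers means an
// unclosed code block, the kind of issue this criterion flags.
function hasUnclosedCodeBlock(content: string): boolean {
  const fences = content.match(/^```/gm) ?? [];
  return fences.length % 2 !== 0;
}
```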
### Lesson

| Criterion | Method | Description |
|-----------|--------|-------------|
| `topic_accuracy` | LLM judge | Covers stated topic correctly, no factual errors |
| `pedagogical_structure` | LLM judge | Progression from intuition to examples to practice |
| `code_quality` | LLM judge | Relevant, correct, well-explained code examples |
| `progressive_difficulty` | LLM judge | Appropriate scaffolding for stated difficulty level |

### Challenge

| Criterion | Method | Description |
|-----------|--------|-------------|
| `problem_clarity` | LLM judge | Unambiguous requirements with clear inputs/outputs |
| `difficulty_calibration` | LLM judge | Matches stated difficulty level |
| `hint_quality` | LLM judge | Progressive hints without spoiling the solution |
| `testability` | LLM judge | Verifiable with clear expected outputs or test cases |

### Review

| Criterion | Method | Description |
|-----------|--------|-------------|
| `brevity` | deterministic | Under character limit (default 2000, configurable via metadata) |
| `actionability` | LLM judge | Concrete, specific next steps |
| `honesty` | LLM judge | Honest gap assessment without sugarcoating |
## API

### `EvalRunner`

```typescript
const runner = new EvalRunner({ criteria: EvalCriterion[], judge?: Judge });

runner.evaluate(input: EvalInput): Promise<EvalResult>   // full eval
runner.quickCheck(input: EvalInput): Promise<EvalResult> // deterministic only
runner.getCriteria(contentType: string): EvalCriterion[] // filter by type
```
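For example, `getCriteria` makes it easy to see what would run for a given content type before spending any tokens (hypothetical usage, reusing the `runner` from Quick Start):

```typescript
// Inspect the criteria that apply to challenges.
for (const c of runner.getCriteria('challenge')) {
  console.log(`${c.name}: ${c.method}, weight ${c.weight}, threshold ${c.threshold}`);
}
```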
### `createJudge`

```typescript
const judge = createJudge({
  apiKey: string,
  model?: string,     // default: 'claude-sonnet-4-20250514'
  maxTokens?: number,
});
```
### Types

```typescript
interface EvalInput {
  content: string;
  contentType: string; // 'lesson' | 'challenge' | 'review' | 'quiz'
  topic?: string;
  difficulty?: string;
  previousContent?: string[]; // for repetition checking
  metadata?: Record<string, unknown>;
}

interface EvalResult {
  overallScore: number; // weighted average (0-1)
  passed: boolean;      // all criteria passed their thresholds
  scores: EvalScore[];
  contentType: string;
  evaluatedAt: string;  // ISO timestamp
}

interface EvalScore {
  criterion: string;
  score: number; // normalized (0-1)
  rawScore: number;
  maxScore: number;
  passed: boolean;
  reasoning: string;
  metadata?: Record<string, unknown>;
}
```
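As a usage sketch, the `scores` array makes failure reporting straightforward (hypothetical helper, not part of the package):

```typescript
// Print each criterion that missed its threshold, with the judge's reasoning.
function reportFailures(result: EvalResult): void {
  const failures = result.scores.filter((s) => !s.passed);
  if (failures.length === 0) {
    console.log(`All criteria passed (overall ${result.overallScore.toFixed(2)})`);
    return;
  }
  for (const f of failures) {
    console.log(`${f.criterion}: ${f.rawScore}/${f.maxScore} (${f.reasoning})`);
  }
}
```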
## Adding a New Criterion

1. Define a `JudgeRubric` with a 1-5 scale (if LLM) in the appropriate `src/criteria/*.ts`
2. Export an `EvalCriterion` object with an `evaluate()` function (see the sketch after this list)
3. Add it to the exported array at the bottom of the file
4. Add tests — mock the `Judge` for LLM criteria, use fixtures for deterministic
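A sketch of what steps 1-3 might produce for a hypothetical lesson criterion, following the pattern of the shipped ones (the `clarity_of_examples` name and abbreviated rubric are invented for illustration, and this assumes `EvalCriterion` and `JudgeRubric` are importable from the package's types):

```typescript
import type { EvalCriterion, JudgeRubric } from 'edu-eval';

// Hypothetical rubric: 1-5 scale like the shipped criteria (abbreviated here).
const clarityOfExamplesRubric: JudgeRubric = {
  criterion: 'Clarity of Examples',
  description: 'Are the examples easy to follow and clearly tied to the concept?',
  scale: [
    { score: 1, label: 'Confusing', description: 'Examples obscure the concept' },
    { score: 3, label: 'Adequate', description: 'Examples can be followed with effort' },
    { score: 5, label: 'Excellent', description: 'Examples make the concept obvious' },
  ],
};

export const clarityOfExamples: EvalCriterion = {
  name: 'clarity_of_examples',
  description: 'Examples are easy to follow',
  contentTypes: ['lesson'],
  method: 'llm_judge',
  threshold: 0.5,
  weight: 1.0,
  async evaluate(input, judge) {
    if (!judge) throw new Error('Judge required for clarity_of_examples criterion');
    const result = await judge.score(input.content, clarityOfExamplesRubric);
    const score = (result.score - 1) / 4; // normalize 1-5 to 0-1
    return {
      criterion: 'clarity_of_examples',
      score,
      rawScore: result.score,
      maxScore: 5,
      passed: score >= this.threshold,
      reasoning: result.reasoning,
    };
  },
};

// Step 3: append it to the `lesson` array exported from src/criteria/lesson.ts.
```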
## Development

```bash
npm run build       # tsc -> dist/
npm test            # vitest run (unit tests, no API calls)
npm run test:watch  # vitest in watch mode
```

Tests mock all LLM calls — no Anthropic API key is needed to run the test suite; see the sketch below.
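For instance, an LLM criterion can be exercised with a stub judge that returns a canned rubric score (a sketch assuming vitest and the `judge.score(content, rubric)` shape used by the shipped criteria; the cast exists only to satisfy the type checker in this illustration):

```typescript
import { describe, expect, it } from 'vitest';
import { criteria } from 'edu-eval';

describe('topic_accuracy', () => {
  it('passes when the stub judge scores 4/5', async () => {
    // Stub judge: fixed score and reasoning, no API call.
    const stubJudge = {
      score: async () => ({ score: 4, reasoning: 'Covers the topic well' }),
    };
    const topicAccuracy = criteria.lesson.find((c) => c.name === 'topic_accuracy')!;
    const result = await topicAccuracy.evaluate(
      { content: '# Arrays\n\nmap, filter, reduce...', contentType: 'lesson', topic: 'Array methods' },
      stubJudge as never,
    );
    expect(result.passed).toBe(true);
    expect(result.score).toBeCloseTo(0.75); // (4 - 1) / 4
  });
});
```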
## Project Structure

```
src/
  index.ts          # public API
  types.ts          # all interfaces
  runner.ts         # EvalRunner orchestration
  judge.ts          # Anthropic SDK wrapper
  criteria/
    index.ts        # re-exports all criterion sets
    universal.ts    # criteria for all content types
    lesson.ts       # lesson-specific criteria
    challenge.ts    # challenge-specific criteria
    review.ts       # review-specific criteria
  utils/
    markdown.ts     # markdown parsing helpers
tests/
  fixtures/         # good/bad lesson and challenge markdown
  criteria/         # per-criterion tests
  runner.test.ts    # orchestration tests
  judge.test.ts     # judge tests
```

package/dist/criteria/challenge.d.ts
ADDED
@@ -0,0 +1,7 @@
import type { EvalCriterion } from '../types.js';
export declare const problemClarity: EvalCriterion;
export declare const difficultyCalibration: EvalCriterion;
export declare const hintQuality: EvalCriterion;
export declare const testability: EvalCriterion;
export declare const challenge: EvalCriterion[];
//# sourceMappingURL=challenge.d.ts.map

package/dist/criteria/challenge.d.ts.map
ADDED
@@ -0,0 +1 @@
{"version":3,"file":"challenge.d.ts","sourceRoot":"","sources":["../../src/criteria/challenge.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,aAAa,EAA4C,MAAM,aAAa,CAAC;AAe3F,eAAO,MAAM,cAAc,EAAE,aAoB5B,CAAC;AAeF,eAAO,MAAM,qBAAqB,EAAE,aAuBnC,CAAC;AAeF,eAAO,MAAM,WAAW,EAAE,aAoBzB,CAAC;AAeF,eAAO,MAAM,WAAW,EAAE,aAoBzB,CAAC;AAEF,eAAO,MAAM,SAAS,EAAE,aAAa,EAKpC,CAAC"}

package/dist/criteria/challenge.js
ADDED
@@ -0,0 +1,142 @@
const problemClarityRubric = {
    criterion: 'Problem Clarity',
    description: 'Is the problem statement unambiguous? Can the reader understand exactly what they need to build/solve?',
    scale: [
        { score: 1, label: 'Unclear', description: 'Vague or contradictory requirements, impossible to know what to build' },
        { score: 2, label: 'Ambiguous', description: 'General idea is clear but key details are missing or unclear' },
        { score: 3, label: 'Clear', description: 'Requirements are understandable, minor ambiguities acceptable' },
        { score: 4, label: 'Precise', description: 'Clear inputs, outputs, and constraints with examples' },
        { score: 5, label: 'Crystal clear', description: 'Unambiguous spec with examples, edge cases noted, nothing left to guess' },
    ],
};
export const problemClarity = {
    name: 'problem_clarity',
    description: 'Problem statement is unambiguous and complete',
    contentTypes: ['challenge'],
    method: 'llm_judge',
    threshold: 0.5,
    weight: 1.5,
    async evaluate(input, judge) {
        if (!judge)
            throw new Error('Judge required for problem_clarity criterion');
        const result = await judge.score(input.content, problemClarityRubric);
        const score = (result.score - 1) / 4;
        return {
            criterion: 'problem_clarity',
            score,
            rawScore: result.score,
            maxScore: 5,
            passed: score >= this.threshold,
            reasoning: result.reasoning,
        };
    },
};
const difficultyCalibrationsRubric = {
    criterion: 'Difficulty Calibration',
    description: 'Does the challenge match the expected difficulty level? Consider: concept complexity, number of steps, edge cases, and expected time to solve.',
    scale: [
        { score: 1, label: 'Way off', description: 'Trivial problem labeled hard, or impossible problem labeled easy' },
        { score: 2, label: 'Miscalibrated', description: 'Noticeably easier or harder than stated level' },
        { score: 3, label: 'Reasonable', description: 'Roughly matches the stated difficulty, minor miscalibration' },
        { score: 4, label: 'Well-calibrated', description: 'Difficulty matches expectations, appropriate scope' },
        { score: 5, label: 'Perfectly tuned', description: 'Exactly the right level of challenge for the stated difficulty' },
    ],
};
export const difficultyCalibration = {
    name: 'difficulty_calibration',
    description: 'Challenge matches the expected difficulty level',
    contentTypes: ['challenge'],
    method: 'llm_judge',
    threshold: 0.5,
    weight: 1.0,
    async evaluate(input, judge) {
        if (!judge)
            throw new Error('Judge required for difficulty_calibration criterion');
        const context = input.difficulty
            ? `Expected difficulty: ${input.difficulty}`
            : undefined;
        const result = await judge.score(input.content, difficultyCalibrationsRubric, context);
        const score = (result.score - 1) / 4;
        return {
            criterion: 'difficulty_calibration',
            score,
            rawScore: result.score,
            maxScore: 5,
            passed: score >= this.threshold,
            reasoning: result.reasoning,
        };
    },
};
const hintQualityRubric = {
    criterion: 'Hint Quality',
    description: 'Are hints provided? Are they progressive (nudge → more direct)? Do they help without giving away the answer?',
    scale: [
        { score: 1, label: 'No hints', description: 'No hints or guidance provided at all' },
        { score: 2, label: 'Weak hints', description: 'Hints exist but are too vague or give away the answer' },
        { score: 3, label: 'Adequate', description: 'Hints provide useful direction without spoiling the solution' },
        { score: 4, label: 'Good progression', description: 'Multiple hint levels from gentle nudge to more direct guidance' },
        { score: 5, label: 'Perfect scaffolding', description: 'Progressive hints that teach problem-solving strategy, not just the answer' },
    ],
};
export const hintQuality = {
    name: 'hint_quality',
    description: 'Hints are progressive and helpful without spoiling the solution',
    contentTypes: ['challenge'],
    method: 'llm_judge',
    threshold: 0.5,
    weight: 1.0,
    async evaluate(input, judge) {
        if (!judge)
            throw new Error('Judge required for hint_quality criterion');
        const result = await judge.score(input.content, hintQualityRubric);
        const score = (result.score - 1) / 4;
        return {
            criterion: 'hint_quality',
            score,
            rawScore: result.score,
            maxScore: 5,
            passed: score >= this.threshold,
            reasoning: result.reasoning,
        };
    },
};
const testabilityRubric = {
    criterion: 'Testability',
    description: 'Can the solution be verified? Are there clear expected outputs, test cases, or validation criteria?',
    scale: [
        { score: 1, label: 'Unverifiable', description: 'No way to know if a solution is correct' },
        { score: 2, label: 'Vaguely testable', description: 'General idea of correctness but no concrete checks' },
        { score: 3, label: 'Testable', description: 'Expected behavior is clear enough to verify manually' },
        { score: 4, label: 'Well-specified', description: 'Clear expected outputs or test cases provided' },
        { score: 5, label: 'Fully specified', description: 'Complete test cases with edge cases, ready to validate automatically' },
    ],
};
export const testability = {
    name: 'testability',
    description: 'Solution is verifiable with clear expected outputs',
    contentTypes: ['challenge'],
    method: 'llm_judge',
    threshold: 0.5,
    weight: 1.0,
    async evaluate(input, judge) {
        if (!judge)
            throw new Error('Judge required for testability criterion');
        const result = await judge.score(input.content, testabilityRubric);
        const score = (result.score - 1) / 4;
        return {
            criterion: 'testability',
            score,
            rawScore: result.score,
            maxScore: 5,
            passed: score >= this.threshold,
            reasoning: result.reasoning,
        };
    },
};
export const challenge = [
    problemClarity,
    difficultyCalibration,
    hintQuality,
    testability,
];
//# sourceMappingURL=challenge.js.map

package/dist/criteria/challenge.js.map
ADDED
@@ -0,0 +1 @@
{"version":3,"file":"challenge.js","sourceRoot":"","sources":["../../src/criteria/challenge.ts"],"names":[],"mappings":"AAEA,MAAM,oBAAoB,GAAgB;IACxC,SAAS,EAAE,iBAAiB;IAC5B,WAAW,EACT,wGAAwG;IAC1G,KAAK,EAAE;QACL,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,SAAS,EAAE,WAAW,EAAE,uEAAuE,EAAE;QACpH,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,WAAW,EAAE,WAAW,EAAE,8DAA8D,EAAE;QAC7G,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,OAAO,EAAE,WAAW,EAAE,+DAA+D,EAAE;QAC1G,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,SAAS,EAAE,WAAW,EAAE,sDAAsD,EAAE;QACnG,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,eAAe,EAAE,WAAW,EAAE,yEAAyE,EAAE;KAC7H;CACF,CAAC;AAEF,MAAM,CAAC,MAAM,cAAc,GAAkB;IAC3C,IAAI,EAAE,iBAAiB;IACvB,WAAW,EAAE,+CAA+C;IAC5D,YAAY,EAAE,CAAC,WAAW,CAAC;IAC3B,MAAM,EAAE,WAAW;IACnB,SAAS,EAAE,GAAG;IACd,MAAM,EAAE,GAAG;IACX,KAAK,CAAC,QAAQ,CAAC,KAAgB,EAAE,KAAa;QAC5C,IAAI,CAAC,KAAK;YAAE,MAAM,IAAI,KAAK,CAAC,8CAA8C,CAAC,CAAC;QAC5E,MAAM,MAAM,GAAG,MAAM,KAAK,CAAC,KAAK,CAAC,KAAK,CAAC,OAAO,EAAE,oBAAoB,CAAC,CAAC;QACtE,MAAM,KAAK,GAAG,CAAC,MAAM,CAAC,KAAK,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC;QACrC,OAAO;YACL,SAAS,EAAE,iBAAiB;YAC5B,KAAK;YACL,QAAQ,EAAE,MAAM,CAAC,KAAK;YACtB,QAAQ,EAAE,CAAC;YACX,MAAM,EAAE,KAAK,IAAI,IAAI,CAAC,SAAS;YAC/B,SAAS,EAAE,MAAM,CAAC,SAAS;SAC5B,CAAC;IACJ,CAAC;CACF,CAAC;AAEF,MAAM,4BAA4B,GAAgB;IAChD,SAAS,EAAE,wBAAwB;IACnC,WAAW,EACT,gJAAgJ;IAClJ,KAAK,EAAE;QACL,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,SAAS,EAAE,WAAW,EAAE,kEAAkE,EAAE;QAC/G,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,eAAe,EAAE,WAAW,EAAE,+CAA+C,EAAE;QAClG,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,YAAY,EAAE,WAAW,EAAE,6DAA6D,EAAE;QAC7G,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,iBAAiB,EAAE,WAAW,EAAE,oDAAoD,EAAE;QACzG,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,iBAAiB,EAAE,WAAW,EAAE,gEAAgE,EAAE;KACtH;CACF,CAAC;AAEF,MAAM,CAAC,MAAM,qBAAqB,GAAkB;IAClD,IAAI,EAAE,wBAAwB;IAC9B,WAAW,EAAE,iDAAiD;IAC9D,YAAY,EAAE,CAAC,WAAW,CAAC;IAC3B,MAAM,EAAE,WAAW;IACnB,SAAS,EAAE,GAAG;IACd,MAAM,EAAE,GAAG;IACX,KAAK,CAAC,QAAQ,CAAC,KAAgB,EAAE,KAAa;QAC5C,IAAI,CAAC,KAAK;YAAE,MAAM,IAAI,KAAK,CAAC,qDAAqD,CAAC,CAAC;QACnF,MAAM,OAAO,GAAG,KAAK,CAAC,UAAU;YAC9B,CAAC,CAAC,wBAAwB,KAAK,CAAC,UAAU,EAAE;YAC5C,CAAC,CAAC,SAAS,CAAC;QACd,MAAM,MAAM,GAAG,MAAM,KAAK,CAAC,KAAK,CAAC,KAAK,CAAC,OAAO,EAAE,4BAA4B,EAAE,OAAO,CAAC,CAAC;QACvF,MAAM,KAAK,GAAG,CAAC,MAAM,CAAC,KAAK,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC;QACrC,OAAO;YACL,SAAS,EAAE,wBAAwB;YACnC,KAAK;YACL,QAAQ,EAAE,MAAM,CAAC,KAAK;YACtB,QAAQ,EAAE,CAAC;YACX,MAAM,EAAE,KAAK,IAAI,IAAI,CAAC,SAAS;YAC/B,SAAS,EAAE,MAAM,CAAC,SAAS;SAC5B,CAAC;IACJ,CAAC;CACF,CAAC;AAEF,MAAM,iBAAiB,GAAgB;IACrC,SAAS,EAAE,cAAc;IACzB,WAAW,EACT,8GAA8G;IAChH,KAAK,EAAE;QACL,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,UAAU,EAAE,WAAW,EAAE,sCAAsC,EAAE;QACpF,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,YAAY,EAAE,WAAW,EAAE,uDAAuD,EAAE;QACvG,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,UAAU,EAAE,WAAW,EAAE,8DAA8D,EAAE;QAC5G,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,kBAAkB,EAAE,WAAW,EAAE,gEAAgE,EAAE;QACtH,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,qBAAqB,EAAE,WAAW,EAAE,4EAA4E,EAAE;KACtI;CACF,CAAC;AAEF,MAAM,CAAC,MAAM,WAAW,GAAkB;IACxC,IAAI,EAAE,cAAc;IACpB,WAAW,EAAE,iEAAiE;IAC9E,YAAY,EAAE,CAAC,WAAW,CAAC;IAC3B,MAAM,EAAE,WAAW;IACnB,SAAS,EAAE,GAAG;IACd,MAAM,EAAE,GAAG;IACX,KAAK,CAAC,QAAQ,CAAC,KAAgB,EAAE,KAAa;QAC5C,IAAI,CAAC,KAAK;YAAE,MAAM,IAAI,KAAK,CAAC,2CAA2C,CAAC,CAAC;QACzE,MAAM,MAAM,GAAG,MAAM,KAAK,CAAC,KAAK,CAAC,KAAK,CAAC,OAAO,EAAE,iBAAiB,CAAC,CAAC;QACnE,MAAM,KAAK,GAAG,CAAC,MAAM,CAAC,KAAK,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC;QACrC,OAAO;YACL,SAAS,EAAE,cAAc;YACzB,KAAK;YACL,QAAQ,EAAE,MAAM,CAAC,KAAK;YACtB,QAAQ,EAAE,CAAC;YACX,MAAM,EAAE,KAAK,IAAI,IAAI,CAAC,SAAS;YAC/B,SAAS,EAAE,MAAM,CAAC,SAAS;SAC5B,CAAC;IACJ,CAAC;CACF,CAAC;AAEF,MAAM,iBAAiB,GAAgB;IACrC,SAAS,EAAE,aAAa;IACxB,WAAW,EACT,qGAAqG;IACvG,KAAK,EAAE;QACL,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,cAAc,EAAE,WAAW,EAAE,yCAAyC,EAAE;QAC3F,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,kBAAkB,EAAE,WAAW,EAAE,oDAAoD,EAAE;QAC1G,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,UAAU,EAAE,WAAW,EAAE,sDAAsD,EAAE;QACpG,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,gBAAgB,EAAE,WAAW,EAAE,+CAA+C,EAAE;QACnG,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,iBAAiB,EAAE,WAAW,EAAE,sEAAsE,EAAE;KAC5H;CACF,CAAC;AAEF,MAAM,CAAC,MAAM,WAAW,GAAkB;IACxC,IAAI,EAAE,aAAa;IACnB,WAAW,EAAE,oDAAoD;IACjE,YAAY,EAAE,CAAC,WAAW,CAAC;IAC3B,MAAM,EAAE,WAAW;IACnB,SAAS,EAAE,GAAG;IACd,MAAM,EAAE,GAAG;IACX,KAAK,CAAC,QAAQ,CAAC,KAAgB,EAAE,KAAa;QAC5C,IAAI,CAAC,KAAK;YAAE,MAAM,IAAI,KAAK,CAAC,0CAA0C,CAAC,CAAC;QACxE,MAAM,MAAM,GAAG,MAAM,KAAK,CAAC,KAAK,CAAC,KAAK,CAAC,OAAO,EAAE,iBAAiB,CAAC,CAAC;QACnE,MAAM,KAAK,GAAG,CAAC,MAAM,CAAC,KAAK,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC;QACrC,OAAO;YACL,SAAS,EAAE,aAAa;YACxB,KAAK;YACL,QAAQ,EAAE,MAAM,CAAC,KAAK;YACtB,QAAQ,EAAE,CAAC;YACX,MAAM,EAAE,KAAK,IAAI,IAAI,CAAC,SAAS;YAC/B,SAAS,EAAE,MAAM,CAAC,SAAS;SAC5B,CAAC;IACJ,CAAC;CACF,CAAC;AAEF,MAAM,CAAC,MAAM,SAAS,GAAoB;IACxC,cAAc;IACd,qBAAqB;IACrB,WAAW;IACX,WAAW;CACZ,CAAC"}

package/dist/criteria/index.d.ts.map
ADDED
@@ -0,0 +1 @@
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/criteria/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,gBAAgB,CAAC;AAC3C,OAAO,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AACrC,OAAO,EAAE,SAAS,EAAE,MAAM,gBAAgB,CAAC;AAC3C,OAAO,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC"}

package/dist/criteria/index.js.map
ADDED
@@ -0,0 +1 @@
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/criteria/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,gBAAgB,CAAC;AAC3C,OAAO,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AACrC,OAAO,EAAE,SAAS,EAAE,MAAM,gBAAgB,CAAC;AAC3C,OAAO,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC"}

package/dist/criteria/lesson.d.ts
ADDED
@@ -0,0 +1,7 @@
import type { EvalCriterion } from '../types.js';
export declare const topicAccuracy: EvalCriterion;
export declare const pedagogicalStructure: EvalCriterion;
export declare const codeQuality: EvalCriterion;
export declare const progressiveDifficulty: EvalCriterion;
export declare const lesson: EvalCriterion[];
//# sourceMappingURL=lesson.d.ts.map

package/dist/criteria/lesson.d.ts.map
ADDED
@@ -0,0 +1 @@
{"version":3,"file":"lesson.d.ts","sourceRoot":"","sources":["../../src/criteria/lesson.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,aAAa,EAA4C,MAAM,aAAa,CAAC;AAc3F,eAAO,MAAM,aAAa,EAAE,aAqB3B,CAAC;AAeF,eAAO,MAAM,oBAAoB,EAAE,aAoBlC,CAAC;AAeF,eAAO,MAAM,WAAW,EAAE,aAoBzB,CAAC;AAeF,eAAO,MAAM,qBAAqB,EAAE,aAuBnC,CAAC;AAEF,eAAO,MAAM,MAAM,EAAE,aAAa,EAKjC,CAAC"}

package/dist/criteria/lesson.js
ADDED
@@ -0,0 +1,143 @@
const topicAccuracyRubric = {
    criterion: 'Topic Accuracy',
    description: 'Does the content accurately cover the stated topic? Is the information correct and relevant?',
    scale: [
        { score: 1, label: 'Off-topic', description: 'Content does not address the stated topic' },
        { score: 2, label: 'Tangential', description: 'Loosely related but misses core concepts' },
        { score: 3, label: 'Accurate', description: 'Covers the topic correctly with minor gaps' },
        { score: 4, label: 'Thorough', description: 'Comprehensive coverage, accurate details' },
        { score: 5, label: 'Expert-level', description: 'Deep, nuanced, no factual issues, adds genuine insight' },
    ],
};
export const topicAccuracy = {
    name: 'topic_accuracy',
    description: 'Content accurately covers the stated topic',
    contentTypes: ['lesson'],
    method: 'llm_judge',
    threshold: 0.5,
    weight: 1.5,
    async evaluate(input, judge) {
        if (!judge)
            throw new Error('Judge required for topic_accuracy criterion');
        const context = input.topic ? `Expected topic: ${input.topic}` : undefined;
        const result = await judge.score(input.content, topicAccuracyRubric, context);
        const score = (result.score - 1) / 4;
        return {
            criterion: 'topic_accuracy',
            score,
            rawScore: result.score,
            maxScore: 5,
            passed: score >= this.threshold,
            reasoning: result.reasoning,
        };
    },
};
const pedagogicalStructureRubric = {
    criterion: 'Pedagogical Structure',
    description: 'Does the lesson follow good teaching structure? Look for: intuition building → concrete examples → hands-on practice → synthesis/takeaways.',
    scale: [
        { score: 1, label: 'No structure', description: 'Random facts dumped with no progression' },
        { score: 2, label: 'Weak structure', description: 'Some organization but missing key phases (e.g. no examples or no practice)' },
        { score: 3, label: 'Solid structure', description: 'Clear progression from concept to practice, covers main phases' },
        { score: 4, label: 'Strong pedagogy', description: 'Well-scaffolded with intuition, examples, practice, and synthesis' },
        { score: 5, label: 'Masterful', description: 'Perfect scaffolding, builds mental models, practice reinforces theory' },
    ],
};
export const pedagogicalStructure = {
    name: 'pedagogical_structure',
    description: 'Lesson follows intuition → examples → practice → synthesis',
    contentTypes: ['lesson'],
    method: 'llm_judge',
    threshold: 0.5,
    weight: 1.5,
    async evaluate(input, judge) {
        if (!judge)
            throw new Error('Judge required for pedagogical_structure criterion');
        const result = await judge.score(input.content, pedagogicalStructureRubric);
        const score = (result.score - 1) / 4;
        return {
            criterion: 'pedagogical_structure',
            score,
            rawScore: result.score,
            maxScore: 5,
            passed: score >= this.threshold,
            reasoning: result.reasoning,
        };
    },
};
const codeQualityRubric = {
    criterion: 'Code Quality',
    description: 'Are the code examples relevant, likely to run correctly, and well-explained? Consider: relevance to topic, correctness, comments/explanations, and practical applicability.',
    scale: [
        { score: 1, label: 'Broken', description: 'Code has obvious errors or is irrelevant to the topic' },
        { score: 2, label: 'Weak', description: 'Code runs but is poorly explained or only loosely relevant' },
        { score: 3, label: 'Good', description: 'Relevant, likely correct code with adequate explanation' },
        { score: 4, label: 'Strong', description: 'Clean, well-commented code that directly reinforces the lesson' },
        { score: 5, label: 'Exemplary', description: 'Production-quality code, excellent comments, teaches through the code itself' },
    ],
};
export const codeQuality = {
    name: 'code_quality',
    description: 'Code examples are relevant, correct, and well-explained',
    contentTypes: ['lesson'],
    method: 'llm_judge',
    threshold: 0.5,
    weight: 1.0,
    async evaluate(input, judge) {
        if (!judge)
            throw new Error('Judge required for code_quality criterion');
        const result = await judge.score(input.content, codeQualityRubric);
        const score = (result.score - 1) / 4;
        return {
            criterion: 'code_quality',
            score,
            rawScore: result.score,
            maxScore: 5,
            passed: score >= this.threshold,
            reasoning: result.reasoning,
        };
    },
};
const progressiveDifficultyRubric = {
    criterion: 'Progressive Difficulty',
    description: 'Does the content build appropriately on prerequisites? Is the difficulty level well-calibrated for the stated level?',
    scale: [
        { score: 1, label: 'Mismatched', description: 'Far too easy or hard for the stated level, no scaffolding' },
        { score: 2, label: 'Poorly calibrated', description: 'Difficulty jumps around or assumes too much/little' },
        { score: 3, label: 'Appropriate', description: 'Matches stated difficulty, reasonable prerequisite assumptions' },
        { score: 4, label: 'Well-calibrated', description: 'Smooth difficulty curve, clear about prerequisites' },
        { score: 5, label: 'Perfect scaffolding', description: 'Masterfully builds from known to unknown at exactly the right pace' },
    ],
};
export const progressiveDifficulty = {
    name: 'progressive_difficulty',
    description: 'Content builds appropriately on prerequisites',
    contentTypes: ['lesson'],
    method: 'llm_judge',
    threshold: 0.5,
    weight: 1.0,
    async evaluate(input, judge) {
        if (!judge)
            throw new Error('Judge required for progressive_difficulty criterion');
        const context = input.difficulty
            ? `Expected difficulty level: ${input.difficulty}`
            : undefined;
        const result = await judge.score(input.content, progressiveDifficultyRubric, context);
        const score = (result.score - 1) / 4;
        return {
            criterion: 'progressive_difficulty',
            score,
            rawScore: result.score,
            maxScore: 5,
            passed: score >= this.threshold,
            reasoning: result.reasoning,
        };
    },
};
export const lesson = [
    topicAccuracy,
    pedagogicalStructure,
    codeQuality,
    progressiveDifficulty,
];
//# sourceMappingURL=lesson.js.map

package/dist/criteria/lesson.js.map
ADDED
@@ -0,0 +1 @@
{"version":3,"file":"lesson.js","sourceRoot":"","sources":["../../src/criteria/lesson.ts"],"names":[],"mappings":"AAEA,MAAM,mBAAmB,GAAgB;IACvC,SAAS,EAAE,gBAAgB;IAC3B,WAAW,EAAE,8FAA8F;IAC3G,KAAK,EAAE;QACL,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,WAAW,EAAE,WAAW,EAAE,2CAA2C,EAAE;QAC1F,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,YAAY,EAAE,WAAW,EAAE,0CAA0C,EAAE;QAC1F,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,UAAU,EAAE,WAAW,EAAE,4CAA4C,EAAE;QAC1F,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,UAAU,EAAE,WAAW,EAAE,0CAA0C,EAAE;QACxF,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,cAAc,EAAE,WAAW,EAAE,wDAAwD,EAAE;KAC3G;CACF,CAAC;AAEF,MAAM,CAAC,MAAM,aAAa,GAAkB;IAC1C,IAAI,EAAE,gBAAgB;IACtB,WAAW,EAAE,4CAA4C;IACzD,YAAY,EAAE,CAAC,QAAQ,CAAC;IACxB,MAAM,EAAE,WAAW;IACnB,SAAS,EAAE,GAAG;IACd,MAAM,EAAE,GAAG;IACX,KAAK,CAAC,QAAQ,CAAC,KAAgB,EAAE,KAAa;QAC5C,IAAI,CAAC,KAAK;YAAE,MAAM,IAAI,KAAK,CAAC,6CAA6C,CAAC,CAAC;QAC3E,MAAM,OAAO,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,mBAAmB,KAAK,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC;QAC3E,MAAM,MAAM,GAAG,MAAM,KAAK,CAAC,KAAK,CAAC,KAAK,CAAC,OAAO,EAAE,mBAAmB,EAAE,OAAO,CAAC,CAAC;QAC9E,MAAM,KAAK,GAAG,CAAC,MAAM,CAAC,KAAK,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC;QACrC,OAAO;YACL,SAAS,EAAE,gBAAgB;YAC3B,KAAK;YACL,QAAQ,EAAE,MAAM,CAAC,KAAK;YACtB,QAAQ,EAAE,CAAC;YACX,MAAM,EAAE,KAAK,IAAI,IAAI,CAAC,SAAS;YAC/B,SAAS,EAAE,MAAM,CAAC,SAAS;SAC5B,CAAC;IACJ,CAAC;CACF,CAAC;AAEF,MAAM,0BAA0B,GAAgB;IAC9C,SAAS,EAAE,uBAAuB;IAClC,WAAW,EACT,6IAA6I;IAC/I,KAAK,EAAE;QACL,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,cAAc,EAAE,WAAW,EAAE,yCAAyC,EAAE;QAC3F,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,gBAAgB,EAAE,WAAW,EAAE,4EAA4E,EAAE;QAChI,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,iBAAiB,EAAE,WAAW,EAAE,gEAAgE,EAAE;QACrH,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,iBAAiB,EAAE,WAAW,EAAE,mEAAmE,EAAE;QACxH,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,WAAW,EAAE,WAAW,EAAE,uEAAuE,EAAE;KACvH;CACF,CAAC;AAEF,MAAM,CAAC,MAAM,oBAAoB,GAAkB;IACjD,IAAI,EAAE,uBAAuB;IAC7B,WAAW,EAAE,4DAA4D;IACzE,YAAY,EAAE,CAAC,QAAQ,CAAC;IACxB,MAAM,EAAE,WAAW;IACnB,SAAS,EAAE,GAAG;IACd,MAAM,EAAE,GAAG;IACX,KAAK,CAAC,QAAQ,CAAC,KAAgB,EAAE,KAAa;QAC5C,IAAI,CAAC,KAAK;YAAE,MAAM,IAAI,KAAK,CAAC,oDAAoD,CAAC,CAAC;QAClF,MAAM,MAAM,GAAG,MAAM,KAAK,CAAC,KAAK,CAAC,KAAK,CAAC,OAAO,EAAE,0BAA0B,CAAC,CAAC;QAC5E,MAAM,KAAK,GAAG,CAAC,MAAM,CAAC,KAAK,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC;QACrC,OAAO;YACL,SAAS,EAAE,uBAAuB;YAClC,KAAK;YACL,QAAQ,EAAE,MAAM,CAAC,KAAK;YACtB,QAAQ,EAAE,CAAC;YACX,MAAM,EAAE,KAAK,IAAI,IAAI,CAAC,SAAS;YAC/B,SAAS,EAAE,MAAM,CAAC,SAAS;SAC5B,CAAC;IACJ,CAAC;CACF,CAAC;AAEF,MAAM,iBAAiB,GAAgB;IACrC,SAAS,EAAE,cAAc;IACzB,WAAW,EACT,6KAA6K;IAC/K,KAAK,EAAE;QACL,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,QAAQ,EAAE,WAAW,EAAE,uDAAuD,EAAE;QACnG,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,MAAM,EAAE,WAAW,EAAE,4DAA4D,EAAE;QACtG,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,MAAM,EAAE,WAAW,EAAE,yDAAyD,EAAE;QACnG,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,QAAQ,EAAE,WAAW,EAAE,gEAAgE,EAAE;QAC5G,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,WAAW,EAAE,WAAW,EAAE,8EAA8E,EAAE;KAC9H;CACF,CAAC;AAEF,MAAM,CAAC,MAAM,WAAW,GAAkB;IACxC,IAAI,EAAE,cAAc;IACpB,WAAW,EAAE,yDAAyD;IACtE,YAAY,EAAE,CAAC,QAAQ,CAAC;IACxB,MAAM,EAAE,WAAW;IACnB,SAAS,EAAE,GAAG;IACd,MAAM,EAAE,GAAG;IACX,KAAK,CAAC,QAAQ,CAAC,KAAgB,EAAE,KAAa;QAC5C,IAAI,CAAC,KAAK;YAAE,MAAM,IAAI,KAAK,CAAC,2CAA2C,CAAC,CAAC;QACzE,MAAM,MAAM,GAAG,MAAM,KAAK,CAAC,KAAK,CAAC,KAAK,CAAC,OAAO,EAAE,iBAAiB,CAAC,CAAC;QACnE,MAAM,KAAK,GAAG,CAAC,MAAM,CAAC,KAAK,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC;QACrC,OAAO;YACL,SAAS,EAAE,cAAc;YACzB,KAAK;YACL,QAAQ,EAAE,MAAM,CAAC,KAAK;YACtB,QAAQ,EAAE,CAAC;YACX,MAAM,EAAE,KAAK,IAAI,IAAI,CAAC,SAAS;YAC/B,SAAS,EAAE,MAAM,CAAC,SAAS;SAC5B,CAAC;IACJ,CAAC;CACF,CAAC;AAEF,MAAM,2BAA2B,GAAgB;IAC/C,SAAS,EAAE,wBAAwB;IACnC,WAAW,EACT,sHAAsH;IACxH,KAAK,EAAE;QACL,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,YAAY,EAAE,WAAW,EAAE,2DAA2D,EAAE;QAC3G,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,mBAAmB,EAAE,WAAW,EAAE,oDAAoD,EAAE;QAC3G,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,aAAa,EAAE,WAAW,EAAE,gEAAgE,EAAE;QACjH,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,iBAAiB,EAAE,WAAW,EAAE,oDAAoD,EAAE;QACzG,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,qBAAqB,EAAE,WAAW,EAAE,oEAAoE,EAAE;KAC9H;CACF,CAAC;AAEF,MAAM,CAAC,MAAM,qBAAqB,GAAkB;IAClD,IAAI,EAAE,wBAAwB;IAC9B,WAAW,EAAE,+CAA+C;IAC5D,YAAY,EAAE,CAAC,QAAQ,CAAC;IACxB,MAAM,EAAE,WAAW;IACnB,SAAS,EAAE,GAAG;IACd,MAAM,EAAE,GAAG;IACX,KAAK,CAAC,QAAQ,CAAC,KAAgB,EAAE,KAAa;QAC5C,IAAI,CAAC,KAAK;YAAE,MAAM,IAAI,KAAK,CAAC,qDAAqD,CAAC,CAAC;QACnF,MAAM,OAAO,GAAG,KAAK,CAAC,UAAU;YAC9B,CAAC,CAAC,8BAA8B,KAAK,CAAC,UAAU,EAAE;YAClD,CAAC,CAAC,SAAS,CAAC;QACd,MAAM,MAAM,GAAG,MAAM,KAAK,CAAC,KAAK,CAAC,KAAK,CAAC,OAAO,EAAE,2BAA2B,EAAE,OAAO,CAAC,CAAC;QACtF,MAAM,KAAK,GAAG,CAAC,MAAM,CAAC,KAAK,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC;QACrC,OAAO;YACL,SAAS,EAAE,wBAAwB;YACnC,KAAK;YACL,QAAQ,EAAE,MAAM,CAAC,KAAK;YACtB,QAAQ,EAAE,CAAC;YACX,MAAM,EAAE,KAAK,IAAI,IAAI,CAAC,SAAS;YAC/B,SAAS,EAAE,MAAM,CAAC,SAAS;SAC5B,CAAC;IACJ,CAAC;CACF,CAAC;AAEF,MAAM,CAAC,MAAM,MAAM,GAAoB;IACrC,aAAa;IACb,oBAAoB;IACpB,WAAW;IACX,qBAAqB;CACtB,CAAC"}

package/dist/criteria/review.d.ts
ADDED
@@ -0,0 +1,6 @@
import type { EvalCriterion } from '../types.js';
export declare const brevity: EvalCriterion;
export declare const actionability: EvalCriterion;
export declare const honesty: EvalCriterion;
export declare const review: EvalCriterion[];
//# sourceMappingURL=review.d.ts.map

package/dist/criteria/review.d.ts.map
ADDED
@@ -0,0 +1 @@
{"version":3,"file":"review.d.ts","sourceRoot":"","sources":["../../src/criteria/review.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,aAAa,EAA4C,MAAM,aAAa,CAAC;AAI3F,eAAO,MAAM,OAAO,EAAE,aAyBrB,CAAC;AAeF,eAAO,MAAM,aAAa,EAAE,aAoB3B,CAAC;AAeF,eAAO,MAAM,OAAO,EAAE,aAoBrB,CAAC;AAEF,eAAO,MAAM,MAAM,EAAE,aAAa,EAAsC,CAAC"}

package/dist/criteria/review.js
ADDED
@@ -0,0 +1,93 @@
const DEFAULT_REVIEW_MAX_CHARS = 2000;
export const brevity = {
    name: 'brevity',
    description: 'Review content is concise and under character budget',
    contentTypes: ['review'],
    method: 'deterministic',
    threshold: 1.0,
    weight: 0.5,
    async evaluate(input) {
        const maxChars = input.metadata?.maxReviewChars ?? DEFAULT_REVIEW_MAX_CHARS;
        const len = input.content.length;
        const passed = len <= maxChars;
        const score = passed ? 1.0 : Math.max(0, 1 - (len - maxChars) / maxChars);
        return {
            criterion: 'brevity',
            score,
            rawScore: len,
            maxScore: maxChars,
            passed,
            reasoning: passed
                ? `Review is ${len} chars, within ${maxChars} char budget`
                : `Review is ${len} chars, exceeds ${maxChars} char budget`,
        };
    },
};
const actionabilityRubric = {
    criterion: 'Actionability',
    description: 'Does the review contain concrete, specific next steps the reader can take? Not vague encouragement but real actions.',
    scale: [
        { score: 1, label: 'No actions', description: 'Pure commentary with no next steps' },
        { score: 2, label: 'Vague', description: 'Generic advice like "keep practicing" without specifics' },
        { score: 3, label: 'Some actions', description: 'Contains at least one concrete next step' },
        { score: 4, label: 'Actionable', description: 'Multiple specific, prioritized next steps' },
        { score: 5, label: 'Highly actionable', description: 'Clear roadmap with prioritized, specific, measurable actions' },
    ],
};
export const actionability = {
    name: 'actionability',
    description: 'Review contains concrete next steps',
    contentTypes: ['review'],
    method: 'llm_judge',
    threshold: 0.5,
    weight: 1.0,
    async evaluate(input, judge) {
        if (!judge)
            throw new Error('Judge required for actionability criterion');
        const result = await judge.score(input.content, actionabilityRubric);
        const score = (result.score - 1) / 4;
        return {
            criterion: 'actionability',
            score,
            rawScore: result.score,
            maxScore: 5,
            passed: score >= this.threshold,
            reasoning: result.reasoning,
        };
    },
};
const honestyRubric = {
    criterion: 'Honesty',
    description: "Does the review honestly assess gaps and weaknesses? Does it avoid sugarcoating or false encouragement?",
    scale: [
        { score: 1, label: 'Dishonest', description: 'Pure cheerleading, ignores obvious gaps' },
        { score: 2, label: 'Sugarcoated', description: 'Acknowledges issues but softens them to the point of uselessness' },
        { score: 3, label: 'Balanced', description: 'Honest about strengths and weaknesses, constructive tone' },
        { score: 4, label: 'Direct', description: 'Clear-eyed assessment, names gaps specifically while remaining respectful' },
        { score: 5, label: 'Brutally honest', description: 'Unflinching but constructive, prioritizes gaps, no wasted praise' },
    ],
};
export const honesty = {
    name: 'honesty',
    description: "Review honestly assesses gaps without sugarcoating",
    contentTypes: ['review'],
    method: 'llm_judge',
    threshold: 0.5,
    weight: 1.0,
    async evaluate(input, judge) {
        if (!judge)
            throw new Error('Judge required for honesty criterion');
        const result = await judge.score(input.content, honestyRubric);
        const score = (result.score - 1) / 4;
        return {
            criterion: 'honesty',
            score,
            rawScore: result.score,
            maxScore: 5,
            passed: score >= this.threshold,
            reasoning: result.reasoning,
        };
    },
};
export const review = [brevity, actionability, honesty];
//# sourceMappingURL=review.js.map