@learning-commons/evaluators 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +318 -0
- package/dist/index.cjs +1899 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +1142 -0
- package/dist/index.d.ts +1142 -0
- package/dist/index.js +1866 -0
- package/dist/index.js.map +1 -0
- package/package.json +84 -0
package/dist/index.js
ADDED
|
@@ -0,0 +1,1866 @@
|
|
|
1
|
+
import { z } from 'zod';
|
|
2
|
+
import { randomUUID } from 'crypto';
|
|
3
|
+
import { readFileSync, mkdirSync, writeFileSync } from 'fs';
|
|
4
|
+
import { dirname, join } from 'path';
|
|
5
|
+
import { homedir } from 'os';
|
|
6
|
+
import { fileURLToPath } from 'url';
|
|
7
|
+
import { generateText, Output } from 'ai';
|
|
8
|
+
import nlp from 'compromise';
|
|
9
|
+
import { syllable } from 'syllable';
|
|
10
|
+
import pLimit from 'p-limit';
|
|
11
|
+
|
|
12
|
+
// src/schemas/outputs.ts
|
|
13
|
+
/** The four text-complexity rating labels shared by the evaluator schemas. */
var TextComplexityLevel = z.enum(["Slightly complex", "Moderately complex", "Very complex", "Exceedingly complex"]);

/** Grade-band labels accepted in grade-level results. */
var GradeBand = z.enum([
  "K-1",
  "2-3",
  "4-5",
  "6-8",
  "9-10",
  "11-CCR"
]);

/**
 * Structured-output schema for a grade-level appropriateness result:
 * free-text reasoning, a primary and an alternative grade band, and the
 * scaffolding needed for the alternative band.
 */
var GradeLevelAppropriatenessSchema = z.object({
  reasoning: z
    .string()
    .describe("Your reasoning for your answer in numbered bullet points for 4 steps with a 4th bullet point for synthesis."),
  grade: GradeBand.describe("The appropriate grade level for the text"),
  alternative_grade: GradeBand.describe("An alternative grade level for the text"),
  scaffolding_needed: z
    .string()
    .describe("Scaffolding needed for the text to be appropriate for the alternative grade")
});
|
|
28
|
+
|
|
29
|
+
// src/errors.ts
|
|
30
|
+
/**
 * Base error type that the more specific errors in this module extend.
 * Adds a machine-readable `code` alongside the usual message.
 */
var EvaluatorError = class extends Error {
  /**
   * @param {string} message - Human-readable description of the failure.
   * @param {string} [code] - Machine-readable error code.
   */
  constructor(message, code) {
    super(message);
    this.code = code;
    this.name = "EvaluatorError";
    // captureStackTrace is not available on every engine, hence the optional call.
    Error.captureStackTrace?.(this, this.constructor);
  }
};
|
|
40
|
+
/** EvaluatorError variant tagged with the code "CONFIGURATION_ERROR". */
var ConfigurationError = class extends EvaluatorError {
  /**
   * @param {string} message - Human-readable description of the configuration problem.
   */
  constructor(message) {
    const errorCode = "CONFIGURATION_ERROR";
    super(message, errorCode);
    this.name = "ConfigurationError";
  }
};
|
|
46
|
+
/** EvaluatorError variant tagged with the code "VALIDATION_ERROR". */
var ValidationError = class extends EvaluatorError {
  /**
   * @param {string} message - Human-readable description of the invalid input.
   */
  constructor(message) {
    const errorCode = "VALIDATION_ERROR";
    super(message, errorCode);
    this.name = "ValidationError";
  }
};
|
|
52
|
+
/**
 * Error for a failed provider/API call. Records the HTTP status (when known)
 * and whether the caller may retry the request.
 */
var APIError = class extends EvaluatorError {
  /**
   * @param {string} message - Human-readable description of the failure.
   * @param {number} [statusCode] - HTTP status associated with the failure, if any.
   * @param {boolean} [retryable=false] - Whether retrying the request is reasonable.
   * @param {string} [code] - Machine-readable error code.
   */
  constructor(message, statusCode, retryable = false, code) {
    super(message, code);
    Object.assign(this, { statusCode, retryable, name: "APIError" });
  }
};
|
|
60
|
+
/** Non-retryable APIError tagged with the code "AUTHENTICATION_ERROR". */
var AuthenticationError = class extends APIError {
  /**
   * @param {string} message - Human-readable description of the failure.
   * @param {number} [statusCode] - HTTP status reported for the failure.
   */
  constructor(message, statusCode) {
    const retryable = false;
    super(message, statusCode, retryable, "AUTHENTICATION_ERROR");
    this.name = "AuthenticationError";
  }
};
|
|
66
|
+
/** Retryable APIError with status 429 and the code "RATE_LIMIT_ERROR". */
var RateLimitError = class extends APIError {
  /**
   * @param {string} message - Human-readable description of the failure.
   * @param {number} [retryAfter] - Suggested wait before retrying, stored as given.
   */
  constructor(message, retryAfter) {
    const status = 429;
    const retryable = true;
    super(message, status, retryable, "RATE_LIMIT_ERROR");
    Object.assign(this, { retryAfter, name: "RateLimitError" });
  }
};
|
|
73
|
+
/** APIError without an HTTP status, tagged with the code "NETWORK_ERROR". */
var NetworkError = class extends APIError {
  /**
   * @param {string} message - Human-readable description of the failure.
   * @param {boolean} [retryable=true] - Whether retrying the request is reasonable.
   */
  constructor(message, retryable = true) {
    const noStatus = void 0;
    super(message, noStatus, retryable, "NETWORK_ERROR");
    this.name = "NetworkError";
  }
};
|
|
79
|
+
/** Retryable APIError with status 408 and the code "TIMEOUT_ERROR". */
var TimeoutError = class extends APIError {
  /**
   * @param {string} [message="Request timed out"] - Human-readable description of the failure.
   */
  constructor(message = "Request timed out") {
    const status = 408;
    const retryable = true;
    super(message, status, retryable, "TIMEOUT_ERROR");
    this.name = "TimeoutError";
  }
};
|
|
85
|
+
/**
 * Extract a message, HTTP status and error code from an arbitrary thrown value.
 *
 * The status is taken from a structured `statusCode` / `status` property when
 * the error carries one (a 4xx/5xx integer). Only when neither is present does
 * it fall back to scanning the message for the first standalone 4xx/5xx number,
 * because that heuristic misreads unrelated numbers such as token counts.
 *
 * @param {unknown} error - The value that was thrown.
 * @returns {{message: string, statusCode?: number, code?: string}}
 *   For Error instances: `message`, `statusCode` (possibly undefined) and `code`
 *   (the error's `name` unless it is the generic "Error"). For anything else:
 *   only `message`, the value converted with `String()`.
 */
function parseProviderError(error) {
  if (!(error instanceof Error)) {
    return {
      message: String(error)
    };
  }
  const { message } = error;
  const isErrorStatus = (value) => Number.isInteger(value) && value >= 400 && value <= 599;
  const structuredStatus = [error.statusCode, error.status].find(isErrorStatus);
  let statusCode = structuredStatus;
  if (statusCode === void 0) {
    const statusMatch = message.match(/\b(4\d{2}|5\d{2})\b/);
    statusCode = statusMatch ? Number.parseInt(statusMatch[1], 10) : void 0;
  }
  return {
    message,
    statusCode,
    code: error.name !== "Error" ? error.name : void 0
  };
}
|
|
100
|
+
/**
 * Convert an arbitrary provider failure into one of this module's typed errors.
 *
 * Classification order (first match wins):
 *   1. status 401/403          -> AuthenticationError
 *   2. status 429              -> RateLimitError (with `retryAfter` in ms when the
 *                                 message contains "retry-after: <seconds>")
 *   3. network-looking message -> NetworkError
 *   4. timeout-looking message -> TimeoutError
 *   5. anything else           -> APIError (retryable only for 5xx statuses)
 *
 * When the input is an Error it is attached to the result as a non-enumerable
 * `cause`, so the original stack trace is not lost by the wrapping.
 *
 * @param {unknown} error - The value thrown by the provider call.
 * @param {string} [defaultMessage="API request failed"] - Used when the parsed message is empty.
 * @returns {APIError} The typed error (never thrown by this function).
 */
function wrapProviderError(error, defaultMessage = "API request failed") {
  const { message, statusCode, code } = parseProviderError(error);
  const networkMarkers = ["ECONNREFUSED", "ENOTFOUND", "ETIMEDOUT", "network", "Network"];
  let wrapped;
  if (statusCode === 401 || statusCode === 403) {
    wrapped = new AuthenticationError(
      message.includes("API key") ? message : "Invalid API key",
      statusCode
    );
  } else if (statusCode === 429) {
    const retryAfterMatch = message.match(/retry[- ]after[:\s]+(\d+)/i);
    // The matched number is treated as seconds and converted to milliseconds.
    const retryAfter = retryAfterMatch ? Number.parseInt(retryAfterMatch[1], 10) * 1e3 : void 0;
    wrapped = new RateLimitError(
      message.includes("rate limit") ? message : "Rate limit exceeded",
      retryAfter
    );
  } else if (networkMarkers.some((marker) => message.includes(marker))) {
    wrapped = new NetworkError(message);
  } else if (message.includes("timeout") || message.includes("timed out")) {
    wrapped = new TimeoutError(message);
  } else {
    wrapped = new APIError(
      message || defaultMessage,
      statusCode,
      // 5xx errors are retryable
      statusCode ? statusCode >= 500 : false,
      code
    );
  }
  if (error instanceof Error) {
    // Mirror the shape of the standard `cause` option: present but non-enumerable.
    Object.defineProperty(wrapped, "cause", {
      value: error,
      writable: true,
      configurable: true,
      enumerable: false
    });
  }
  return wrapped;
}
|
|
130
|
+
|
|
131
|
+
// src/logger.ts
|
|
132
|
+
/**
 * Log severity levels as a two-way lookup: name -> number (DEBUG = 0 ... SILENT = 4)
 * and number -> name.
 */
var LogLevel = /* @__PURE__ */ ((levels) => {
  const namesBySeverity = ["DEBUG", "INFO", "WARN", "ERROR", "SILENT"];
  namesBySeverity.forEach((label, severity) => {
    levels[label] = severity;
    levels[severity] = label;
  });
  return levels;
})(LogLevel || {});
|
|
140
|
+
/**
 * Logger that writes to the global `console`, filtered by a minimum level.
 *
 * A message is emitted when the configured level is less than or equal to the
 * message's severity (DEBUG = 0, INFO = 1, WARN = 2, ERROR = 3).
 */
var ConsoleLogger = class {
  /**
   * @param {number} [level=2] - Minimum severity to emit; defaults to WARN.
   */
  constructor(level = 2 /* WARN */) {
    this.level = level;
  }
  /**
   * Shared implementation behind the four public methods.
   *
   * The context is forwarded only when one was supplied (not null/undefined),
   * so messages without context no longer receive a trailing empty-string
   * argument, and falsy-but-meaningful contexts such as `0` or `false` are
   * printed instead of being replaced.
   */
  #write(severity, method, label, message, context) {
    if (this.level > severity) {
      return;
    }
    const line = `[${label}] ${message}`;
    if (context == null) {
      console[method](line);
    } else {
      console[method](line, context);
    }
  }
  /** Emit at DEBUG severity via `console.debug`. */
  debug(message, context) {
    this.#write(0 /* DEBUG */, "debug", "DEBUG", message, context);
  }
  /** Emit at INFO severity via `console.info`. */
  info(message, context) {
    this.#write(1 /* INFO */, "info", "INFO", message, context);
  }
  /** Emit at WARN severity via `console.warn`. */
  warn(message, context) {
    this.#write(2 /* WARN */, "warn", "WARN", message, context);
  }
  /** Emit at ERROR severity via `console.error`. */
  error(message, context) {
    this.#write(3 /* ERROR */, "error", "ERROR", message, context);
  }
};
|
|
165
|
+
/** Logger whose four methods accept any arguments and do nothing. */
var SilentLogger = class {
  /** No-op. */
  debug() {
    return void 0;
  }
  /** No-op. */
  info() {
    return void 0;
  }
  /** No-op. */
  warn() {
    return void 0;
  }
  /** No-op. */
  error() {
    return void 0;
  }
};
|
|
175
|
+
/**
 * Pick the logger to use.
 *
 * @param {object} [customLogger] - Returned unchanged when provided.
 * @param {number} [level=2] - Level for the built-in logger; 4 (SILENT) selects
 *   the no-op logger, anything else a console logger at that level.
 * @returns {object} The custom logger, a SilentLogger, or a ConsoleLogger.
 */
function createLogger(customLogger, level = 2 /* WARN */) {
  if (customLogger) return customLogger;
  const wantsSilence = level === 4 /* SILENT */;
  return wantsSilence ? new SilentLogger() : new ConsoleLogger(level);
}
|
|
184
|
+
/**
 * Structured-output schema for the sentence-structure analysis: free-text
 * reasoning followed by integer counts, a per-sentence word-count array and
 * a few non-integer numbers (grade level, percentages). Field order is kept
 * exactly as declared.
 */
var SentenceAnalysisSchema = (() => {
  // Integer field, optionally carrying a description.
  const int = (description) => {
    const field = z.number().int();
    return description === void 0 ? field : field.describe(description);
  };
  // Unconstrained number field with a description.
  const num = (description) => z.number().describe(description);
  return z.object({
    reasoning: z.string().describe("Step-by-step reasoning for the analysis"),
    // Foundational
    num_sentences: int("Total number of sentences in the text"),
    num_words: int("Total number of words in the text"),
    flesch_kincaid_grade: num("Flesch-Kincaid Grade Level number"),
    // Sentence Type
    num_simple_sentences: int("Number of simple sentences"),
    num_compound_sentences: int("Number of compound sentences"),
    num_complex_sentences: int("Number of complex sentences"),
    num_compound_complex_sentences: int("Number of compound-complex sentences"),
    num_other_sentences: int("Number of other sentence types"),
    // Subordination
    num_independent_clauses: int(),
    num_subordinate_clauses: int(),
    num_total_clauses: int(),
    num_sentences_with_subordinate: int(),
    num_sentences_with_multiple_subordinates: int(),
    num_sentences_with_embedded_clauses: int(),
    // Informational Phrases
    num_prepositional_phrases: int(),
    num_participle_phrases: int(),
    num_appositive_phrases: int(),
    // Cohesion
    num_simple_transitions: int(),
    num_sophisticated_transitions: int(),
    // Sentence Type Density
    words_in_simple_sentences: int(),
    words_in_compound_sentences: int(),
    words_in_complex_sentences: int(),
    words_in_compound_complex_sentences: int(),
    words_in_other_sentences: int(),
    // Additional Features
    sentence_word_counts: z.array(int()),
    num_one_concept_sentences: int(),
    num_multi_concept_sentences: int(),
    num_cleft_sentences: int(),
    max_clauses_in_any_sentence: int(),
    // Grades 5-12 specific
    num_compound: int("Number of compound sentences"),
    num_basic_complex: int("Number of basic complex sentences"),
    num_advanced_complex: int("Number of advanced complex sentences"),
    percentage_simple: num("Percentage of simple sentences"),
    percentage_compound: num("Percentage of compound sentences"),
    percentage_basic_complex: num("Percentage of basic complex sentences"),
    percentage_advanced_complex: num("Percentage of advanced complex sentences")
  });
})();
|
|
231
|
+
/**
 * Structured-output schema for a complexity classification: free-text
 * reasoning plus one TextComplexityLevel label as the answer.
 */
var ComplexityClassificationSchema = z.object({
  reasoning: z
    .string()
    .describe("Detailed pedagogically appropriate reasoning"),
  answer: TextComplexityLevel
});
|
|
235
|
+
|
|
236
|
+
// src/telemetry/client.ts
|
|
237
|
+
/**
 * Minimal HTTP client that POSTs telemetry events as JSON.
 *
 * Expected config fields (as read by this class): `enabled`, `endpoint`,
 * `clientId`, optional `partnerKey`, optional `logger` (only `warn` is used).
 */
var TelemetryClient = class {
  config;
  logger;
  /**
   * @param {object} config - Telemetry settings; see the class description.
   */
  constructor(config) {
    this.config = config;
    this.logger = config.logger;
  }
  /**
   * Report a problem without ever throwing: telemetry must not break the
   * caller even when no logger was configured or the logger itself fails.
   */
  #warn(message) {
    try {
      this.logger?.warn?.(message);
    } catch {
    }
  }
  /**
   * Send telemetry event to analytics service
   *
   * Fire-and-forget: Errors are logged but don't throw. Timeouts and aborts
   * are expected on slow networks and are dropped without a log line.
   *
   * @param {object} event - JSON-serialisable event payload.
   * @returns {Promise<void>} Always resolves.
   */
  async send(event) {
    if (!this.config.enabled) {
      return;
    }
    try {
      const headers = {
        "Content-Type": "application/json",
        "X-Client-ID": this.config.clientId
      };
      if (this.config.partnerKey) {
        headers["X-API-Key"] = this.config.partnerKey;
      }
      const response = await fetch(this.config.endpoint, {
        method: "POST",
        headers,
        body: JSON.stringify(event),
        // Don't block SDK operations on slow networks
        signal: AbortSignal.timeout(5e3)
        // 5 second timeout
      });
      if (!response.ok) {
        this.#warn(
          `[Telemetry] Failed to send event: ${response.status} ${response.statusText}`
        );
      }
    } catch (error) {
      const isError = error instanceof Error;
      if (isError && (error.name === "TimeoutError" || error.name === "AbortError")) {
        return;
      }
      // Non-Error throwables are reported too instead of vanishing silently.
      this.#warn(`[Telemetry] Error sending event: ${isError ? error.message : String(error)}`);
    }
  }
};
|
|
283
|
+
// ES modules do not provide __filename/__dirname, so both are derived from
// this module's own URL.
var __filename$1 = fileURLToPath(import.meta.url),
  __dirname$1 = dirname(__filename$1);
|
|
285
|
+
// Client id resolved during this process; avoids re-reading the config file.
var cachedClientId;
/**
 * Return the telemetry client id, creating and persisting one on first use.
 *
 * Lookup order: in-process cache, then `telemetry.clientId` in the config
 * file, then a freshly generated UUID. A new id is written back to the config
 * file on a best-effort basis (write failures are ignored so that telemetry
 * setup can never break the caller).
 *
 * When the config file already holds other settings they are preserved: the
 * new id is merged into the existing document instead of replacing it.
 *
 * @returns {string} The client id.
 */
function generateClientId() {
  if (cachedClientId) {
    return cachedClientId;
  }
  const isRecord = (value) => value !== null && typeof value === "object" && !Array.isArray(value);
  const configFile = getConfigFilePath();
  let existing = {};
  try {
    const parsed = JSON.parse(readFileSync(configFile, "utf-8"));
    if (isRecord(parsed)) {
      existing = parsed;
    }
  } catch {
    // Missing or unreadable config: treated the same as an empty one.
  }
  const storedId = existing.telemetry?.clientId;
  if (storedId) {
    cachedClientId = storedId;
    return cachedClientId;
  }
  const clientId = randomUUID();
  const existingTelemetry = isRecord(existing.telemetry) ? existing.telemetry : {};
  const updated = { ...existing, telemetry: { ...existingTelemetry, clientId } };
  try {
    mkdirSync(dirname(configFile), { recursive: true });
    writeFileSync(configFile, JSON.stringify(updated, null, 2));
  } catch {
    // Best effort only: the id still works for this process when it cannot be saved.
  }
  cachedClientId = clientId;
  return cachedClientId;
}
/**
 * Location of the SDK's per-user config file.
 *
 * @returns {string} `<APPDATA or home>/learning-commons/config.json` on
 *   Windows, `<home>/.config/learning-commons/config.json` elsewhere.
 */
function getConfigFilePath() {
  const configDir = process.platform === "win32" ? join(process.env.APPDATA ?? homedir(), "learning-commons") : join(homedir(), ".config", "learning-commons");
  return join(configDir, "config.json");
}
|
|
312
|
+
// Version string resolved during this process; avoids repeated file reads.
var cachedVersion;
/**
 * Read the SDK version from the package's own package.json.
 *
 * Candidate files are probed nearest-first. From the bundled build the
 * package root is one level up, so `../package.json` must be tried before
 * `../../package.json`; the latter lies outside the package in that layout
 * and could belong to an unrelated project. The two-levels-up path is kept
 * as a fallback for the source layout the original comment refers to.
 *
 * @returns {string} The version, or "0.0.0" when no candidate can be read
 *   or the file has no version.
 */
function getSDKVersion() {
  if (cachedVersion) {
    return cachedVersion;
  }
  const candidates = [
    join(__dirname$1, "../package.json"),
    // From dist/
    join(__dirname$1, "../../package.json")
    // From src/
  ];
  for (const candidate of candidates) {
    try {
      const pkg = JSON.parse(readFileSync(candidate, "utf-8"));
      cachedVersion = pkg.version || "0.0.0";
      return cachedVersion;
    } catch {
      // Unreadable or malformed: try the next candidate.
      continue;
    }
  }
  cachedVersion = "0.0.0";
  return cachedVersion;
}
|
|
335
|
+
|
|
336
|
+
// src/evaluators/base.ts
|
|
337
|
+
/** Bounds applied to the trimmed input text, in characters. */
var VALIDATION_LIMITS = {
  /** Minimum text length in characters */
  MIN_TEXT_LENGTH: 10,
  /** Maximum text length in characters (100K chars ≈ 25K tokens) */
  MAX_TEXT_LENGTH: 100_000
};
|
|
343
|
+
/**
 * Shared base for evaluators: sets up logging, checks required API keys,
 * normalises telemetry settings and offers input-validation and telemetry
 * helpers to subclasses.
 */
var BaseEvaluator = class {
  telemetryClient;
  logger;
  config;
  /**
   * Static metadata for the evaluator
   *
   * Concrete evaluators MUST define this property.
   *
   * @example
   * ```typescript
   * class MyEvaluator extends BaseEvaluator {
   *   static readonly metadata = {
   *     id: 'my-evaluator',
   *     name: 'My Evaluator',
   *     description: 'Does something useful',
   *     supportedGrades: ['3', '4', '5'],
   *     requiresGoogleKey: true,
   *     requiresOpenAIKey: false,
   *   };
   * }
   * ```
   */
  static metadata;
  /**
   * @param {object} config - Evaluator options. Fields read here: `logger`,
   *   `logLevel`, `googleApiKey`, `openaiApiKey`, `telemetry`, `maxRetries`
   *   (default 2) and `partnerKey`.
   * @throws {ConfigurationError} If metadata or a required API key is missing.
   */
  constructor(config) {
    this.logger = createLogger(config.logger, config.logLevel ?? 2 /* WARN */);
    this.validateApiKeys(config);
    const telemetryConfig = this.normalizeTelemetryConfig(config.telemetry);
    this.config = {
      maxRetries: config.maxRetries ?? 2,
      telemetry: telemetryConfig
    };
    // The telemetry client only exists when telemetry is enabled.
    if (this.config.telemetry.enabled) {
      this.telemetryClient = new TelemetryClient({
        endpoint: "https://api.learningcommons.org/evaluators-telemetry/v1/events",
        partnerKey: config.partnerKey,
        clientId: generateClientId(),
        enabled: true,
        logger: this.logger
      });
    }
  }
  /**
   * Get metadata for this evaluator instance
   * @throws {ConfigurationError} If the subclass has not defined static metadata
   */
  get metadata() {
    const meta = this.constructor.metadata;
    if (!meta) {
      throw new ConfigurationError(
        `${this.constructor.name} must define a static readonly metadata block.`
      );
    }
    return meta;
  }
  /**
   * Validate that required API keys are provided based on metadata
   * @throws {ConfigurationError} If required API keys are missing
   */
  validateApiKeys(config) {
    const { metadata } = this;
    if (metadata.requiresGoogleKey && !config.googleApiKey) {
      throw new ConfigurationError(
        `Google API key is required for ${metadata.name} evaluator. Pass googleApiKey in config.`
      );
    }
    if (metadata.requiresOpenAIKey && !config.openaiApiKey) {
      throw new ConfigurationError(
        `OpenAI API key is required for ${metadata.name} evaluator. Pass openaiApiKey in config.`
      );
    }
  }
  /**
   * Normalize telemetry config to standard format
   *
   * @param {boolean|object|undefined} telemetry - `false` disables telemetry,
   *   `true`/`undefined` enables it without input recording, an object is read
   *   for `enabled` (default true) and `recordInputs` (default false).
   * @returns {{enabled: boolean, recordInputs: boolean}}
   */
  normalizeTelemetryConfig(telemetry) {
    if (telemetry === false) {
      return {
        enabled: false,
        recordInputs: false
      };
    }
    if (telemetry === true || telemetry === void 0) {
      return {
        enabled: true,
        recordInputs: false
      };
    }
    return {
      enabled: telemetry.enabled ?? true,
      recordInputs: telemetry.recordInputs ?? false
    };
  }
  /**
   * Get the evaluator type identifier from metadata
   * @returns The evaluator type ID (e.g., "vocabulary", "sentence-structure")
   */
  getEvaluatorType() {
    return this.metadata.id;
  }
  /**
   * Validate text meets requirements
   * Default implementation - can be overridden by concrete evaluators
   *
   * Length limits are applied to the trimmed text.
   *
   * @throws {ValidationError} If text is invalid (including non-string input)
   */
  validateText(text) {
    // Reject non-strings up front so callers get the documented
    // ValidationError rather than a TypeError from `.length` / `.trim()`.
    if (typeof text !== "string") {
      throw new ValidationError(`Text must be a string, received ${text === null ? "null" : typeof text}`);
    }
    this.logger.debug("Validating text input", {
      evaluator: this.getEvaluatorType(),
      operation: "validateText",
      textLength: text.length
    });
    const trimmedText = text.trim();
    if (!trimmedText) {
      throw new ValidationError("Text cannot be empty or contain only whitespace");
    }
    if (trimmedText.length < VALIDATION_LIMITS.MIN_TEXT_LENGTH) {
      throw new ValidationError(
        `Text is too short. Minimum length is ${VALIDATION_LIMITS.MIN_TEXT_LENGTH} characters, received ${trimmedText.length} characters`
      );
    }
    if (trimmedText.length > VALIDATION_LIMITS.MAX_TEXT_LENGTH) {
      throw new ValidationError(
        `Text is too long. Maximum length is ${VALIDATION_LIMITS.MAX_TEXT_LENGTH.toLocaleString()} characters, received ${trimmedText.length.toLocaleString()} characters`
      );
    }
  }
  /**
   * Validate grade is in supported range
   * Default implementation - can be overridden by concrete evaluators
   *
   * @param grade - Grade level to validate
   * @param validGrades - Set of valid grades for this evaluator
   * @throws {ValidationError} If grade is invalid
   */
  validateGrade(grade, validGrades) {
    this.logger.debug("Validating grade input", {
      evaluator: this.getEvaluatorType(),
      operation: "validateGrade",
      grade
    });
    if (validGrades.has(grade)) {
      return;
    }
    // For the error message: "K" first, then the numeric grades ascending.
    const validList = Array.from(validGrades).sort((a, b) => {
      if (a === "K") return -1;
      if (b === "K") return 1;
      return Number.parseInt(a, 10) - Number.parseInt(b, 10);
    }).join(", ");
    throw new ValidationError(
      `Invalid grade "${grade}". Supported grades for this evaluator: ${validList}`
    );
  }
  /**
   * Send telemetry event to analytics service
   * Common helper for all evaluators
   *
   * Does nothing when telemetry is disabled (no client was created).
   *
   * @param {object} params - Values copied into the event: `grade`, `status`,
   *   `errorCode`, `latencyMs`, `textLength`, `provider`, `tokenUsage`,
   *   `metadata` and `inputText`.
   */
  async sendTelemetry(params) {
    if (!this.telemetryClient) {
      return;
    }
    await this.telemetryClient.send({
      timestamp: (/* @__PURE__ */ new Date()).toISOString(),
      sdk_version: getSDKVersion(),
      evaluator_type: this.getEvaluatorType(),
      grade: params.grade,
      status: params.status,
      error_code: params.errorCode,
      latency_ms: params.latencyMs,
      text_length_chars: params.textLength,
      provider: params.provider,
      token_usage: params.tokenUsage,
      metadata: params.metadata,
      // Include input text only if recording is enabled
      input_text: this.config.telemetry.recordInputs ? params.inputText : void 0
    });
  }
};
|
|
519
|
+
/** Model id used per provider type when neither request nor config names one. */
var DEFAULT_MODELS = {
  openai: "gpt-4o",
  anthropic: "claude-sonnet-4-5-20250929",
  google: "gemini-2.5-pro"
};
|
|
524
|
+
/**
 * LLM provider backed by the Vercel AI SDK for the "openai", "anthropic" and
 * "google" provider types.
 */
var VercelAIProvider = class {
  /**
   * @param {object} config - Provider settings. Fields read by this class:
   *   `type`, `apiKey`, `model`, `temperature`, `maxRetries`.
   * @throws {Error} If `config.type` is "custom".
   */
  constructor(config) {
    this.config = config;
    if (config.type === "custom") {
      throw new Error(
        "VercelAIProvider does not support custom type. Use config.customProvider directly."
      );
    }
  }
  /**
   * Resolve which model id a call will use: the per-request model, then the
   * configured model, then the provider default. Kept in one place so the
   * id that is reported always matches the id that is actually called.
   */
  resolveModelId(requestModel) {
    return requestModel || this.config.model || this.getDefaultModel();
  }
  /**
   * Generate structured output using Vercel AI SDK's generateText with output
   *
   * @param {object} request - `messages`, `schema`, and optional `model`,
   *   `temperature` (default 0) and `maxTokens`.
   * @returns {Promise<{data: unknown, model: string, usage: {inputTokens: number, outputTokens: number}, latencyMs: number}>}
   */
  async generateStructured(request) {
    const modelId = this.resolveModelId(request.model);
    const model = await this.getModel(modelId);
    const startTime = Date.now();
    const callOptions = {
      model,
      messages: request.messages,
      output: Output.object({ schema: request.schema }),
      temperature: request.temperature ?? 0,
      maxRetries: this.config.maxRetries ?? 0
    };
    if (request.maxTokens !== void 0) {
      // The SDK generation used here (usage.inputTokens, Output.object) names
      // this setting `maxOutputTokens`; `maxTokens` would be ignored.
      callOptions.maxOutputTokens = request.maxTokens;
    }
    const { output, usage } = await generateText(callOptions);
    return {
      data: output,
      model: modelId,
      usage: {
        inputTokens: usage.inputTokens || 0,
        outputTokens: usage.outputTokens || 0
      },
      latencyMs: Date.now() - startTime
    };
  }
  /**
   * Generate plain text using Vercel AI SDK's generateText
   *
   * @param {Array} messages - Chat messages forwarded to the SDK.
   * @param {number} [temperature] - Falls back to `config.temperature`, then 0.
   * @returns {Promise<{text: string, usage: {inputTokens: number, outputTokens: number}, latencyMs: number}>}
   */
  async generateText(messages, temperature) {
    const model = await this.getModel();
    const startTime = Date.now();
    const { text, usage } = await generateText({
      model,
      messages,
      temperature: temperature ?? this.config.temperature ?? 0,
      maxRetries: this.config.maxRetries ?? 0
    });
    return {
      text,
      usage: {
        inputTokens: usage.inputTokens || 0,
        outputTokens: usage.outputTokens || 0
      },
      latencyMs: Date.now() - startTime
    };
  }
  /**
   * Get the configured language model.
   * Uses dynamic imports so consumers only need to install the provider packages they use.
   *
   * @param {string} [requestModel] - Optional per-request model override.
   * @throws {Error} If the adapter package is not installed or the provider
   *   type is unsupported.
   */
  async getModel(requestModel) {
    const modelId = this.resolveModelId(requestModel);
    const apiKey = this.config.apiKey;
    // Without an explicit key the adapter is created with empty settings.
    const settings = apiKey ? { apiKey } : {};
    switch (this.config.type) {
      case "openai": {
        const { createOpenAI } = await import('@ai-sdk/openai').catch(() => {
          throw new Error(
            "To use the OpenAI provider, install its adapter: npm install @ai-sdk/openai"
          );
        });
        return createOpenAI(settings)(modelId);
      }
      case "anthropic": {
        const { createAnthropic } = await import('@ai-sdk/anthropic').catch(() => {
          throw new Error(
            "To use the Anthropic provider, install its adapter: npm install @ai-sdk/anthropic"
          );
        });
        return createAnthropic(settings)(modelId);
      }
      case "google": {
        const { createGoogleGenerativeAI } = await import('@ai-sdk/google').catch(() => {
          throw new Error(
            "To use the Google provider, install its adapter: npm install @ai-sdk/google"
          );
        });
        return createGoogleGenerativeAI(settings)(modelId);
      }
      default:
        throw new Error(`Unsupported provider type: ${this.config.type}`);
    }
  }
  /**
   * Get default model for the configured provider
   *
   * @throws {Error} If the provider type is "custom".
   */
  getDefaultModel() {
    const providerType = this.config.type;
    if (providerType === "custom") {
      throw new Error("Cannot get default model for custom provider type");
    }
    return DEFAULT_MODELS[providerType];
  }
};
|
|
625
|
+
/**
 * Build the provider for a config: the caller-supplied `customProvider` when
 * the type is "custom" and one is present, otherwise a VercelAIProvider.
 *
 * @param {object} config - Provider configuration.
 * @returns {object} The provider instance.
 */
function createProvider(config) {
  const hasCustomProvider = config.type === "custom" && Boolean(config.customProvider);
  return hasCustomProvider ? config.customProvider : new VercelAIProvider(config);
}
|
|
631
|
+
/**
 * Structured-output schema for the vocabulary evaluation: four word lists
 * (each a single string), an overall complexity label and the reasoning.
 */
var VocabularyComplexitySchema = z.object({
  tier_2_words: z
    .string()
    .describe("List of Tier 2 words (academic words)"),
  tier_3_words: z
    .string()
    .describe("List of Tier 3 words (domain-specific)"),
  archaic_words: z
    .string()
    .describe("List of Archaic words"),
  other_complex_words: z
    .string()
    .describe("List of Other Complex words"),
  complexity_score: TextComplexityLevel.describe("The complexity of the text vocabulary"),
  reasoning: z
    .string()
    .describe("Detailed reasoning for the complexity rating")
});
|
|
641
|
+
/**
 * Convenience wrapper returning only the Flesch-Kincaid grade for a text.
 *
 * @param {string} text - Text to analyse.
 * @returns {number} The `fleschKincaidGrade` field of the readability metrics.
 */
function calculateFleschKincaidGrade(text) {
  const { fleschKincaidGrade } = calculateReadabilityMetrics(text);
  return fleschKincaidGrade;
}
|
|
644
|
+
/**
 * Compute basic readability statistics for a text.
 *
 * Sentences and terms come from the `compromise` parse, syllables from the
 * `syllable` package, and the character count excludes all whitespace.
 * The grade is 0.39 * words/sentence + 11.8 * syllables/word - 15.59,
 * floored at 0 and rounded to two decimals.
 *
 * @param {string} text - Text to analyse.
 * @returns {{sentenceCount: number, wordCount: number, characterCount: number,
 *   syllableCount: number, avgWordsPerSentence: number,
 *   avgSyllablesPerWord: number, fleschKincaidGrade: number}}
 */
function calculateReadabilityMetrics(text) {
  const doc = nlp(text);
  const sentenceCount = doc.sentences().length;
  const terms = doc.terms();
  const wordCount = terms.length;
  const characterCount = text.replace(/\s/g, "").length;

  let syllableCount = 0;
  for (const word of terms.out("array")) {
    syllableCount += syllable(word);
  }

  // Averages fall back to 0 when there is nothing to divide by.
  let avgWordsPerSentence = 0;
  if (sentenceCount > 0) {
    avgWordsPerSentence = wordCount / sentenceCount;
  }
  let avgSyllablesPerWord = 0;
  if (wordCount > 0) {
    avgSyllablesPerWord = syllableCount / wordCount;
  }

  const rawGrade = 0.39 * avgWordsPerSentence + 11.8 * avgSyllablesPerWord - 15.59;
  const flooredGrade = Math.max(0, rawGrade);
  return {
    sentenceCount,
    wordCount,
    characterCount,
    syllableCount,
    avgWordsPerSentence,
    avgSyllablesPerWord,
    fleschKincaidGrade: Math.round(flooredGrade * 100) / 100
  };
}
|
|
665
|
+
|
|
666
|
+
// src/features/sentence-features.ts
|
|
667
|
+
/**
 * Divide two numbers, yielding 0 instead of Infinity/NaN when the
 * denominator is exactly zero.
 *
 * @param {number} numerator
 * @param {number} denominator
 * @returns {number} `numerator / denominator`, or 0 for a zero denominator.
 */
function safeDivision(numerator, denominator) {
  if (denominator === 0) {
    return 0;
  }
  return numerator / denominator;
}
|
|
670
|
+
/**
 * Population standard deviation (divides by N, not N - 1).
 *
 * A missing array is treated like an empty one, matching how
 * categorizeSentenceLengths handles the same input, so a result without
 * per-sentence counts no longer crashes here.
 *
 * @param {number[]|null|undefined} values - Samples to measure.
 * @returns {number} The standard deviation, or 0 for fewer than two values.
 */
function standardDeviation(values) {
  if (!values || values.length <= 1) return 0;
  const mean = values.reduce((sum, val) => sum + val, 0) / values.length;
  const variance = values.reduce((sum, val) => sum + (val - mean) ** 2, 0) / values.length;
  return Math.sqrt(variance);
}
|
|
677
|
+
/**
 * Share of sentences in each length bucket, as percentages of all sentences.
 *
 * Buckets by word count: short (<= 10), medium (11-20), long (21-30),
 * very long (everything else).
 *
 * @param {number[]|null|undefined} wordCounts - Word count of each sentence.
 * @returns {{percent_short_sentences: number, percent_medium_sentences: number,
 *   percent_long_sentences: number, percent_very_long_sentences: number}}
 *   All zeros when there are no sentences.
 */
function categorizeSentenceLengths(wordCounts) {
  const nothingToCount = !wordCounts || wordCounts.length === 0;
  // Tally order: [short, medium, long, veryLong].
  const tallies = [0, 0, 0, 0];
  if (nothingToCount) {
    return {
      percent_short_sentences: tallies[0],
      percent_medium_sentences: tallies[1],
      percent_long_sentences: tallies[2],
      percent_very_long_sentences: tallies[3]
    };
  }
  for (const count of wordCounts) {
    const bucket = count <= 10 ? 0 : count <= 20 ? 1 : count <= 30 ? 2 : 3;
    tallies[bucket] += 1;
  }
  const total = wordCounts.length;
  const [short, medium, long, veryLong] = tallies;
  return {
    percent_short_sentences: short / total * 100,
    percent_medium_sentences: medium / total * 100,
    percent_long_sentences: long / total * 100,
    percent_very_long_sentences: veryLong / total * 100
  };
}
|
|
701
|
+
function addEngineeredFeatures(analysis) {
|
|
702
|
+
const numSentences = analysis.num_sentences;
|
|
703
|
+
const numWords = analysis.num_words;
|
|
704
|
+
const avg_words_per_sentence = safeDivision(numWords, numSentences);
|
|
705
|
+
const sentence_length_variation = standardDeviation(analysis.sentence_word_counts);
|
|
706
|
+
const lengthCategories = categorizeSentenceLengths(analysis.sentence_word_counts);
|
|
707
|
+
const percent_simple_sentences = safeDivision(analysis.num_simple_sentences, numSentences) * 100;
|
|
708
|
+
const percent_compound_sentences = safeDivision(analysis.num_compound_sentences, numSentences) * 100;
|
|
709
|
+
const percent_complex_sentences = safeDivision(analysis.num_complex_sentences, numSentences) * 100;
|
|
710
|
+
const percent_compound_complex_sentences = safeDivision(analysis.num_compound_complex_sentences, numSentences) * 100;
|
|
711
|
+
const percent_other_sentences = safeDivision(analysis.num_other_sentences, numSentences) * 100;
|
|
712
|
+
const percent_words_in_simple_sentences = safeDivision(analysis.words_in_simple_sentences, numWords) * 100;
|
|
713
|
+
const percent_words_in_compound_sentences = safeDivision(analysis.words_in_compound_sentences, numWords) * 100;
|
|
714
|
+
const percent_words_in_complex_sentences = safeDivision(analysis.words_in_complex_sentences, numWords) * 100;
|
|
715
|
+
const percent_words_in_compound_complex_sentences = safeDivision(analysis.words_in_compound_complex_sentences, numWords) * 100;
|
|
716
|
+
const percent_words_in_other_sentences = safeDivision(analysis.words_in_other_sentences, numWords) * 100;
|
|
717
|
+
const avg_subordinates_per_sentence = safeDivision(analysis.num_subordinate_clauses, numSentences);
|
|
718
|
+
const avg_clauses_per_sentence = safeDivision(analysis.num_total_clauses, numSentences);
|
|
719
|
+
const percent_sentences_with_subordinate = safeDivision(analysis.num_sentences_with_subordinate, numSentences) * 100;
|
|
720
|
+
const percent_sentences_with_multiple_subordinates = safeDivision(analysis.num_sentences_with_multiple_subordinates, numSentences) * 100;
|
|
721
|
+
const percent_sentences_with_embedded_clauses = safeDivision(analysis.num_sentences_with_embedded_clauses, numSentences) * 100;
|
|
722
|
+
const prep_phrase_density = safeDivision(analysis.num_prepositional_phrases, numWords) * 100;
|
|
723
|
+
const participle_phrase_density = safeDivision(analysis.num_participle_phrases, numWords) * 100;
|
|
724
|
+
const appositive_phrase_density = safeDivision(analysis.num_appositive_phrases, numWords) * 100;
|
|
725
|
+
const total_transitions = analysis.num_simple_transitions + analysis.num_sophisticated_transitions;
|
|
726
|
+
const avg_transitions_per_sentence = safeDivision(total_transitions, numSentences);
|
|
727
|
+
const percent_sophisticated_transitions = safeDivision(analysis.num_sophisticated_transitions, total_transitions) * 100;
|
|
728
|
+
const percent_sentences_w_one_concept = safeDivision(analysis.num_one_concept_sentences, numSentences) * 100;
|
|
729
|
+
const percent_sentences_w_multi_concept = safeDivision(analysis.num_multi_concept_sentences, numSentences) * 100;
|
|
730
|
+
const percent_cleft_sentences = safeDivision(analysis.num_cleft_sentences, numSentences) * 100;
|
|
731
|
+
return {
|
|
732
|
+
...analysis,
|
|
733
|
+
avg_words_per_sentence,
|
|
734
|
+
sentence_length_variation,
|
|
735
|
+
...lengthCategories,
|
|
736
|
+
percent_simple_sentences,
|
|
737
|
+
percent_compound_sentences,
|
|
738
|
+
percent_complex_sentences,
|
|
739
|
+
percent_compound_complex_sentences,
|
|
740
|
+
percent_other_sentences,
|
|
741
|
+
percent_words_in_simple_sentences,
|
|
742
|
+
percent_words_in_compound_sentences,
|
|
743
|
+
percent_words_in_complex_sentences,
|
|
744
|
+
percent_words_in_compound_complex_sentences,
|
|
745
|
+
percent_words_in_other_sentences,
|
|
746
|
+
avg_subordinates_per_sentence,
|
|
747
|
+
avg_clauses_per_sentence,
|
|
748
|
+
percent_sentences_with_subordinate,
|
|
749
|
+
percent_sentences_with_multiple_subordinates,
|
|
750
|
+
percent_sentences_with_embedded_clauses,
|
|
751
|
+
prep_phrase_density,
|
|
752
|
+
participle_phrase_density,
|
|
753
|
+
appositive_phrase_density,
|
|
754
|
+
avg_transitions_per_sentence,
|
|
755
|
+
percent_sophisticated_transitions,
|
|
756
|
+
percent_sentences_w_one_concept,
|
|
757
|
+
percent_sentences_w_multi_concept,
|
|
758
|
+
percent_cleft_sentences
|
|
759
|
+
};
|
|
760
|
+
}
|
|
761
|
+
// Ordered feature names that featuresToJSON copies into its payload.
// Each inner array is one thematic group; `.flat()` collapses the groups into
// a single flat array of names, preserving the order in which they are listed.
var FEATURE_COLS = [
  // Foundational & Distributional
  [
    "avg_words_per_sentence",
    "sentence_length_variation",
    "percent_short_sentences",
    "percent_medium_sentences",
    "percent_long_sentences",
    "percent_very_long_sentences",
    "flesch_kincaid_grade"
  ],
  // Sentence Structure (Grammatical Type)
  [
    "percent_simple_sentences",
    "percent_compound_sentences",
    "percent_complex_sentences",
    "percent_compound_complex_sentences",
    "percent_other_sentences"
  ],
  // Word Distribution
  [
    "percent_words_in_simple_sentences",
    "percent_words_in_complex_sentences",
    "percent_words_in_compound_sentences",
    "percent_words_in_compound_complex_sentences",
    "percent_words_in_other_sentences"
  ],
  // Clausal & Subordination
  [
    "avg_subordinates_per_sentence",
    "avg_clauses_per_sentence",
    "percent_sentences_with_subordinate",
    "percent_sentences_with_multiple_subordinates",
    "percent_sentences_with_embedded_clauses"
  ],
  // Phrase Density
  [
    "prep_phrase_density",
    "participle_phrase_density",
    "appositive_phrase_density"
  ],
  // Cohesion & Transitions
  [
    "avg_transitions_per_sentence",
    "percent_sophisticated_transitions"
  ],
  // Conceptual & Other
  [
    "percent_sentences_w_one_concept",
    "percent_sentences_w_multi_concept",
    "percent_cleft_sentences",
    "max_clauses_in_any_sentence"
  ],
  // Grades 5-12
  [
    "num_sentences",
    "num_simple_sentences",
    "num_compound",
    "num_basic_complex",
    "num_advanced_complex",
    "percentage_simple",
    "percentage_compound",
    "percentage_basic_complex",
    "percentage_advanced_complex"
  ]
].flat();
|
|
811
|
+
/**
 * Serialize the columns named in FEATURE_COLS to a pretty-printed JSON string.
 *
 * For every column name, a value of type "number" is first rounded to
 * `decimals` decimal places and then, when `castToInt` is true, rounded once
 * more to the nearest integer. Any value that is not of type "number"
 * (including a missing key) is emitted as `null`.
 *
 * @param {object} features - Object read by column name.
 * @param {number} [decimals=1] - Decimal places used for the first rounding step.
 * @param {boolean} [castToInt=true] - Whether to round the result to an integer.
 * @returns {string} JSON text indented with two spaces, keys in FEATURE_COLS order.
 */
function featuresToJSON(features, decimals = 1, castToInt = true) {
  const scale = Math.pow(10, decimals);
  const entries = FEATURE_COLS.map((name) => {
    const raw = features[name];
    if (typeof raw !== "number") {
      return [name, null];
    }
    const fixed = Math.round(raw * scale) / scale;
    return [name, castToInt ? Math.round(fixed) : fixed];
  });
  return JSON.stringify(Object.fromEntries(entries), null, 2);
}
|
|
824
|
+
|
|
825
|
+
// ../../evals/prompts/vocabulary/background-knowledge.txt
|
|
826
|
+
var background_knowledge_default = `
|
|
827
|
+
Review the following text, which is an educational text written for students in the following grade band: {grade}.
|
|
828
|
+
|
|
829
|
+
Your job is to give me a background knowledge assumption; that is: what topics, if any, from the text students are likely to be familiar with based on a standard progression of topics in US public school education, as well as topics, if any the student is not likely to be familiar with.
|
|
830
|
+
|
|
831
|
+
Make sure your response is concise (between 1 - 3 lines max) and is about the topics themselves, not about any other aspect of the text (e.g. flowery language, complicated sentence structure, etc.).
|
|
832
|
+
|
|
833
|
+
Here's an example:
|
|
834
|
+
[START EXAMPLE]
|
|
835
|
+
Grade Band: 11th
|
|
836
|
+
Text: I went to the woods because I wished to live deliberately, to front only the essential facts of life, and see if I could not
|
|
837
|
+
learn what it had to teach, and not, when I came to die, discover that I had not lived. I did not wish to live what was
|
|
838
|
+
not life, living is so dear; nor did I wish to practise resignation, unless it was quite necessary. I wanted to live deep and suck out all the marrow of life, to live so sturdily and Spartan-like as to put to rout all that was not life, to cut a broad swath and shave close, to drive life into a corner, and reduce it to its lowest terms, and, if it proved to be mean, why then to get the whole and genuine meanness of it, and publish its meanness to the world; or if it were sublime, to
|
|
839
|
+
know it by experience, and be able to give a true account of it in my next excursion. For most men, it appears to me,
|
|
840
|
+
are in a strange uncertainty about it, whether it is of the devil or of God, and have somewhat hastily concluded that it
|
|
841
|
+
is the chief end of man here to "glorify God and enjoy him forever."
|
|
842
|
+
|
|
843
|
+
Background Knowledge Assumption: Assume they've studied American Transcendentalists like Thoreau and Emerson, including the mid-19th-century context of nature-focused philosophy.
|
|
844
|
+
[END EXAMPLE]
|
|
845
|
+
|
|
846
|
+
You should assume that the student is an average US public school who is learning from common core curriculum. When you respond, just respond with the background knowledge assumption and nothing else.
|
|
847
|
+
|
|
848
|
+
You can use the following list of topics that we know are covered for each grade level, although use your best judgement if you know there are other topics out there that students are likely to have covered. And this doesn't cover higher grade levels, so you'll have to again use your judgement for, say, what background knowledge a 9th grader is likely to have:
|
|
849
|
+
[BEGIN TOPICS]
|
|
850
|
+
[
|
|
851
|
+
K: [
|
|
852
|
+
"Toys and Play", "Weather Wonders", "Trees are Alive", "Enjoying and Appreciating Trees",
|
|
853
|
+
"The Five Senses: How do our senses help us learn?", "Once Upon a Farm: What makes a good story?",
|
|
854
|
+
"America, Then and Now: How has life in America changed over time?", "The Continents: What makes the world fascinating?",
|
|
855
|
+
"Needs of Plants and Animals", "Pushes and Pulls", "Sunlight and Weather", "Learning and Working Together",
|
|
856
|
+
"How Do People Learn and Work Together?", "Where Do We Live?", "What Does it Mean to Be an American?",
|
|
857
|
+
"How Has Our World Changed?", "Why Do People Have Jobs?"
|
|
858
|
+
],
|
|
859
|
+
1: [
|
|
860
|
+
"Tools and Work", "A Study of the Sun, Moon, and Stars", "Birds' Amazing Bodies", "Caring for Birds",
|
|
861
|
+
"A World of Books: How do books change lives around the world?", "Creature Features: What can we discover about animals' unique features?",
|
|
862
|
+
"Powerful Forces: How do people respond to the powerful force of the wind?", "Cinderella Stories: Why do people around the world admire Cinderella?",
|
|
863
|
+
"Animal and Plant Defenses", "Light and Sounds", "Spinning Earth", "Our Place in the World",
|
|
864
|
+
"What Are the Rights and Responsibilities of Citizens?", "How Can We Describe Where We Live?",
|
|
865
|
+
"How Do We Celebrate Our Country?", "How Does the Past Shape Our Lives?", "Why Do People Work?"
|
|
866
|
+
],
|
|
867
|
+
2: [
|
|
868
|
+
"Schools and Community", "Fossils Tell of Earth's Changes", "The Secret World of Pollination", "Providing for Pollinators",
|
|
869
|
+
"A Season of Change: How does change impact people and nature?", "The American West: What was life like in the West for early Americans?",
|
|
870
|
+
"Civil Rights Heroes: How can people respond to injustice?", "Good Eating: How does food nourish us?",
|
|
871
|
+
"Plant and Animal Relationships", "Properties of Matter", "Changing Landforms", "Exploring Who We Are",
|
|
872
|
+
"Why Is It Important to Learn About the Past?", "How Does Geography Help Us Understand Our World?",
|
|
873
|
+
"How Do We Get What We Want and Need?", "Why Do We Need Government?", "How Can People Make a Difference in Our World?"
|
|
874
|
+
],
|
|
875
|
+
"3": [
|
|
876
|
+
"Overcoming Learning Challenges Near and Far", "Adaptations and the Wide World of Frogs", "Exploring Literary Classics",
|
|
877
|
+
"Water Around the World", "Ocean/Sea Exploration", "Outer Space", "Immigration", "Art/Being an Artist",
|
|
878
|
+
"Balancing Forces", "Inheritance and Traits", "Environments and Survival", "Weather and Climate",
|
|
879
|
+
"Communities", "Why Does It Matter Where We Live?", "What Is Our Relationship With Our Environment?",
|
|
880
|
+
"What Makes a Community Unique?", "How Does the Past Impact the Present?", "Why Do Governments and Citizens Need Each Other?",
|
|
881
|
+
"How Do People in a Community Meet Their Wants and Needs?"
|
|
882
|
+
],
|
|
883
|
+
4: [
|
|
884
|
+
"Poetry", "Animal Defense Mechanisms", "The American Revolution",
|
|
885
|
+
"Responding to Inequality: Ratifying the 19th Amendment (covers gender and racial inequality)",
|
|
886
|
+
"A Great Heart: What does it mean to have a great heart, literally and figuratively?",
|
|
887
|
+
"Extreme Settings: How does a challenging setting or physical environment change a person?",
|
|
888
|
+
"American Revolution/Multiple Perspectives", "Myths/Myth Making", "Energy Conversions", "Vision and Light",
|
|
889
|
+
"Earth's Features", "Waves, Energy, and Information", "Regions of the United States",
|
|
890
|
+
"How Does America Use Its Strengths and Face Its Challenges?", "Why Have People Moved to and From the Northeast?",
|
|
891
|
+
"How Has the Southeast Changed Over Time?", "How Does the Midwest Reflect the Spirit of America?",
|
|
892
|
+
"How Does the Southwest Reflect Its Diverse Past and Unique Environment?", "What Draws People to the West?"
|
|
893
|
+
],
|
|
894
|
+
5: [
|
|
895
|
+
"Human Rights", "Biodiversity in the Rainforest", "Athlete Leaders of Social Change",
|
|
896
|
+
"Impact of Natural Disasters", "Cultures in Conflict: How do cultural beliefs and values guide people?",
|
|
897
|
+
"Word Play: How and why do writers play with words?", "A War Between Us: How did the Civil War impact people?",
|
|
898
|
+
"Breaking Barriers: How can sports influence individuals and societies?", "Patterns of Earth and Sky",
|
|
899
|
+
"Modeling Matter", "The Earth System", "Ecosystem Restoration", "U.S. History: Making a New Nation",
|
|
900
|
+
"How Were the Lives of Native Peoples Influenced by Where They Lived?",
|
|
901
|
+
"What Happened When Diverse Cultures Crossed Paths?", "What Is the Impact of People Settling in a New Place?",
|
|
902
|
+
"Why Would a Nation Want to Become Independent?", "What Does the Revolutionary Era Tell Us About Our Nation Today?",
|
|
903
|
+
"How Does the Constitution Help Us Understand What It Means to Be an American?",
|
|
904
|
+
"What Do the Early Years of the United States Reveal About the Character of the Nation?",
|
|
905
|
+
"What Was the Effect of the Civil War on U.S. Society?"
|
|
906
|
+
],
|
|
907
|
+
6: [
|
|
908
|
+
"Greek Mythology", "Critical Problems and Design Solutions", "American Indian Boarding Schools",
|
|
909
|
+
"Remarkable Accomplishments in Space Science", "Resilience in the Great Depression: How can enduring tremendous hardship contribute to personal transformation?",
|
|
910
|
+
"A Hero's Journey: What is the significance and power of the hero's journey?",
|
|
911
|
+
"Narrating the Unknown: How did the social and environmental factors in the unknown world of Jamestown shape its development and decline?",
|
|
912
|
+
"Courage in Crisis: How can the challenges of a hostile environment inspire heroism?",
|
|
913
|
+
"Microbiome", "Metabolism", "Metabolism Engineering", "Traits and Reproduction", "Thermal Energy",
|
|
914
|
+
"Ocean, Atmosphere, and Climate", "Weather Patterns", "Earth's Changing Climate",
|
|
915
|
+
"Earth's Changing Climate: Engineering Internship", "The First Americans (up to 1492)",
|
|
916
|
+
"Exploration and Colonization", "English Colonies", "American Revolution", "First Governments and the Constitution",
|
|
917
|
+
"The Early American Republic", "Political and Geographic Changes (1828-1850)", "Life in the North and South (1820-1860)",
|
|
918
|
+
"Division and Civil War (1821-1865)", "Reconstruction (1865-1896)", "The West (1858-1896)",
|
|
919
|
+
"New Industry and a Changing Society", "Expansion and War", "The 1920s and 1930s", "World War II",
|
|
920
|
+
"The Cold War", "Civil Rights and American Society", "America Since the 1970s"
|
|
921
|
+
],
|
|
922
|
+
7: [
|
|
923
|
+
"The Lost Children of Sudan (Genocide, Genocide in Sudan)", "Epidemics", "Harlem Renaissance", "Plastic Pollution",
|
|
924
|
+
"Identity in the Middle Ages: How does society both support and limit the development of identity?",
|
|
925
|
+
"Americans All: How did World War II affect individuals?", "Language and Power: What is the power of language?",
|
|
926
|
+
"Fever: How can times of crisis affect citizens and society?", "Geology on Mars", "Plane Motion", "Plane Motion Engineering",
|
|
927
|
+
"Rock Formations", "Phase Change", "Phase Change Engineering", "Chemical Reactions", "Populations and Resources",
|
|
928
|
+
"Matter and Energy in Ecosystems", "Early Humans and Agricultural Revolution", "Fertile Crescent",
|
|
929
|
+
"Ancient Egypt and Kush", "The Israelites", "Ancient Greece", "Ancient South Asia", "Early China, Korea, and Japan",
|
|
930
|
+
"Ancient Rome", "Rise of Christian Kingdoms", "The Americas", "Medieval Europe", "The Rise of Islamic Empires",
|
|
931
|
+
"China in the Middle Ages", "Korea and Japan in the Middle Ages", "African Civilizations", "New Ways of Thinking",
|
|
932
|
+
"Age of Exploration and Trade", "Revolutions and Empires", "The Modern World"
|
|
933
|
+
],
|
|
934
|
+
8: [
|
|
935
|
+
"Folklore of Latin America", "Food Choices", "The Holocaust", "Japanese American Internment",
|
|
936
|
+
"The Poetics and Power of Storytelling: What is the power of storytelling?",
|
|
937
|
+
"The Great War: How do literature and art illuminate the effects of World War I?", "What Is Love?",
|
|
938
|
+
"Teens as Change Agents: How do people effect social change?", "Harnessing Human Energy",
|
|
939
|
+
"Force and Motion", "Force and Motion Engineering", "Magnetic Fields", "Light Waves", "Earth, Moon, and Sun",
|
|
940
|
+
"Natural Selection", "Natural Selection Engineering", "Evolutionary History", "The World in Spatial Terms",
|
|
941
|
+
"Places and Regions", "Physical Geography", "Population Geography", "Economic Geography",
|
|
942
|
+
"Political Geography", "Human-Environment Geography", "What is Economics?", "Markets, Money, and Businesses",
|
|
943
|
+
"Government and the Economy", "The Global Economy"
|
|
944
|
+
]
|
|
945
|
+
]
|
|
946
|
+
[END TOPICS]
|
|
947
|
+
|
|
948
|
+
Here is the text:
|
|
949
|
+
[BEGIN TEXT]
|
|
950
|
+
{text}
|
|
951
|
+
[END TEXT]
|
|
952
|
+
`;
|
|
953
|
+
|
|
954
|
+
// src/prompts/vocabulary/background-knowledge.ts
|
|
955
|
+
/**
 * Build the background-knowledge prompt by filling every `{grade}` and
 * `{text}` placeholder in `background_knowledge_default`.
 *
 * @param {string} text - Value substituted for each `{text}` placeholder.
 * @param {string} grade - Value substituted for each `{grade}` placeholder.
 * @returns {string} The template with both placeholders replaced.
 */
function getBackgroundKnowledgePrompt(text, grade) {
  // Use function replacers so the values are inserted verbatim. With a plain
  // string replacement, `$$`, `$&`, `` $` `` and `$'` occurring in the passage
  // are interpreted as substitution patterns and would corrupt the prompt.
  return background_knowledge_default
    .replaceAll("{grade}", () => grade)
    .replaceAll("{text}", () => text);
}
|
|
958
|
+
|
|
959
|
+
// ../../evals/prompts/vocabulary/grades-3-4-system.txt
|
|
960
|
+
var grades_3_4_system_default = "\nYou are an expert curriculum designer. Your job is to rate the complexity of a text's vocabulary relative to the grade level.\n\nYou will be given a rubric (with levels from least to most complex: slightly complex, moderately complex, very complex, exceedingly complex) as well as guidelines for interpreting the rubric.\nIMPORTANT: You should only pay attention to the vocabulary. Do not evaluate any other element of the text's complexity (e.g. sentence structure, meaning, etc.)\n\n**Resource 1: Qualitative Text Complexity rubric (SAP)**\n1. **Level 1: Slightly complex**\n * Original Definition: Vocabulary that is almost entirely not complex: contemporary, conversational, and/or familiar. A very low proportion of complex words (archaic, subject-specific, academic) is OK -- i.e. doesn't need to be 0.\n * Summary definition: Overall, vocabulary is easy to understand and does not impede comprehension of the bulk of the text (including main idea and supporting claims). 1-2 quick pauses for processing by the student are ok here!\n2. **Level 2: Moderately complex**\n * Original Definition: Vocabulary that is mostly not complex: contemporary, conversational, and/or familiar. A low proportion of complex words (archaic, subject-specific, academic) is OK\n * Summary definition: Overall, vocabulary generally allows students to comprehend the bulk of the text with little difficulty, though there may be occasional pauses for clarification. Several quick pauses or occasional prolonged pauses may occur.\n3. **Level 3: Very complex**\n * Original Definition: Vocabulary that is often complex: unfamiliar, archaic, subject-specific, and/or overly academic\n * Summary definition: Overall, vocabulary often presents challenges that may slow down comprehension but does not completely block the comprehension of the bulk of the text.\n4. 
**Level 4: Exceedingly complex**\n * Original Definition: Vocabulary that is mostly complex: unfamiliar, archaic, subject-specific, and/or overly academic. May be ambiguous or purposefully misleading.\n * Summary definition: Overall, vocabulary is so complex that it makes comprehension of the bulk of the text very challenging and requires careful effort to interpret.\n\n**Resource 2: Flesch-Kincaid Grade Level**\nUse the Flesch-Kincaid (FK) Grade Level as light guidance of the approximate grade level based on readability. The metric alone does not provide final information of vocabulary complexity, but a ballpark of the difficulty of the entire text.\n* grade 2-3: 1.98-5.34\n* grade 4-5: 4.51-7.73\n* grade 6-8: 6.51-10.34\n* grade 9-10: 8.32-12.12\n* grade 11-College: 10.34-14.2\n\n**Guidelines for Interpretation and Reasoning**\n\nYour reasoning is the most critical part of your analysis. It's not enough to simply count complex words. You must analyze their impact on a student at the specified grade level. Use the following principles to guide your judgment:\n\n1. **Density and Cumulative Effect:** Do not just count complex words; evaluate their concentration. A short text with a high density of challenging Tier 2 words (e.g., `peculiar`, `mischievous`, `courageous` for a 4th grader) can be more overwhelming than a longer text with a few scattered Tier 3 words. A constant barrage of unfamiliar words can elevate complexity from `very` to `exceedingly`.\n2. **Contextual Scaffolding:** Assess how the text supports new vocabulary.\n * Are new, complex terms explicitly defined or explained with simple examples (e.g., \"volume... to see if it is big enough to hold a liter of food\")?\n * Is the surrounding language simple and conversational, making the meaning of new words easier to infer?\n * Strong scaffolding can lower the complexity rating. A text with many Tier 3 words that are well-explained might only be `moderately complex`.\n3. **Abstract vs. 
Concrete Vocabulary:** Differentiate between words for abstract concepts and words for concrete things. A text built on abstract Tier 2 words (e.g., `relationships`, `performance`, `non-physical`) can be more challenging than a text that introduces Tier 3 labels for concrete things or people (e.g., `Sumerians`, `polonium`).\n4. **Conceptual Load:** Consider the cognitive load of the vocabulary. A list of many new, multi-syllabic, conceptually-heavy terms (e.g., `Paleolithic`, `Mesolithic`, `Neolithic` for a 3rd grader) can be `very complex` even if the terms are briefly defined, because the student must process multiple new concepts at once.\n5. **Calibrating the Top Levels:** Be precise in your use of `very complex` vs. `exceedingly complex`.\n * **Very complex:** The vocabulary creates significant hurdles and slows the reader down, but the main ideas of the text are still accessible with effort.\n * **Exceedingly complex:** The vocabulary is so dense, technical, or abstract that it acts as a barrier, making it nearly impossible for the target student to grasp the bulk of the text's meaning without extensive outside help. Reserve this for texts saturated with advanced terminology.\n6. **Consider Background Knowledge:** Pay close attention to the provided `student_background_knowledge`. Do not classify a word as complex if the student is likely to be familiar with it (e.g., 'oxygen' for a 3rd grader who has learned about the human body).\n\n**Final Analysis Format**\n\nProvide these information as your final analysis:\n1. **Complex vocabulary:**\n * Tier 2 words: Words that are commonly used in academic settings and more complex than colloquial, or everyday language and often have multiple meanings.\n * Tier 3 words: Overly academic or domain-specific words.\n * Archaic words: Words, or uses of words that are not commonly used in modern conversational language. 
E.g., \"The jury retired to deliberate on their verdict.\" The use of \"retire\" to mean withdrawing to a private place is an archaic use.\n * Other complex words: All other words that can increase complexity of the text (e.g., idioms, unfamiliar proper nouns that function as vocabulary).\n2. **Vocabulary complexity:** one of: slightly complex, moderately complex, very complex, exceedingly complex\n3. **Your reasoning of the complexity:** A detailed explanation of your rating, referencing the principles above.\n";
|
|
961
|
+
|
|
962
|
+
// ../../evals/prompts/vocabulary/other-grades-system.txt
|
|
963
|
+
var other_grades_system_default = "\nYou are an expert curriculum designer. Your job involves reading text snippets intended for students in K-12 and evaluating the complexity of the vocabulary in each text.\n\nYou will be given a rubric (with options 1, 2, 3, 4) as well as guidelines for interpreting the rubric.\n\nIMPORTANT: You should only pay attention to the vocabulary. Do not evaluate any other element of the text's complexity (e.g. sentence structure, meainng, etc.)\nIMPORTANT: Rely on the supplied rubric and annotation guidelines along. Do not introduce any new crtieria for evaluating the complexity of a text's vocabulary.\n\nPlease first reason out loud about the vocabulary complexity of the text and then provide an answer between 1 and 4 (whole numbers only). Provide the answer as an integer (not a float).\n";
|
|
964
|
+
|
|
965
|
+
// src/prompts/vocabulary/system.ts
|
|
966
|
+
/**
 * Pick the vocabulary system prompt for a grade.
 *
 * The strings "3" and "4" select `grades_3_4_system_default`; every other
 * value (matched with strict equality, so the numbers 3 and 4 do not count)
 * selects `other_grades_system_default`.
 *
 * @param {string} grade - Grade identifier to match.
 * @returns {string} The selected system prompt.
 */
function getSystemPrompt(grade) {
  switch (grade) {
    case "3":
    case "4":
      return grades_3_4_system_default;
    default:
      return other_grades_system_default;
  }
}
|
|
972
|
+
|
|
973
|
+
// ../../evals/prompts/vocabulary/grades-3-4-user.txt
|
|
974
|
+
var grades_3_4_user_default = "\nBelow is the text you need to evaluate. Let's think step by step in order to predict the output of the vocabulary complexity task.\n\n- It is intended for grade {student_grade_level}.\n\n- You can assume the student has the following background knowledge about the text \u2014 this background knowledge influences which words from the text are familiar versus unfamiliar for the student: {student_background_knowledge}\n\n- Text Flesch-Kincaid grade level: {fk_level}\n\n- Text to evaluate: [BEGIN TEXT]\n{text}\n[END TEXT]\n";
|
|
975
|
+
|
|
976
|
+
// ../../evals/prompts/vocabulary/other-grades-user.txt
|
|
977
|
+
var other_grades_user_default = `
|
|
978
|
+
Your job is to rate the complexity of a text's vocabulary (relative to the intended level of the text) according to a rubric and annotation guide. Stick to the rubric and annotation guide exactly \u2014 do not introduce any additional criteria or lenses for judging the complexity of the text.
|
|
979
|
+
|
|
980
|
+
[BEGIN ANNOTATION GUIDE AND RUBRIC]
|
|
981
|
+
Instructions
|
|
982
|
+
For the following task, please assume that:
|
|
983
|
+
- The student is on grade level and proficient in all core content areas, including reading fluency, comprehension, science, & social studies. (example).
|
|
984
|
+
- The student is moving through a common progression of topics (detailed here).
|
|
985
|
+
- The student is fluent in speaking English.
|
|
986
|
+
- The student has an "average" amount of background knowledge on topics not commonly covered in curriculum.
|
|
987
|
+
- The student will use this material for independent reading/work, without direct instruction.
|
|
988
|
+
- The text is reasonable for the given grade level.
|
|
989
|
+
|
|
990
|
+
Please do not consider the presence of figurative language when scoring Vocabulary. For example: with a phrase like "kicked the bucket," consider only the qualities of the words themselves ("kicked", "the" and "bucket").
|
|
991
|
+
|
|
992
|
+
Please do be sure to consider:
|
|
993
|
+
- all of the different types of vocabulary (listed below)
|
|
994
|
+
- the overall proportion of complex words in the text - including repeated complex words.
|
|
995
|
+
- the resulting holistic complexity of the vocabulary (described in the Summary section below).
|
|
996
|
+
|
|
997
|
+
Level 1:
|
|
998
|
+
Rubric: Vocabulary that is almost entirely not complex: contemporary, conversational, and/or familiar. That said, a very low proportion of complex words (archaic, subject-specific, academic) is OK -- i.e. doesn't need to be 0.
|
|
999
|
+
|
|
1000
|
+
Level 2:
|
|
1001
|
+
Rubric: Vocabulary that is mostly not complex: contemporary, conversational, and/or familiar. A low proportion of complex words (archaic, subject-specific, academic) is OK, but if it's very low, the text is probably level 1.
|
|
1002
|
+
|
|
1003
|
+
Level 3:
|
|
1004
|
+
Rubric: Vocabulary that is often complex: unfamiliar, archaic, subject-specific, and/or overly academic
|
|
1005
|
+
|
|
1006
|
+
Level 4:
|
|
1007
|
+
Rubric: Vocabulary that is mostly complex: unfamiliar, archaic, subject-specific, and/or overly academic. May be ambiguous or purposefully misleading
|
|
1008
|
+
|
|
1009
|
+
And here are some relevant definitions:
|
|
1010
|
+
- Conversational: Everyday language.
|
|
1011
|
+
- Familiar: Words that the student is likely to have seen/heard, from everyday life or their curriculum. Reminder: assume an "average" level of background knowledge.
|
|
1012
|
+
- Unfamiliar: Words the student has probably not heard, or are being used in an unfamiliar way.
|
|
1013
|
+
- For ex: 4th graders are familiar with the word "table" but may not be familiar with the use of the word with respect to data ("a table of data").
|
|
1014
|
+
- Note:
|
|
1015
|
+
- Words with in-line definitions (via appositives, or because they can be easily inferred from other parts of the text) should be evaluated as less unfamiliar.
|
|
1016
|
+
- For ex: "The pharaoh, a powerful ruler of ancient Egypt, was buried in a grand tomb."
|
|
1017
|
+
- The word "pharaoh" might be unfamiliar or subject-specific, but since is defined within the text, you can consider it a more familiar word.
|
|
1018
|
+
- Unfamiliar proper nouns:
|
|
1019
|
+
- A person's name, even if unfamiliar, generally does not add to complexity.
|
|
1020
|
+
- Other unfamiliar proper nouns (eg locations, organizations) do add to complexity.
|
|
1021
|
+
|
|
1022
|
+
- Subject-specific: Words that are specific to a subject or field of study that are essential for understanding concepts and engaging with the content.
|
|
1023
|
+
- Overly-academic: Words that are excessively formal, complex, or specialized.
|
|
1024
|
+
- For ex: "The agrarian societal structure of the Neolithic Revolution precipitated a paradigm shift in agriculture"
|
|
1025
|
+
- Archaic: A word that was common in the past but is now rarely/almost never used. Could also be a word used in an archaic way.
|
|
1026
|
+
- For ex: "After a long day of court proceedings, the jury 'retired' to deliberate on their verdict."
|
|
1027
|
+
- The word "retire" meaning to stop working may be familiar to a student, but "retire" meaning "withdrawing to a private place" is an archaic use.
|
|
1028
|
+
|
|
1029
|
+
|
|
1030
|
+
Examples
|
|
1031
|
+
The student is on-grade-level:
|
|
1032
|
+
- Consider a 6th grade passage about earth systems. Per NGSS standards, students are introduced to earth systems starting in 2nd grade. They encounter words like: wind, water, river, lake, solids, and liquids. For our rating purposes, we would assume most students following 2nd have encountered these words. In 5th grade, they dive more fully into earth systems concepts, learning vocabulary words like geosphere, sediment, biosphere, atmosphere, ecosystems, organisms and climate. While rating, we would consider the words listed in the NGSS standards as more familiar following that grade level. If the same passage were intended for 3rd graders, though, then the subject-specific vocabulary is likely to be unfamiliar.
|
|
1033
|
+
|
|
1034
|
+
Figurative Language
|
|
1035
|
+
- Kicked the bucket.
|
|
1036
|
+
- The pen is mightier than the sword.
|
|
1037
|
+
- The classroom was a zoo.
|
|
1038
|
+
- He ran faster than the speed of light.
|
|
1039
|
+
[END ANNOTATION GUIDE AND RUBRIC]
|
|
1040
|
+
|
|
1041
|
+
Here are a couple examples of texts that have already been scored along with justification for their scores, which you can use as exemplars:
|
|
1042
|
+
[BEGIN EXAMPLES]
|
|
1043
|
+
|
|
1044
|
+
*** EXAMPLE 1 ***
|
|
1045
|
+
The following text was intended for grade level 11 and received a complexity level of 1.
|
|
1046
|
+
|
|
1047
|
+
Here is the background knowledge assumption for that text: N/A
|
|
1048
|
+
|
|
1049
|
+
Here is the text:
|
|
1050
|
+
// START TEXT //
|
|
1051
|
+
"In a recent lecture, "Is Nothing Sacred?", Salman Rushdie, one of the most censored authors of our time, talked about the importance of books. He grew up in a household in India where books were as sacred as bread. If anyone in the household dropped a piece of bread or a book, the person not only picked it up, but also kissed the object by way of apologizing for clumsy disrespect.
|
|
1052
|
+
|
|
1053
|
+
He goes on to say that he had kissed many books before he had kissed a girl. Bread and books were for his household, and for many like his, food for the body and the soul. This image of the kissing of the book one had accidentally dropped made an impression on me. It speaks to the love and respect many people have for them.
|
|
1054
|
+
|
|
1055
|
+
I grew up in a small town in New Mexico, and we had very few books in our household. The first one I remember reading was my catechism book. Before I went to school to learn English, my mother taught me catechism in Spanish.
|
|
1056
|
+
|
|
1057
|
+
I remember the questions and answers I had to learn, and I remember the well-thumbed, frayed volume which was sacred to me.
|
|
1058
|
+
|
|
1059
|
+
Growing up with few books in the house created in me a desire and a need for them. When I started school, I remember visiting the one room library of our town and standing in front of the dusty shelves. In reality there were only a few shelves and not over a thousand books, but I wanted to read them all. There was food for my soul in the books, that much I realized."
|
|
1060
|
+
// END TEXT //
|
|
1061
|
+
|
|
1062
|
+
Here is the reasoning for that complexity level:
|
|
1063
|
+
// START REASONING //
|
|
1064
|
+
This text is a 1 for vocabulary, because the vocabulary that is used is familiar and accessible for a proficient 11th grader. Most of the words used in the text are very common everyday vocabulary for describing growing up, family life, and the importance of reading. A few examples of these very common words are: small town, book, school, learn, food, kissed, image, respect, love, speaks. There are many more in the text. In this text there are only a few "juicier" or more complex words, you can think of those as words that are less familiar, have a more abstract or nuanced meaning, or carry a very large concept. Less commonly spoken words that were used in the text were: frayed, volume, censored, clumsy, sacred. These are still well within reach of a proficient 11th grader, and would still be considered familiar, because they will have encountered them in past reading or academic studies. In the text there are a couple of words that are outliers, but they are not essential to the understanding of the larger text. One of these words or hyphenated compound phrase is well-frayed. A compound phrase is a phrase consisting of multiple words that work together to create a specific meaning or idea, often acting as a single unit in a sentence. If the meaning of individual words is familiar, it is typically quite easy for proficient readers to generalize the larger meaning that the author is implying with their word choice. In this case, proficient students will be accustomed to the phrase well, with the secondary meaning of very, rather than a description of positivity or health; and they will be accustomed to the use frayed, as in worn, aged, or damaged from use. Making the leap to identify the meaning of "well-frayed" as a book that is very used, will take only moments for a proficient 11th grader. 
Another word that stands out in the text is the word catechism, which might be new for many students based on their personal background or location, but a full understanding of what a catechism book contains is not essential for understanding the paragraph or whole text. The reader can make it through using minimum context clues to know that the catechism must be something important to his family. The type of book he learned to read before going to school is not critical for comprehension, it's enough to understand that reading was so important in his family, his mother started instruction before he even started school. Additionally, it's important to know that having one unknown word for an 11th grade reading, does not merit a rating higher than one.
|
|
1065
|
+
|
|
1066
|
+
It is worth noting that another reason this text is a 1, is that the content or topic of the passage is so familiar and covered extensively in K-12 education, i.e. reading is important, loving books, growing up; that coupled with the simple vocabulary choices, getting to the meaning of the overall text, and even the paragraphs, would be incredibly easy for a proficient 11th grader.
|
|
1067
|
+
// END REASONING //
|
|
1068
|
+
*** EXAMPLE 2 ***
|
|
1069
|
+
The following text was intended for grade level 5 and received a complexity level of 2.
|
|
1070
|
+
|
|
1071
|
+
Here is the background knowledge assumption for that text: Background Knowledge Assumption: Students are likely familiar with the concept of natural disasters, including hurricanes, and basic atmospheric concepts like high and low pressure from their studies on weather and climate. They may not be familiar with the specific formation processes of hurricanes or the global terminology differences (hurricane, typhoon, cyclone).
|
|
1072
|
+
|
|
1073
|
+
Here is the text:
|
|
1074
|
+
// START TEXT //
|
|
1075
|
+
Great whirling storms roar out of the oceans in many parts of the world. They are called by several names\u2014hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms.
|
|
1076
|
+
|
|
1077
|
+
Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes.
|
|
1078
|
+
// END TEXT //
|
|
1079
|
+
|
|
1080
|
+
Here is the reasoning for that complexity level:
|
|
1081
|
+
// START REASONING //
|
|
1082
|
+
I scored this a 2 because of the density of subject-specific vocabulary related to weather and climate, which is often covered in lower grade levels. This adds to the complexity above a 1, but it is not a level 3 because of the familiarity with the topic, which implies some familiarity with the vocabulary as well. The specific formation process and the vocabulary used to explain the processes are also subject-specfiic but not famliar, which would make the second paragraph a level 3 in the rubric language, but when considering the language used in the overall SUMMARY below the rubric, this new content and vocabulary would cause quick pauses and/or occasional prolonged pauses but would not cause the reader to slow down to due to challenging overall comprehension of the key ideas and supporting claims. This is especially the case because the second paragraph builds upon prior knowledge and familiar vocabulary use, so it is not entirely new information and vocabulary. While there is subject-specific vocabulary used, overly academic vocabulary is NOT used and is more conversational in nature, such as "great whiring storms" and "born" / "giving birth" to storm (although this is the way storms are described!) rather than more technical terms which made comprehension easier due to the accessibility of the vocabulary (even if used in other contexts before reading this text). Words such as "a lot" and "bigger" are more conversational, and while technical, unfamiliar words are provided, such as "hurricane," "typhoon," and "cyclone," knowing and understanding their differences is not necessary to grasp the main idea. The processes by which they are formed are what need to be retained while reading the entire text, and familiarity with the bulk of the vocabulary used would allow for that to happen without too much struggle to make meaning of it. 
Additionally, the text does not contain any archaic vocabulary or ambiguous words, which prevents it from reaching a rating of 4, although it is not necessary that they text have such vocabulary to meet a level 4, the frequent inclusion of such vocabulary makes it more likely to land at least a 3 or 4.
|
|
1083
|
+
// END REASONING //
|
|
1084
|
+
|
|
1085
|
+
*** EXAMPLE 3 ***
|
|
1086
|
+
The following text was intended for grade level 6 and received a complexity level of 3.
|
|
1087
|
+
|
|
1088
|
+
Here is the background knowledge assumption for that text: Background Knowledge Assumption: Students are likely familiar with basic Earth science concepts such as rocks, minerals, and fossils, as well as natural processes like volcanic eruptions and earthquakes. They may not be familiar with more advanced topics like plate tectonics or the specific branches of geology such as mineralogy, petrology, and seismology.
|
|
1089
|
+
|
|
1090
|
+
Here is the text:
|
|
1091
|
+
// START TEXT //
|
|
1092
|
+
Geology is the scientific study of Earth. Geologists study the planet\u2014its formation, its internal structure, its materials, its chemical and physical processes, and its history. Mountains, valleys, plains, sea floors, minerals, rocks, fossils, and the processes that create and destroy each of these are all the domain of the geologist. Geology is divided into two broad categories of study: physical geology and historical geology.
|
|
1093
|
+
|
|
1094
|
+
Physical geology is concerned with the processes occurring on or below the surface of Earth and the materials on which they operate. These processes include volcanic eruptions, landslides, earthquakes, and floods. Materials include rocks, air, seawater, soils, and sediment. Physical geology further divides into more specific branches, each of which deals with its own part of Earth's materials, landforms, and processes. Mineralogy and petrology investigate the composition and origin of minerals and rocks. Volcanologists study lava, rocks, and gases on live, dormant, and extinct volcanoes. Seismologists use instruments to monitor and predict earthquakes and volcanic eruptions.
|
|
1095
|
+
|
|
1096
|
+
Historical geology is concerned with the chronology of events, both physical and biological, that have taken place in Earth's history. Paleontologists study fossils (remains of ancient life) for evidence of the evolution of life on Earth. Fossils not only relate evolution, but also speak of the environment in which the organism lived. Corals in rocks at the top of the Grand Canyon in Arizona, for example, show a shallow sea flooded the area around 290 million years ago. In addition, by determining the ages and types of rocks around the world, geologists piece together continental and oceanic history over the past few billion years. Plate tectonics (the study of the movement of the sections of Earth's crust) adds to Earth's story with details of the changing configuration of the continents and oceans.
|
|
1097
|
+
// END TEXT //
|
|
1098
|
+
|
|
1099
|
+
Here is the reasoning for that complexity level:
|
|
1100
|
+
// START REASONING //
|
|
1101
|
+
To determine the complexity rating of this text based on the vocabulary present, I used the annotation guide, scoring rubric, and examples to set the expectations for rating. During the first read of the text, I "bolded" and categorized the more challenging vocabulary words according to the following complexity groupings: archaic, unfamiliar, archaic, subject-specific, and/or overly academic. On the second read, I considered the main idea or "gist" that students need to acquire understanding of. I then referenced the previously mentioned tools\u2013annotation guide, scoring rubric, and examples to remind myself of the expectations for rating. I agreed that readers would have familiarity with basic concepts of geology; however, I also considered the definitions provided for words such as Geology, Geologists, Physical Geology, Historical Geology, Mineralogy, and Petrology. I considered how students might pause for clarification and for how long. After reviewing the Annotation Guide while considering, I narrowed the rating down because the definitions provided throughout the text of more complex words should make the meaning of the text more accessible for readers, which is why although the words are subject-specific, I rated this text as a 3 instead of a 2-less complex or a 4\u2013more complex. I read the text one final time to ensure clarity around my rating, scored and wrote the justification.
|
|
1102
|
+
// END REASONING //
|
|
1103
|
+
[END EXAMPLES]
|
|
1104
|
+
|
|
1105
|
+
Below is the text you need to evaluate. It is intended for grade {student_grade_level}.
|
|
1106
|
+
|
|
1107
|
+
As you read the text, you can assume the student has the following background knowledge about the text \u2014 this background knowledge influences which words from the text are familiar versus unfamiliar for the student: {student_background_knowledge}
|
|
1108
|
+
|
|
1109
|
+
[BEGIN TEXT]
|
|
1110
|
+
{text}
|
|
1111
|
+
[END TEXT]
|
|
1112
|
+
|
|
1113
|
+
In your response, when specifying the level of complexity, be sure to use only a single integer (e.g. 2) and don't include any other text (e.g. don't say "level 2").
|
|
1114
|
+
`;
|
|
1115
|
+
|
|
1116
|
+
// src/prompts/vocabulary/user.ts
|
|
1117
|
+
/**
 * Build the user prompt for the vocabulary-complexity stage.
 *
 * Grades "3" and "4" use `grades_3_4_user_default`; every other grade uses
 * `other_grades_user_default`. The placeholders `{student_grade_level}`,
 * `{student_background_knowledge}`, `{fk_level}` and `{text}` are filled in
 * wherever they occur in the chosen template.
 *
 * Substitution is done in a single pass with a function replacer so that:
 *  - `$`-sequences in the inserted values (`$$`, `$&`, `` $` ``, `$'`) are
 *    inserted literally instead of being interpreted as replacement patterns;
 *  - placeholder-looking text inside an inserted value (for example `{text}`
 *    inside the background knowledge) is never expanded a second time.
 *
 * @param {string} text - The passage being evaluated.
 * @param {string} studentGradeLevel - Target grade as a string (e.g. "5").
 * @param {string} studentBackgroundKnowledge - Background-knowledge assumption to embed.
 * @param {number} fkLevel - Flesch-Kincaid grade level; embedded via `toString()`.
 * @returns {string} The fully populated prompt.
 */
function getUserPrompt(text, studentGradeLevel, studentBackgroundKnowledge, fkLevel) {
  const template = studentGradeLevel === "3" || studentGradeLevel === "4" ? grades_3_4_user_default : other_grades_user_default;
  const values = {
    student_grade_level: studentGradeLevel,
    student_background_knowledge: studentBackgroundKnowledge,
    fk_level: fkLevel.toString(),
    text
  };
  return template.replace(
    /\{(student_grade_level|student_background_knowledge|fk_level|text)\}/g,
    (_match, key) => String(values[key])
  );
}
|
|
1121
|
+
|
|
1122
|
+
// src/evaluators/vocabulary.ts
|
|
1123
|
+
/**
 * Vocabulary complexity evaluator.
 *
 * Runs two LLM stages per evaluation:
 *  1. a background-knowledge assumption generated with gpt-4o-2024-11-20;
 *  2. a structured complexity rating produced by Gemini 2.5 Pro for grades
 *     "3" and "4", or by GPT-4.1 for every other supported grade.
 */
var VocabularyEvaluator = class _VocabularyEvaluator extends BaseEvaluator {
  static metadata = {
    id: "vocabulary",
    name: "Vocabulary",
    description: "Evaluates vocabulary complexity of educational texts relative to grade level",
    supportedGrades: ["3", "4", "5", "6", "7", "8", "9", "10", "11", "12"],
    requiresGoogleKey: true,
    requiresOpenAIKey: true
  };
  // Provider used for the complexity stage when grade is "3" or "4".
  grades34ComplexityProvider;
  // Provider used for the complexity stage for all other grades.
  otherGradesComplexityProvider;
  // Provider used for the background-knowledge stage.
  backgroundKnowledgeProvider;
  /**
   * @param config - Evaluator configuration; `googleApiKey` and `openaiApiKey`
   *   are read here, and `maxRetries` is read from the base class's `this.config`.
   */
  constructor(config) {
    super(config);
    this.grades34ComplexityProvider = createProvider({
      type: "google",
      model: "gemini-2.5-pro",
      apiKey: config.googleApiKey,
      maxRetries: this.config.maxRetries
    });
    this.otherGradesComplexityProvider = createProvider({
      type: "openai",
      model: "gpt-4.1-2025-04-14",
      apiKey: config.openaiApiKey,
      maxRetries: this.config.maxRetries
    });
    this.backgroundKnowledgeProvider = createProvider({
      type: "openai",
      model: "gpt-4o-2024-11-20",
      apiKey: config.openaiApiKey,
      maxRetries: this.config.maxRetries
    });
  }
  /**
   * Evaluate vocabulary complexity for a given text and grade level
   *
   * @param text - The text to evaluate
   * @param grade - The target grade level (3-12)
   * @returns Evaluation result with complexity score and detailed analysis
   * @throws {ValidationError} If text is empty, too short/long, or grade is invalid
   * @throws {APIError} If LLM API calls fail (includes AuthenticationError, RateLimitError, NetworkError, TimeoutError)
   */
  async evaluate(text, grade) {
    // `text` has not been validated yet and may be null/undefined. Reading
    // `.length` unguarded would throw a TypeError here (and again inside the
    // catch block) instead of letting validateText() reject the input.
    const textLength = text?.length ?? 0;
    this.logger.info("Starting vocabulary evaluation", {
      evaluator: "vocabulary",
      operation: "evaluate",
      grade,
      textLength
    });
    const startTime = Date.now();
    const stageDetails = [];
    const backgroundProviderName = "openai:gpt-4o-2024-11-20";
    const complexityProviderName = grade === "3" || grade === "4" ? "google:gemini-2.5-pro" : "openai:gpt-4.1-2025-04-14";
    const providerLabel = `${backgroundProviderName} + ${complexityProviderName}`;
    // Sums one token_usage field over the stages recorded so far.
    const sumTokens = (field) => stageDetails.reduce((sum, s) => sum + (s.token_usage?.[field] || 0), 0);
    try {
      this.validateText(text);
      this.validateGrade(grade, new Set(_VocabularyEvaluator.metadata.supportedGrades));
      this.logger.debug("Stage 1: Generating background knowledge", {
        evaluator: "vocabulary",
        operation: "background_knowledge"
      });
      const bgResponse = await this.getBackgroundKnowledgeAssumption(text, grade);
      stageDetails.push({
        stage: "background_knowledge",
        provider: backgroundProviderName,
        latency_ms: bgResponse.latencyMs,
        token_usage: {
          input_tokens: bgResponse.usage.inputTokens,
          output_tokens: bgResponse.usage.outputTokens
        }
      });
      const fkLevel = calculateFleschKincaidGrade(text);
      const complexityResponse = await this.evaluateComplexity(
        text,
        grade,
        bgResponse.knowledge.assumption,
        fkLevel
      );
      stageDetails.push({
        stage: "complexity_evaluation",
        provider: complexityProviderName,
        latency_ms: complexityResponse.latencyMs,
        token_usage: {
          input_tokens: complexityResponse.usage.inputTokens,
          output_tokens: complexityResponse.usage.outputTokens
        }
      });
      const latencyMs = Date.now() - startTime;
      const totalTokenUsage = {
        input_tokens: sumTokens("input_tokens"),
        output_tokens: sumTokens("output_tokens")
      };
      const result = {
        score: complexityResponse.data.complexity_score,
        reasoning: complexityResponse.data.reasoning,
        metadata: {
          model: providerLabel,
          processingTimeMs: latencyMs
        },
        _internal: complexityResponse.data
      };
      // Telemetry is best-effort: a failed send must not fail the evaluation.
      this.sendTelemetry({
        status: "success",
        latencyMs,
        textLength,
        grade,
        provider: providerLabel,
        tokenUsage: totalTokenUsage,
        metadata: {
          stage_details: stageDetails
        },
        inputText: text
      }).catch(() => {
      });
      this.logger.info("Vocabulary evaluation completed successfully", {
        evaluator: "vocabulary",
        operation: "evaluate",
        grade,
        score: result.score,
        processingTimeMs: latencyMs
      });
      return result;
    } catch (error) {
      const latencyMs = Date.now() - startTime;
      this.logger.error("Vocabulary evaluation failed", {
        evaluator: "vocabulary",
        operation: "evaluate",
        grade,
        error: error instanceof Error ? error : void 0,
        processingTimeMs: latencyMs,
        completedStages: stageDetails.length
      });
      // Only report token usage / stage details when at least one stage finished.
      const hasStages = stageDetails.length > 0;
      const totalTokenUsage = hasStages ? {
        input_tokens: sumTokens("input_tokens"),
        output_tokens: sumTokens("output_tokens")
      } : void 0;
      // Telemetry is best-effort here as well; the original error is what gets thrown.
      this.sendTelemetry({
        status: "error",
        latencyMs,
        textLength,
        grade,
        provider: providerLabel,
        tokenUsage: totalTokenUsage,
        errorCode: error instanceof Error ? error.name : "UnknownError",
        metadata: hasStages ? { stage_details: stageDetails } : void 0,
        inputText: text
      }).catch(() => {
      });
      // Validation failures are surfaced unchanged; everything else is wrapped.
      if (error instanceof ValidationError) {
        throw error;
      }
      throw wrapProviderError(error, "Vocabulary evaluation failed");
    }
  }
  /**
   * Stage 1: Generate background knowledge assumption
   *
   * Sends the background-knowledge prompt as a single user message at
   * temperature 0 and returns the trimmed response text together with the
   * provider's usage and latency figures.
   */
  async getBackgroundKnowledgeAssumption(text, grade) {
    const prompt = getBackgroundKnowledgePrompt(text, grade);
    const response = await this.backgroundKnowledgeProvider.generateText(
      [{ role: "user", content: prompt }],
      0
      // temperature = 0 for consistency
    );
    return {
      knowledge: {
        assumption: response.text.trim(),
        grade
      },
      usage: response.usage,
      latencyMs: response.latencyMs
    };
  }
  /**
   * Stage 2: Evaluate vocabulary complexity
   *
   * Sends the system and user prompts to the grade-appropriate provider and
   * requests output structured by VocabularyComplexitySchema at temperature 0.
   * Grades 3-4 use Gemini 2.5 Pro; grades 5-12 use GPT-4.1.
   */
  async evaluateComplexity(text, grade, backgroundKnowledge, fkLevel) {
    const systemPrompt = getSystemPrompt(grade);
    const userPrompt = getUserPrompt(text, grade, backgroundKnowledge, fkLevel);
    const provider = grade === "3" || grade === "4" ? this.grades34ComplexityProvider : this.otherGradesComplexityProvider;
    const response = await provider.generateStructured({
      messages: [
        { role: "system", content: systemPrompt },
        { role: "user", content: userPrompt }
      ],
      schema: VocabularyComplexitySchema,
      temperature: 0
    });
    return {
      data: response.data,
      usage: response.usage,
      latencyMs: response.latencyMs
    };
  }
};
|
|
1322
|
+
/**
 * One-shot helper: construct a VocabularyEvaluator from `config` and run a
 * single evaluation with it.
 *
 * @param text - The text to evaluate.
 * @param grade - The target grade level, passed through to `evaluate`.
 * @param config - Configuration handed to the VocabularyEvaluator constructor.
 * @returns Whatever `VocabularyEvaluator#evaluate` resolves to.
 */
async function evaluateVocabulary(text, grade, config) {
  return new VocabularyEvaluator(config).evaluate(text, grade);
}
|
|
1326
|
+
|
|
1327
|
+
// ../../evals/prompts/sentence-structure/analysis-system.txt
|
|
1328
|
+
// System prompt text for the sentence-structure analysis stage; returned
// unmodified by getSystemPromptAnalysis().
var analysis_system_default = "You are an expert in grammar and literacy.";
|
|
1329
|
+
|
|
1330
|
+
// ../../evals/prompts/sentence-structure/analysis-user.txt
|
|
1331
|
+
var analysis_user_default = `
|
|
1332
|
+
# Task
|
|
1333
|
+
I am going to give you a text, and I need you to look through the text sentence-by-sentence to perform a comprehensive grammatical analysis. Use the computational counts as a reference; they can be incorrect in ambiguous cases.
|
|
1334
|
+
|
|
1335
|
+
# Definitions
|
|
1336
|
+
* Sentences: Count a complete grammatical unit ending in a terminal punctuation mark.
|
|
1337
|
+
* Words: Count any sequence of characters separated by a space as one word. Treat hyphenated words (e.g., "state-of-the-art") and numbers (e.g., "2025") as single words.
|
|
1338
|
+
* Independent Clauses: Clauses that can stand alone as a complete sentence.
|
|
1339
|
+
* Subordinate Clauses: Clauses that are dependent on the main clause and cannot stand alone as a complete sentence.
|
|
1340
|
+
* Simple Sentences: Sentences with one independent clause and no subordinate clauses.
|
|
1341
|
+
* Compound Sentences: Sentences with two or more independent clauses and no subordinate clauses.
|
|
1342
|
+
* Complex Sentences: Sentences with one independent clause and at least one subordinate clause.
|
|
1343
|
+
* Compound-Complex Sentences: Sentences with two or more independent clauses and at least one subordinate clause.
|
|
1344
|
+
* Other / Non-Canonical Sentences: Sentences that cannot be reliably classified as simple, compound, complex, or compound-complex (e.g., sentence fragments, run-ons, elliptical responses, headlines, imperatives lacking an explicit subject, or stylized dialogue tags).
|
|
1345
|
+
* Subordinate Clauses: Clauses that are dependent on the main clause and cannot stand alone as a complete sentence.
|
|
1346
|
+
* Embedded Clauses: Clauses that are nested within another clause.
|
|
1347
|
+
* Prepositional Phrases: Phrases that begin with a preposition and end with a noun phrase.
|
|
1348
|
+
* Participle Phrases: Phrases that begin with a participle and end with a noun phrase.
|
|
1349
|
+
* Appositive Phrases: Phrases that rename or identify a noun phrase.
|
|
1350
|
+
* Simple Transitions: Basic coordinating conjunctions and chronological adverbs. Examples: 'and', 'but', 'or', 'so', 'then', 'next', 'first'.
|
|
1351
|
+
* Sophisticated Transitions: Conjunctive adverbs and phrases signaling logical relationships. Examples: 'however', 'therefore', 'consequently', 'as a result', 'for example', 'although'.
|
|
1352
|
+
* One-Concept Sentence: A sentence with ZERO subordinate clauses AND ZERO transition words/phrases (neither simple nor sophisticated).
|
|
1353
|
+
* Multi-Concept Sentence: Any sentence that has \u22651 subordinate clause OR \u22651 transition word/phrase (or both).
|
|
1354
|
+
* Basic Complex Sentences: Sentences with exactly one independent clause and at one dependent (subordinate) clause.
|
|
1355
|
+
* Advanced Complex Sentences: Sentences with two or more of any of those following (can include a mix, doesn't have to be two of the same type) subordinate phrases, clauses, transition words, or any other meaningful "interruptions" to the flow of the sentence (like not-only-but-also constructions, dashes, semicolons, and lengthy appositives). A sentence can be advanced complex if it has just one subordinate phrase or clause alongside a transition phrase, like: "For example, the British favored trade with Hong Kong, assuming favorable trade conditions.
|
|
1356
|
+
|
|
1357
|
+
# Computational Counts
|
|
1358
|
+
Use these as reference, your internal heuristics can be more reliable.
|
|
1359
|
+
{ground_truth_counts}
|
|
1360
|
+
|
|
1361
|
+
# Text to Analyze
|
|
1362
|
+
[BEGIN TEXT]
|
|
1363
|
+
{text}
|
|
1364
|
+
[END TEXT]
|
|
1365
|
+
|
|
1366
|
+
IMPORTANT: Your response should be a single JSON object with the following structure. Do not produce anything outside of the JSON object.
|
|
1367
|
+
|
|
1368
|
+
{format_instructions}
|
|
1369
|
+
`;
|
|
1370
|
+
|
|
1371
|
+
// src/prompts/sentence-structure/analysis.ts
|
|
1372
|
+
/**
 * Return the system prompt for the sentence-structure analysis stage.
 *
 * @returns {string} The `analysis_system_default` text, unmodified.
 */
function getSystemPromptAnalysis() {
  return analysis_system_default;
}
|
|
1375
|
+
/**
 * Build the user prompt for the sentence-structure analysis stage.
 *
 * Fills the `{text}`, `{ground_truth_counts}` and `{format_instructions}`
 * placeholders of `analysis_user_default`; `{format_instructions}` is always
 * replaced with an empty string.
 *
 * Substitution is done in a single pass with a function replacer so that:
 *  - `$`-sequences in the passage or counts (`$$`, `$&`, `` $` ``, `$'`) are
 *    inserted literally rather than interpreted as replacement patterns;
 *  - placeholder-looking text inside the passage (for example a literal
 *    `{format_instructions}`) is left untouched instead of being expanded or
 *    erased by a later replacement step.
 *
 * @param {string} text - The passage to analyze.
 * @param {string} groundTruthCounts - Pre-computed counts to embed as reference.
 * @returns {string} The fully populated prompt.
 */
function getUserPromptAnalysis(text, groundTruthCounts) {
  const values = {
    text,
    ground_truth_counts: groundTruthCounts,
    format_instructions: ""
  };
  return analysis_user_default.replace(
    /\{(text|ground_truth_counts|format_instructions)\}/g,
    (_match, key) => String(values[key])
  );
}
|
|
1378
|
+
|
|
1379
|
+
// ../../evals/prompts/sentence-structure/complexity-system.txt
|
|
1380
|
+
// Persona text for the sentence-structure complexity prompt (bundled from
// complexity-system.txt). NOTE(review): the code that consumes it is not
// visible in this part of the file.
var complexity_system_default = "You are an expert in grammar and literacy, and understand K-12 and Qualitative Text Complexity rubric (SAP).";
|
|
1381
|
+
|
|
1382
|
+
// ../../evals/prompts/sentence-structure/complexity-user.txt
|
|
1383
|
+
// User prompt template for the sentence-structure complexity rating. Placeholders:
// {grade} (appears twice), {rubric}, {excerpt}, {sentence_features}, {format_instructions}.
// NOTE(review): because {grade} occurs twice, whatever fills this template must
// replace every occurrence, not just the first — confirm against the caller.
// NOTE(review): the allowed-answer list punctuates inconsistently ("Slightly Complex,"
// has the comma inside the quotes) and uses title case, whereas the TextComplexityLevel
// enum at the top of the file uses sentence case ("Slightly complex") — confirm the
// output schema used with this prompt accepts these labels.
var complexity_user_default = '\nYour task is to perform a text complexity analysis for a Grade {grade} student. You will be given a text excerpt and a set of quantitative sentence-level statistics for that text.\n\nYou must integrate both the qualitative aspects of the text and the quantitative statistics to make your final judgment. Do not rely on the numbers alone.\n\n1. Read the TEXT EXCERPT to understand its topic, conceptual load, and overall structure.\n2. Review the TEXT STATISTICS as a guide for complexity level.\n3. Synthesize your findings in your reasoning. Explain how the structure (qualitative) interact with the text statistics (quantitative) to determine the complexity. For example, a text with simple sentences might still be complex if the topic is very dense or abstract.\n\nYour final answer must be one of ["Slightly Complex," "Moderately Complex," "Very Complex", "Exceedingly Complex"].\n\n# GRADE {grade} RUBRIC\n{rubric}\n\n# TEXT EXCERPT\n[BEGIN TEXT]\n{excerpt}\n[END TEXT]\n\n# TEXT STATISTICS\n{sentence_features}\n\n# OUTPUT FORMAT\n{format_instructions}\n';
|
|
1384
|
+
|
|
1385
|
+
// ../../evals/prompts/sentence-structure/rubric-grade-3.txt
|
|
1386
|
+
// Grade 3 rubric text; getRubricForGrade() returns it when grade === "3" and it is inserted
// into the complexity user prompt in place of {rubric}. The back-ticked names in the text
// (percent_simple_sentences, avg_sentence_length, ...) are statistic names quoted to the model.
var rubric_grade_3_default = '\n **Instructions for Analysis:** First, evaluate if the text meets the criteria for "Slightly Complex" or "Exceedingly Complex". If it does not fit into these categories, then decide between "Moderately Complex" and "Very Complex".\n\n **Slightly Complex:**\n * **Description:** The text consists of simple, straightforward language and sentence structures.\n * **Statistical Guidelines:** The text is likely "Slightly Complex" if it meets at least TWO of the following criteria:\n * **Sentence Type:** Primarily simple sentences. (`percent_simple_sentences` is typically > 60%).\n * **Sentence Length:** Short sentences. (`avg_sentence_length` is typically < 12 words).\n * **Subordination:** Very low use of clauses. (`percent_sentences_with_subordinate` is typically < 25%).\n\n **Moderately Complex:**\n * **Description:** The text shows a mix of simple and more complex sentences, introducing some variety in structure without being overly demanding.\n * **Statistical Guidelines:** If the text is not "Slightly Complex", consider "Moderately Complex" if it generally aligns with these ranges:\n * **Sentence Type:** A balanced mix of sentence types. (`percent_simple_sentences` is typically between 40% and 60%).\n * **Sentence Length:** Medium length sentences. (`avg_sentence_length` is typically between 12 and 16 words).\n * **Subordination:** A moderate use of clauses. (`percent_sentences_with_subordinate` is typically between 25% and 45%).\n\n **Very Complex:**\n * **Description:** The text features more elaborate sentences with multiple clauses and ideas, requiring more effort from the reader to parse. This is often the default category for grade-level text that isn\'t simple or exceptionally difficult.\n * **Statistical Guidelines:** If the text is more complex than "Moderately" but does not meet the "Exceedingly" criteria, it is likely "Very Complex". Key indicators include:\n * **Sentence Type:** Complex structures are common. 
(`percent_simple_sentences` is a minority, typically < 40%).\n * **Sentence Length:** Longer sentences are frequent. (`avg_sentence_length` is typically between 16 and 19 words).\n * **Subordination:** Subordinate clauses are a key feature. (`percent_sentences_with_subordinate` is typically > 45%).\n\n **Exceedingly Complex:**\n * **Description:** The text is dense with very long, intricate sentences and a high degree of subordination, making it exceptionally challenging for this grade level.\n * **Statistical Guidelines:** The text is "Exceedingly Complex" if it shows an extreme combination of sentence length and structural density. It should meet at least **TWO** of the following criteria, including at least **ONE** from the "Structural Density" group.\n * **Structural Density Indicators:**\n * High Subordination: `percent_sentences_with_subordinate` is extensive (typically > 50%).\n * Multiple Subordinates: `percent_sentences_with_multiple_subordinates` is consistently present (typically > 12%).\n * High Syntactic Complexity: `percent_compound_complex_sentences` is significant (typically > 15%).\n * **Length Indicators:**\n * Extreme Sentence Length: `avg_sentence_length` is very long (typically > 19 words).\n * Low Simplicity: `percent_simple_sentences` is very low (typically < 30%).\n * Concentrated Length: `percent_very_long_sentences` is notable (typically > 10%).\n';
|
|
1387
|
+
|
|
1388
|
+
// ../../evals/prompts/sentence-structure/rubric-grade-4.txt
|
|
1389
|
+
// Grade 4 rubric text; getRubricForGrade() returns it when grade === "4" and it is inserted
// into the complexity user prompt in place of {rubric}. Same layout as the grade 3 rubric,
// with different thresholds and an extra "Multiple Subordination" indicator under "Very Complex".
var rubric_grade_4_default = '\n **Instructions for Analysis:** First, evaluate if the text meets the criteria for "Slightly Complex" or "Exceedingly Complex". If it does not fit into these categories, then decide between "Moderately Complex" and "Very Complex".\n\n **Slightly Complex:**\n * **Description:** The text uses clear, direct language with basic sentence structures appropriate for developing readers.\n * **Statistical Guidelines:** The text is likely "Slightly Complex" if it meets at least TWO of the following criteria:\n * **Sentence Type:** Dominated by simple sentences. (`percent_simple_sentences` is typically > 55%).\n * **Sentence Length:** Short to medium sentences. (`avg_sentence_length` is typically < 13 words).\n * **Subordination:** Infrequent use of clauses. (`percent_sentences_with_subordinate` is typically < 30%).\n\n **Moderately Complex:**\n * **Description:** The text contains a variety of sentence structures, including compound and complex sentences, but remains accessible.\n * **Statistical Guidelines:** If the text is not "Slightly Complex", consider "Moderately Complex" if it generally aligns with these ranges:\n * **Sentence Type:** A healthy mix of sentence types. (`percent_simple_sentences` is typically between 40% and 55%).\n * **Sentence Length:** Medium length sentences. (`avg_sentence_length` is typically between 13 and 17 words).\n * **Subordination:** A moderate number of clauses. (`percent_sentences_with_subordinate` is typically between 30% and 50%).\n\n **Very Complex:**\n * **Description:** The text is characterized by longer sentences and the regular use of dependent clauses, requiring readers to track multiple ideas. This is the default for challenging, on-grade-level texts.\n * **Statistical Guidelines:** If the text is more complex than "Moderately" but does not meet the "Exceedingly" criteria, it is likely "Very Complex". Key indicators include:\n * **Sentence Type:** Simple sentences are a clear minority. 
(`percent_simple_sentences` is typically < 40%).\n * **Sentence Length:** Sentences are consistently long. (`avg_sentence_length` is typically between 17 and 22 words).\n * **Subordination:** Subordination is a major feature. (`percent_sentences_with_subordinate` is typically > 50%).\n * **Multiple Subordination:** Sentences with multiple clauses appear more often. (`percent_sentences_with_multiple_subordinates` is typically > 8%).\n\n **Exceedingly Complex:**\n * **Description:** The text\'s structure is highly sophisticated and dense, marked by extensive use of embedded clauses and long, flowing sentences that are well above grade-level expectations.\n * **Statistical Guidelines:** A text is "Exceedingly Complex" if its structure is highly sophisticated and dense. It should meet at least **TWO** of the following criteria, including at least **ONE** from the "Structural Density" group.\n * **Structural Density Indicators:**\n * High Subordination: `percent_sentences_with_subordinate` is very high (typically > 60%).\n * Multiple Subordinates: `percent_sentences_with_multiple_subordinates` is high and consistent (typically > 15%).\n * High Syntactic Complexity: `percent_compound_complex_sentences` is a notable feature (typically > 20%).\n * **Length Indicators:**\n * Extreme Sentence Length: `avg_sentence_length` is exceptionally long (typically > 22 words).\n * Low Simplicity: `percent_simple_sentences` is very low (typically < 25%).\n * Concentrated Length: `percent_very_long_sentences` is significant (typically > 15%).\n';
|
|
1390
|
+
|
|
1391
|
+
// ../../evals/prompts/sentence-structure/rubric-grades-5-12.txt
|
|
1392
|
+
// Rubric text that getRubricForGrade() returns for every grade other than "3" and "4".
// Unlike the grade 3/4 rubrics it is phrased in terms of counts/percentages of simple,
// compound and "advanced complex" sentences.
// NOTE(review): the "Very Complex" paragraph reads ">= 65)%" - the closing parenthesis looks
// misplaced. The string is sent to the model verbatim, so confirm against the source prompt
// file before editing it.
var rubric_grades_5_12_default = "\n **Slightly Complex:** A text is in the Slightly Complex bucket if it has at least 50% simple sentences. If it doesn't, the text is a higher level of complexity. If the % of simple sentences is >= 50% and the % of compound sentences is >= 20%, the text is Moderately Complex, otherwise, the text is Slightly Complex. Slightly Complex texts NEVER have advanced complex sentences \u2014 the presence of an advanced complex sentence always leads to a higher level of complexity than Slightly.\n **For Moderately Complex:** These texts can take on any distribution of sentence types as long as there aren't more than 2 advanced complex sentences and as long as there aren't so many simple sentences that the text becomes Slightly Complex. That means Moderately Complex texts may have many simple sentences (although not so many that the text is Slightly Complex), compound sentences, and/or basic complex sentences. It's also possible for a moderately complex text to contain one or two advanced complex sentences, as long as there aren't more than 2. If there are more than 2, then the text is either Very or Exceedingly complex.\n **Very Complex:** These texts contain 3 or more advanced complex sentences (unless the percentage of advanced complex sentences is >= 65)%, in which case the text becomes Exceedingly Complex). They may still contain many simple, compound, and basic complex sentences, but a text is not Very Complex unless there are 3 or more advanced complex sentences.\n **Exceedingly Complex:** These texts have 65%+ of their sentences being advanced complex sentences.\n";
|
|
1393
|
+
|
|
1394
|
+
// src/prompts/sentence-structure/complexity.ts
|
|
1395
|
+
/**
 * Returns the system prompt used for the complexity-classification stage.
 *
 * @returns The bundled `complexity_system_default` prompt text, unmodified
 */
function getSystemPromptComplexity() {
  const systemPrompt = complexity_system_default;
  return systemPrompt;
}
|
|
1398
|
+
/**
 * Selects the rubric text for a grade.
 *
 * Only the exact strings "3" and "4" have dedicated rubrics; any other value
 * (including numbers such as 3) receives the shared grades 5-12 rubric.
 *
 * @param grade - Grade identifier, compared with strict equality
 * @returns The rubric text to insert into the complexity prompt
 */
function getRubricForGrade(grade) {
  switch (grade) {
    case "3":
      return rubric_grade_3_default;
    case "4":
      return rubric_grade_4_default;
    default:
      return rubric_grades_5_12_default;
  }
}
|
|
1407
|
+
/**
 * Builds the user prompt for the complexity-classification stage by filling the
 * placeholders of the bundled `complexity_user_default` template.
 *
 * The substitution uses one global pass with a function replacer because:
 *  - the template contains `{grade}` more than once (task sentence and rubric
 *    heading) and a string-pattern `replace` only substitutes the first one;
 *  - `$`-sequences in the excerpt (`$$`, `$&`, ...) must be kept literally and
 *    not interpreted as replacement patterns;
 *  - placeholder-looking text inside an inserted value must not be substituted
 *    again by a later step.
 *
 * @param sentenceFeatures - Serialized sentence statistics, inserted for `{sentence_features}`
 * @param grade - Target grade; inserted for `{grade}` and used to pick the rubric
 * @param excerpt - The text being evaluated, inserted for `{excerpt}`
 * @returns The filled-in prompt; `{format_instructions}` is replaced by an empty string
 */
function getUserPromptComplexity(sentenceFeatures, grade, excerpt) {
  const values = {
    sentence_features: sentenceFeatures,
    grade,
    rubric: getRubricForGrade(grade),
    excerpt,
    format_instructions: ""
  };
  return complexity_user_default.replace(
    /\{(sentence_features|grade|rubric|excerpt|format_instructions)\}/g,
    (_placeholder, key) => values[key]
  );
}
|
|
1411
|
+
|
|
1412
|
+
// src/evaluators/sentence-structure.ts
|
|
1413
|
+
/**
 * Maps a free-form complexity label onto one of the canonical labels
 * ("Slightly complex", "Moderately complex", "Very complex", "Exceedingly complex").
 *
 * Matching is case-insensitive, ignores surrounding whitespace and treats any
 * run of whitespace, underscores or hyphens as a single space, so
 * "VERY_COMPLEX" and "very-complex" both resolve to "Very complex".
 * "Extremely complex" is accepted as an alias of "Exceedingly complex".
 *
 * @param label - The label to normalize (typically the model's answer)
 * @returns The canonical label, or `null` when the input is not a string or is not recognized
 */
function normalizeLabel(label) {
  if (typeof label !== "string") {
    return null;
  }
  const normalized = label.trim().toLowerCase().replace(/[\s_-]+/g, " ");
  // A Map is used instead of an object literal so that inherited property names
  // such as "constructor" cannot be mistaken for a known label.
  const canonicalLabels = new Map([
    ["slightly complex", "Slightly complex"],
    ["moderately complex", "Moderately complex"],
    ["very complex", "Very complex"],
    ["exceedingly complex", "Exceedingly complex"],
    ["extremely complex", "Exceedingly complex"]
  ]);
  return canonicalLabels.get(normalized) ?? null;
}
|
|
1427
|
+
/**
 * Two-stage, LLM-backed evaluator for sentence-structure complexity.
 *
 * Stage 1 requests a structured sentence analysis of the text from the analysis
 * provider. Stage 2 passes features derived from that analysis, the rubric for
 * the grade and the text itself to the complexity provider, which answers with
 * a complexity label and its reasoning.
 */
var SentenceStructureEvaluator = class _SentenceStructureEvaluator extends BaseEvaluator {
  static metadata = {
    id: "sentence-structure",
    name: "Sentence Structure",
    description: "Evaluates sentence structure complexity based on grammatical features",
    supportedGrades: ["3", "4", "5", "6", "7", "8", "9", "10", "11", "12"],
    requiresGoogleKey: false,
    requiresOpenAIKey: true
  };
  analysisProvider;
  complexityProvider;
  /**
   * Creates the evaluator and one provider per stage.
   *
   * @param config - Evaluator configuration; `openaiApiKey` is forwarded to both providers
   */
  constructor(config) {
    super(config);
    // maxRetries is read from this.config rather than from the raw argument -
    // presumably the base class normalizes it; confirm against BaseEvaluator.
    this.analysisProvider = createProvider({
      type: "openai",
      model: "gpt-4o",
      apiKey: config.openaiApiKey,
      maxRetries: this.config.maxRetries
    });
    this.complexityProvider = createProvider({
      type: "openai",
      model: "gpt-4o",
      apiKey: config.openaiApiKey,
      maxRetries: this.config.maxRetries
    });
  }
  /**
   * Evaluate sentence structure complexity for a given text and grade level
   *
   * @param text - The text to evaluate
   * @param grade - The target grade level (3-12)
   * @returns Evaluation result with complexity score and detailed analysis
   * @throws {ValidationError} If text is empty, too short/long, or grade is invalid
   * @throws {APIError} If LLM API calls fail (includes AuthenticationError, RateLimitError, NetworkError, TimeoutError)
   */
  async evaluate(text, grade) {
    const providerLabel = "openai:gpt-4o";
    // Optional chaining keeps a null/undefined `text` from raising a TypeError in
    // the log call below, before validateText() inside the try block has had the
    // chance to reject it.
    const textLength = text?.length;
    this.logger.info("Starting sentence structure evaluation", {
      evaluator: "sentence-structure",
      operation: "evaluate",
      grade,
      textLength
    });
    const startTime = Date.now();
    const stageDetails = [];
    // Totals the token usage of every stage recorded so far; shared by the
    // success and error paths.
    const sumTokenUsage = () => ({
      input_tokens: stageDetails.reduce((sum, s) => sum + (s.token_usage?.input_tokens || 0), 0),
      output_tokens: stageDetails.reduce((sum, s) => sum + (s.token_usage?.output_tokens || 0), 0)
    });
    try {
      this.validateText(text);
      this.validateGrade(grade, new Set(_SentenceStructureEvaluator.metadata.supportedGrades));
      this.logger.debug("Stage 1: Analyzing sentence structure", {
        evaluator: "sentence-structure",
        operation: "sentence_analysis"
      });
      const analysisResponse = await this.analyzeSentenceStructure(text);
      stageDetails.push({
        stage: "sentence_analysis",
        provider: providerLabel,
        latency_ms: analysisResponse.latencyMs,
        token_usage: {
          input_tokens: analysisResponse.usage.inputTokens,
          output_tokens: analysisResponse.usage.outputTokens
        }
      });
      const features = addEngineeredFeatures(analysisResponse.data);
      this.logger.debug("Stage 2: Classifying complexity", {
        evaluator: "sentence-structure",
        operation: "complexity_classification"
      });
      const complexityResponse = await this.classifyComplexity(features, grade, text);
      stageDetails.push({
        stage: "complexity_classification",
        provider: providerLabel,
        latency_ms: complexityResponse.latencyMs,
        token_usage: {
          input_tokens: complexityResponse.usage.inputTokens,
          output_tokens: complexityResponse.usage.outputTokens
        }
      });
      const latencyMs = Date.now() - startTime;
      const totalTokenUsage = sumTokenUsage();
      const result = {
        score: complexityResponse.data.answer,
        reasoning: complexityResponse.data.reasoning,
        metadata: {
          model: providerLabel,
          processingTimeMs: latencyMs
        },
        _internal: {
          sentenceAnalysis: analysisResponse.data,
          features,
          complexity: complexityResponse.data
        }
      };
      // Telemetry is best-effort: a rejected telemetry promise must not fail the evaluation.
      this.sendTelemetry({
        status: "success",
        latencyMs,
        textLength,
        grade,
        provider: providerLabel,
        tokenUsage: totalTokenUsage,
        metadata: {
          stage_details: stageDetails
        },
        inputText: text
      }).catch(() => {
      });
      this.logger.info("Sentence structure evaluation completed successfully", {
        evaluator: "sentence-structure",
        operation: "evaluate",
        grade,
        score: result.score,
        processingTimeMs: latencyMs
      });
      return result;
    } catch (error) {
      const latencyMs = Date.now() - startTime;
      this.logger.error("Sentence structure evaluation failed", {
        evaluator: "sentence-structure",
        operation: "evaluate",
        grade,
        error: error instanceof Error ? error : void 0,
        processingTimeMs: latencyMs,
        completedStages: stageDetails.length
      });
      // Usage and stage details are only reported when at least one stage finished.
      const hasCompletedStages = stageDetails.length > 0;
      this.sendTelemetry({
        status: "error",
        latencyMs,
        textLength,
        grade,
        provider: providerLabel,
        tokenUsage: hasCompletedStages ? sumTokenUsage() : void 0,
        errorCode: error instanceof Error ? error.name : "UnknownError",
        metadata: hasCompletedStages ? { stage_details: stageDetails } : void 0,
        inputText: text
      }).catch(() => {
      });
      // Validation errors are surfaced unchanged; everything else is wrapped.
      if (error instanceof ValidationError) {
        throw error;
      }
      throw wrapProviderError(error, "Sentence structure evaluation failed");
    }
  }
  /**
   * Stage 1: Analyze sentence grammatical structure
   *
   * Computes readability metrics locally, passes them to the model as
   * "ground truth" counts alongside the text, and requests output matching
   * SentenceAnalysisSchema at temperature 0.
   *
   * @param text - The text to analyze
   * @returns The structured analysis (`data`) plus the provider's `usage` and `latencyMs`
   */
  async analyzeSentenceStructure(text) {
    const metrics = calculateReadabilityMetrics(text);
    const gtCountsStr = [
      `num_sentences: ${metrics.sentenceCount}`,
      `num_words: ${metrics.wordCount}`,
      `num_char: ${metrics.characterCount}`,
      `num_syllable: ${metrics.syllableCount}`,
      `flesch_kincaid_grade: ${metrics.fleschKincaidGrade}`
    ].join("\n");
    const userPrompt = getUserPromptAnalysis(text, gtCountsStr);
    const response = await this.analysisProvider.generateStructured({
      messages: [
        { role: "system", content: getSystemPromptAnalysis() },
        { role: "user", content: userPrompt }
      ],
      schema: SentenceAnalysisSchema,
      temperature: 0
    });
    return {
      data: response.data,
      usage: response.usage,
      latencyMs: response.latencyMs
    };
  }
  /**
   * Stage 2: Classify sentence structure complexity
   *
   * Serializes the features, builds the grade-specific prompt and requests
   * output matching ComplexityClassificationSchema at temperature 0. The
   * returned answer is normalized to a canonical label.
   *
   * @param features - Features derived from the stage 1 analysis
   * @param grade - The target grade level, used to select the rubric
   * @param excerpt - The text being evaluated
   * @returns The classification (`data`, with a normalized `answer`) plus `usage` and `latencyMs`
   * @throws {Error} If the model's answer cannot be normalized to a known label
   */
  async classifyComplexity(features, grade, excerpt) {
    // The extra arguments (1, true) are passed through unchanged; their meaning is
    // defined by featuresToJSON, which is declared elsewhere in this file.
    const featuresJSON = featuresToJSON(features, 1, true);
    const userPrompt = getUserPromptComplexity(featuresJSON, grade, excerpt);
    const response = await this.complexityProvider.generateStructured({
      messages: [
        { role: "system", content: getSystemPromptComplexity() },
        { role: "user", content: userPrompt }
      ],
      schema: ComplexityClassificationSchema,
      temperature: 0
    });
    const normalizedAnswer = normalizeLabel(response.data.answer);
    if (!normalizedAnswer) {
      throw new Error(
        `Failed to normalize complexity label. Received unexpected value: "${response.data.answer}". Expected one of: Slightly Complex, Moderately Complex, Very Complex, Exceedingly Complex, Extremely Complex.`
      );
    }
    return {
      data: {
        ...response.data,
        answer: normalizedAnswer
      },
      usage: response.usage,
      latencyMs: response.latencyMs
    };
  }
};
|
|
1635
|
+
/**
 * Convenience wrapper: builds a SentenceStructureEvaluator from `config` and
 * runs a single evaluation with it.
 *
 * @param text - The text to evaluate
 * @param grade - The target grade level
 * @param config - Configuration passed to the SentenceStructureEvaluator constructor
 * @returns Whatever the evaluator's `evaluate(text, grade)` resolves to
 */
async function evaluateSentenceStructure(text, grade, config) {
  return new SentenceStructureEvaluator(config).evaluate(text, grade);
}
|
|
1639
|
+
|
|
1640
|
+
// ../../evals/prompts/grade-level-appropriateness/system.txt
|
|
1641
|
+
// System prompt for the grade-level-appropriateness evaluator; returned as-is by getSystemPrompt2().
var system_default = "\nYou are an expert in English literature education for K-12.\nYour job is to help evaluate the grade level appropriateness of a given text.\n\nYou will be given a text and you should determine which grade level the text is appropriate for (grade levels include: K-1, 2-3, 4-5, 6-8, 9-10, 11-CCR)\n\nIMPORTANT: You should pay attention to the vocabulary used, topics of the text and readability of text.\n\nPlease first reason out loud about the vocabulary complexity of the text and then provide an answer between grade level options: K-1, 2-3, 4-5, 6-8, 9-10, 11-CCR.\n\n";
|
|
1642
|
+
|
|
1643
|
+
// ../../evals/prompts/grade-level-appropriateness/user.txt
|
|
1644
|
+
// User-prompt template for the grade-level-appropriateness evaluator. getUserPrompt2()
// inserts the evaluated text for {text} and replaces {format_instructions} with an empty string.
// NOTE(review): the word-count guideline lists "11-12" and has no K-1 row, while the grade
// bands offered as answers are K-1 ... 11-CCR - confirm this is intended before editing;
// the string is sent to the model verbatim.
var user_default = '\nUse these steps to determine appropriate grade level for a text:\n1. Calculate word count and Flesch-Kincaid Grade Level of the text, and generate a grade band.\nHere are the bands guideline for word count\n\n2-3: 200-800 words\n4-5: 200-800 words\n6-8: 400-1000 words\n9-10: 500-1500 words\n11-12: 1501 words and more\n\nHere is the formula for Flesch-Kincaid Grade Level:\nFlesch-Kincaid Grade Level = 0.39 * (total words / total sentences) + 11.8 * (total syllables / total words) - 15.59\n\n\n2. Determine the qualitative complexity using this text complexity rubric:\nTEXT STRUCTURE\n\nExceedingly Complex\n \u2022 Deep, intricate, often ambiguous connections between many ideas/processes/events\n \u2022 Organization is intricate or discipline-specific\n \u2022 Text features are essential for understanding\n \u2022 Graphics are intricate, extensive, and integral to meaning; may convey unique information\n\nVery Complex\n \u2022 Expanded ideas/processes/events with implicit or subtle connections\n \u2022 Organization may have multiple pathways or discipline-specific traits\n \u2022 Text features directly enhance understanding\n \u2022 Graphics support or are integral to understanding\n\nModerately Complex\n \u2022 Some implicit/subtle connections between ideas/events\n \u2022 Organization is evident and generally sequential or chronological\n \u2022 Text features enhance understanding\n \u2022 Graphics are mostly supplementary\n\nSlightly Complex\n \u2022 Explicit and clear connections between ideas/events\n \u2022 Organization is chronological, sequential, or predictable\n \u2022 Text features help navigation but are not essential\n \u2022 Graphics are simple, not necessary, but may assist understanding\n\n\u2E3B\n\nLANGUAGE FEATURES\n\nExceedingly Complex\n \u2022 Dense, abstract, ironic, and/or figurative language\n \u2022 Complex, unfamiliar, archaic, subject-specific, or ambiguous vocabulary\n \u2022 Mainly complex sentences with multiple 
subordinate clauses and transitions\n\nVery Complex\n \u2022 Fairly complex; some abstract, ironic, and/or figurative language\n \u2022 Some unfamiliar, archaic, or overly academic vocabulary\n \u2022 Many complex sentences with subordinate phrases/clauses\n\nModerately Complex\n \u2022 Mostly explicit language with some complex meaning\n \u2022 Mostly familiar and conversational vocabulary\n \u2022 Primarily simple and compound sentences, with some complex ones\n\nSlightly Complex\n \u2022 Explicit, literal, straightforward language\n \u2022 Contemporary, familiar, conversational vocabulary\n \u2022 Mainly simple sentences\n\n\u2E3B\n\nPURPOSE\n\nExceedingly Complex\n \u2022 Subtle, intricate, and difficult to determine\n \u2022 Includes many theoretical or abstract elements\n\nVery Complex\n \u2022 Implicit or subtle, fairly easy to infer\n \u2022 More theoretical or abstract than concrete\n\nModerately Complex\n \u2022 Implied but easy to identify based on context or source\n\nSlightly Complex\n \u2022 Explicitly stated, clear, concrete, and narrowly focused\n\n\u2E3B\n\nKNOWLEDGE DEMANDS\n\nExceedingly Complex\n \u2022 Requires extensive discipline-specific or theoretical knowledge\n \u2022 Many references/allusions to other texts or ideas\n\nVery Complex\n \u2022 Requires moderate discipline-specific knowledge\n \u2022 Some references/allusions to other texts or ideas\n\nModerately Complex\n \u2022 Requires common knowledge and some discipline-specific knowledge\n \u2022 Few references/allusions\n\nSlightly Complex\n \u2022 Requires everyday, practical knowledge\n \u2022 No references/allusions\n\n3. Background knowledge:\nAt which grade level would student have enough background knowledge to understand the text?\n\n4. Use your judgement of the above three steps. 
First use the quantitative signal to get first signal of the appropriate grade level range, then use qualitative analysis to refine your decisions and consider if student at such grade will have enough background knowledge to arrive at a final grade level band. Also consider if the text can be for a lower grade with additional scaffolding.\n\n<begin of text to evaluate>\n<text>{text}</text>\n<end of text to evaluate>\n\nWhen providing your response, first think out loud of your reasoning and then provide your answer from one of the grade band options above. Your reasoning and answer needs to be in JSON format. Strictly follow the following format for your response.\n\nYour final answer should be in the "grade" property for the target grade band for the text aimed for independent reading. If there is alternative appropriate grade students can read and comprehend with scaffold (eg. picture, graph, additional context, etc) or for read-aloud purposes for lower grade, provide it in the "alternative_grade" property and provide the types of scaffolding in the "scaffolding_needed" property.\n\nIn your reasoning, provide numbered bullet points for each of the analyses in each of the 3 steps. At the end, give me the 4th bullet point called "synthesis" to summarize your analysis from the above 3 steps that help you arrive at the final decision.\n\n{format_instructions}\n';
|
|
1645
|
+
|
|
1646
|
+
// src/prompts/grade-level-appropriateness/index.ts
|
|
1647
|
+
/**
 * Returns the system prompt used by the grade-level-appropriateness evaluator.
 *
 * @returns The bundled `system_default` prompt text, unmodified
 */
function getSystemPrompt2() {
  const systemPrompt = system_default;
  return systemPrompt;
}
|
|
1650
|
+
/**
 * Builds the user prompt for the grade-level-appropriateness evaluator by
 * filling the placeholders of the bundled `user_default` template.
 *
 * The substitution is a single pass with a function replacer so that:
 *  - `$`-sequences in the text (`$$`, `$&`, ...) are kept literally instead of
 *    being interpreted as replacement patterns, and
 *  - a text that itself contains "{format_instructions}" is left intact; only
 *    the template's own placeholder is blanked.
 *
 * @param text - The text to evaluate; inserted for `{text}`
 * @returns The filled-in prompt; `{format_instructions}` is replaced by an empty string
 */
function getUserPrompt2(text) {
  const values = {
    text,
    format_instructions: ""
  };
  return user_default.replace(
    /\{(text|format_instructions)\}/g,
    (_placeholder, key) => values[key]
  );
}
|
|
1653
|
+
|
|
1654
|
+
// src/evaluators/grade-level-appropriateness.ts
|
|
1655
|
+
/**
 * Single-stage, LLM-backed evaluator that determines which grade band a text
 * is appropriate for. It takes no grade parameter; the grade is the result.
 */
var GradeLevelAppropriatenessEvaluator = class extends BaseEvaluator {
  static metadata = {
    id: "grade-level-appropriateness",
    name: "Grade Level Appropriateness",
    description: "Determines appropriate grade level for text with scaffolding recommendations",
    supportedGrades: [],
    // No grade parameter required - evaluates what grade the text is appropriate for
    requiresGoogleKey: true,
    requiresOpenAIKey: false
  };
  provider;
  /**
   * Creates the evaluator and its provider.
   *
   * @param config - Evaluator configuration; `googleApiKey` is forwarded to the provider
   */
  constructor(config) {
    super(config);
    // maxRetries is read from this.config rather than from the raw argument -
    // presumably the base class normalizes it; confirm against BaseEvaluator.
    this.provider = createProvider({
      type: "google",
      model: "gemini-2.5-pro",
      apiKey: config.googleApiKey,
      maxRetries: this.config.maxRetries
    });
  }
  /**
   * Evaluate grade level appropriateness for a given text
   *
   * @param text - The text to evaluate
   * @returns Evaluation result with grade recommendations and scaffolding suggestions
   * @throws {ValidationError} If text is empty or too short/long
   * @throws {APIError} If LLM API calls fail (includes AuthenticationError, RateLimitError, NetworkError, TimeoutError)
   */
  async evaluate(text) {
    const providerLabel = "google:gemini-2.5-pro";
    // Optional chaining keeps a null/undefined `text` from raising a TypeError in
    // the log call below, before validateText() inside the try block has had the
    // chance to reject it.
    const textLength = text?.length;
    this.logger.info("Starting grade level appropriateness evaluation", {
      evaluator: "grade-level-appropriateness",
      operation: "evaluate",
      textLength
    });
    const startTime = Date.now();
    try {
      this.validateText(text);
      this.logger.debug("Evaluating grade level appropriateness", {
        evaluator: "grade-level-appropriateness",
        operation: "grade_evaluation"
      });
      const response = await this.provider.generateStructured({
        messages: [
          { role: "system", content: getSystemPrompt2() },
          { role: "user", content: getUserPrompt2(text) }
        ],
        schema: GradeLevelAppropriatenessSchema,
        temperature: 0.25
      });
      const latencyMs = Date.now() - startTime;
      const tokenUsage = {
        input_tokens: response.usage.inputTokens,
        output_tokens: response.usage.outputTokens
      };
      const result = {
        score: response.data.grade,
        reasoning: response.data.reasoning,
        metadata: {
          model: providerLabel,
          processingTimeMs: latencyMs
        },
        _internal: response.data
      };
      // Telemetry is best-effort: a rejected telemetry promise must not fail the evaluation.
      this.sendTelemetry({
        status: "success",
        latencyMs,
        textLength,
        provider: providerLabel,
        tokenUsage,
        // No metadata.stage_details for single-stage evaluator
        inputText: text
      }).catch(() => {
      });
      this.logger.info("Grade level appropriateness evaluation completed successfully", {
        evaluator: "grade-level-appropriateness",
        operation: "evaluate",
        grade: result.score,
        processingTimeMs: latencyMs
      });
      return result;
    } catch (error) {
      const latencyMs = Date.now() - startTime;
      this.logger.error("Grade level appropriateness evaluation failed", {
        evaluator: "grade-level-appropriateness",
        operation: "evaluate",
        error: error instanceof Error ? error : void 0,
        processingTimeMs: latencyMs
      });
      this.sendTelemetry({
        status: "error",
        latencyMs,
        textLength,
        provider: providerLabel,
        errorCode: error instanceof Error ? error.name : "UnknownError",
        inputText: text
      }).catch(() => {
      });
      // Validation errors are surfaced unchanged; everything else is wrapped.
      if (error instanceof ValidationError) {
        throw error;
      }
      throw wrapProviderError(error, "Grade level appropriateness evaluation failed");
    }
  }
};
|
|
1760
|
+
/**
 * Convenience wrapper: builds a GradeLevelAppropriatenessEvaluator from
 * `config` and runs a single evaluation with it.
 *
 * @param text - The text to evaluate
 * @param config - Configuration passed to the GradeLevelAppropriatenessEvaluator constructor
 * @returns Whatever the evaluator's `evaluate(text)` resolves to
 */
async function evaluateGradeLevelAppropriateness(text, config) {
  return new GradeLevelAppropriatenessEvaluator(config).evaluate(text);
}
|
|
1764
|
+
/**
 * Composite evaluator that combines the vocabulary and sentence-structure
 * evaluators into a single "text complexity" evaluation.
 *
 * Both sub-evaluators are constructed from the same `config` object that is
 * passed to the base class. Their individual failures are captured rather
 * than propagated, so one failing sub-evaluator does not discard the other's
 * result.
 */
var TextComplexityEvaluator = class _TextComplexityEvaluator extends BaseEvaluator {
  /**
   * Static descriptor for this evaluator. `supportedGrades` is the set that
   * `evaluate` validates the `grade` argument against.
   * NOTE(review): both key flags are true, presumably because the two
   * sub-evaluators use different providers (telemetry reports
   * "composite:google+openai") - confirm against the sub-evaluators.
   */
  static metadata = {
    id: "text-complexity",
    name: "Text Complexity",
    description: "Composite evaluator analyzing vocabulary and sentence structure complexity",
    supportedGrades: ["3", "4", "5", "6", "7", "8", "9", "10", "11", "12"],
    requiresGoogleKey: true,
    requiresOpenAIKey: true
  };
  vocabularyEvaluator;
  sentenceStructureEvaluator;
  limit;
  /**
   * @param config - Evaluator configuration; forwarded unchanged to the base
   *   class and to both sub-evaluators.
   */
  constructor(config) {
    super(config);
    this.vocabularyEvaluator = new VocabularyEvaluator(config);
    this.sentenceStructureEvaluator = new SentenceStructureEvaluator(config);
    // Per-instance limiter shared by every evaluate() call on this instance.
    this.limit = pLimit(3);
  }
  /**
   * Evaluate text complexity for a given text and grade level
   *
   * Runs vocabulary and sentence structure evaluations in parallel with concurrency control.
   * If both sub-evaluators fail, throws an error. Otherwise returns a result map where
   * failed sub-evaluators are represented as `{ error: Error }`.
   *
   * Telemetry is sent (fire-and-forget) only when at least one sub-evaluator
   * succeeded; on total failure the method throws before telemetry is sent.
   *
   * @param text - The text to evaluate
   * @param grade - The target grade level (3-12)
   * @returns Map of sub-evaluator results: `{ vocabulary, sentenceStructure }`
   * @throws {ValidationError} If text is empty or grade is invalid (raised by the
   *   inherited `validateText` / `validateGrade` helpers)
   * @throws {Error} If all sub-evaluators fail
   */
  async evaluate(text, grade) {
    this.logger.info("Starting text complexity evaluation", {
      evaluator: "text-complexity",
      operation: "evaluate",
      grade,
      // `text` has not been validated yet: guard the property access so a
      // null/undefined input reaches validateText() instead of throwing a
      // TypeError from this log statement.
      textLength: text?.length
    });
    this.validateText(text);
    this.validateGrade(grade, new Set(_TextComplexityEvaluator.metadata.supportedGrades));
    const startTime = Date.now();
    // runSubEvaluator never rejects, so Promise.all always waits for both.
    const [vocabResult, sentenceResult] = await Promise.all([
      this.limit(() => this.runSubEvaluator(this.vocabularyEvaluator, text, grade)),
      this.limit(() => this.runSubEvaluator(this.sentenceStructureEvaluator, text, grade))
    ]);
    const latencyMs = Date.now() - startTime;
    // A sub-evaluator failure is signalled by an `error` key on its result.
    const vocabFailed = "error" in vocabResult;
    const sentenceFailed = "error" in sentenceResult;
    const hasFailures = vocabFailed || sentenceFailed;
    if (hasFailures) {
      const errors = [];
      if (vocabFailed) errors.push(`Vocabulary: ${vocabResult.error.message}`);
      if (sentenceFailed) errors.push(`Sentence structure: ${sentenceResult.error.message}`);
      this.logger.error("Text complexity evaluation completed with errors", {
        evaluator: "text-complexity",
        operation: "evaluate",
        grade,
        errors,
        processingTimeMs: latencyMs
      });
      // Only a total failure is fatal; a partial failure falls through and
      // is returned to the caller inside the result map.
      if (vocabFailed && sentenceFailed) {
        throw new Error(`Text complexity evaluation failed: ${errors.join("; ")}`);
      }
    }
    // Telemetry is best-effort: it is not awaited and its rejection is ignored
    // so that it can never affect the evaluation result.
    this.sendTelemetry({
      status: hasFailures ? "error" : "success",
      latencyMs,
      textLength: text.length,
      grade,
      provider: "composite:google+openai",
      errorCode: hasFailures ? "PartialFailure" : void 0,
      inputText: text
    }).catch(() => {
    });
    this.logger.info("Text complexity evaluation completed", {
      evaluator: "text-complexity",
      operation: "evaluate",
      grade,
      processingTimeMs: latencyMs,
      hasFailures
    });
    return { vocabulary: vocabResult, sentenceStructure: sentenceResult };
  }
  /**
   * Run a sub-evaluator with error handling.
   * Returns the evaluation result or `{ error: Error }` if the evaluator throws.
   * Non-Error throwables are wrapped in an Error built from their string form.
   *
   * @param evaluator - Object exposing `evaluate(text, grade)`
   * @param text - The text to evaluate
   * @param grade - The target grade level
   */
  async runSubEvaluator(evaluator, text, grade) {
    try {
      return await evaluator.evaluate(text, grade);
    } catch (error) {
      return { error: error instanceof Error ? error : new Error(String(error)) };
    }
  }
};
|
|
1859
|
+
/**
 * Convenience wrapper: builds a one-off TextComplexityEvaluator from `config`
 * and runs a single evaluation of `text` at the given `grade`.
 *
 * Because this is an async function, an exception thrown while constructing
 * the evaluator surfaces as a rejected promise rather than a synchronous throw.
 *
 * @param text - The text to evaluate
 * @param grade - The target grade level, forwarded unchanged to the evaluator
 * @param config - Configuration forwarded unchanged to the evaluator constructor
 * @returns Promise settling with whatever `evaluator.evaluate(text, grade)` produces
 */
async function evaluateTextComplexity(text, grade, config) {
  return new TextComplexityEvaluator(config).evaluate(text, grade);
}
|
|
1863
|
+
|
|
1864
|
+
export { APIError, AuthenticationError, ComplexityClassificationSchema, ConfigurationError, EvaluatorError, GradeBand, GradeLevelAppropriatenessEvaluator, GradeLevelAppropriatenessSchema, LogLevel, NetworkError, RateLimitError, SentenceAnalysisSchema, SentenceStructureEvaluator, TextComplexityEvaluator, TextComplexityLevel, TimeoutError, ValidationError, VocabularyEvaluator, addEngineeredFeatures, calculateFleschKincaidGrade, calculateReadabilityMetrics, evaluateGradeLevelAppropriateness, evaluateSentenceStructure, evaluateTextComplexity, evaluateVocabulary, featuresToJSON };
|
|
1865
|
+
//# sourceMappingURL=index.js.map
|
|
1866
|
+
//# sourceMappingURL=index.js.map
|