@learning-commons/evaluators 0.3.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,4326 @@
1
+ #!/usr/bin/env node
2
+ import * as fs2 from 'fs';
3
+ import { readFileSync, mkdirSync, writeFileSync } from 'fs';
4
+ import * as path from 'path';
5
+ import { dirname, join } from 'path';
6
+ import { exec } from 'child_process';
7
+ import prompts from 'prompts';
8
+ import pLimit from 'p-limit';
9
+ import { randomUUID } from 'crypto';
10
+ import { homedir } from 'os';
11
+ import { fileURLToPath } from 'url';
12
+ import { generateText, Output } from 'ai';
13
+ import { z } from 'zod';
14
+ import nlp from 'compromise';
15
+ import { syllable } from 'syllable';
16
+ import textReadability from 'text-readability';
17
+ import { parse } from 'csv-parse/sync';
18
+
19
+ // src/telemetry/client.ts
20
/**
 * Small HTTP client that POSTs telemetry events to the configured endpoint.
 *
 * Delivery is best-effort: every failure is reported through the logger (or
 * ignored, for timeouts/aborts) and is never re-thrown to the caller.
 */
var TelemetryClient = class {
  config;
  logger;
  /**
   * @param config - Reads `enabled`, `endpoint`, `clientId`, optional
   *   `partnerKey`, and `logger` (must expose `warn`).
   */
  constructor(config) {
    this.config = config;
    this.logger = config.logger;
  }
  /**
   * Send telemetry event to analytics service
   *
   * Fire-and-forget: Errors are logged but don't throw.
   *
   * @param event - JSON-serializable payload used as the request body.
   */
  async send(event) {
    if (!this.config.enabled) return;
    try {
      const { clientId, partnerKey, endpoint } = this.config;
      const headers = {
        "Content-Type": "application/json",
        "X-Client-ID": clientId
      };
      // The partner key header is only attached when a key was configured.
      if (partnerKey) headers["X-API-Key"] = partnerKey;
      const response = await fetch(endpoint, {
        method: "POST",
        headers,
        body: JSON.stringify(event),
        // Give up after 5 seconds so a slow network never stalls the SDK.
        signal: AbortSignal.timeout(5e3)
      });
      if (response.ok) return;
      this.logger.warn(
        `[Telemetry] Failed to send event: ${response.status} ${response.statusText}`
      );
    } catch (error) {
      // Non-Error throwables and timeout/abort errors are dropped silently.
      const silent = !(error instanceof Error) || error.name === "TimeoutError" || error.name === "AbortError";
      if (!silent) {
        this.logger.warn(`[Telemetry] Error sending event: ${error.message}`);
      }
    }
  }
};
66
// File path of this module (derived from import.meta.url) and the directory
// that contains it; used below to locate package.json.
var __filename$1 = fileURLToPath(import.meta.url),
  __dirname$1 = dirname(__filename$1);
68
// Process-wide memo of the client ID so the config file is read at most once.
var cachedClientId;
/**
 * Return a stable, random client identifier used to tag telemetry events.
 *
 * Resolution order:
 *  1. the in-memory cache,
 *  2. `telemetry.clientId` from the JSON config file at getConfigFilePath(),
 *  3. a freshly generated UUID, which is then persisted to that file.
 *
 * Persisting is best-effort: if the directory or file cannot be written the
 * ID is still returned and cached for the lifetime of the process.
 *
 * When the config file already holds other valid JSON settings, they are
 * preserved: the new ID is merged into the existing object rather than
 * replacing the whole file.
 *
 * @returns The client ID string.
 */
function generateClientId() {
  if (cachedClientId) {
    return cachedClientId;
  }
  const configFile = getConfigFilePath();
  let existing;
  try {
    existing = JSON.parse(readFileSync(configFile, "utf-8"));
  } catch {
    // Missing, unreadable, or malformed config: fall through and create one.
  }
  if (existing?.telemetry?.clientId) {
    cachedClientId = existing.telemetry.clientId;
    return cachedClientId;
  }
  const clientId = randomUUID();
  const isPlainObject = (value) => typeof value === "object" && value !== null && !Array.isArray(value);
  // Only merge into content that is a JSON object; anything else is replaced.
  const base = isPlainObject(existing) ? existing : {};
  const telemetry = isPlainObject(base.telemetry) ? base.telemetry : {};
  const merged = { ...base, telemetry: { ...telemetry, clientId } };
  try {
    mkdirSync(dirname(configFile), { recursive: true });
    writeFileSync(configFile, JSON.stringify(merged, null, 2));
  } catch {
    // Best-effort persistence: a read-only home directory must not break the SDK.
  }
  cachedClientId = clientId;
  return cachedClientId;
}
91
/**
 * Build the path of the per-user `config.json`.
 *
 * On Windows the directory is `%APPDATA%/learning-commons` (falling back to
 * the home directory when APPDATA is unset); elsewhere it is
 * `~/.config/learning-commons`.
 *
 * @returns Path to the config file (the file itself may not exist).
 */
function getConfigFilePath() {
  const appFolder = "learning-commons";
  let configDir;
  if (process.platform === "win32") {
    configDir = join(process.env.APPDATA ?? homedir(), appFolder);
  } else {
    configDir = join(homedir(), ".config", appFolder);
  }
  return join(configDir, "config.json");
}
95
// Memoized SDK version string; resolved on first call to getSDKVersion().
var cachedVersion;
/**
 * Resolve the SDK version from the nearest readable package.json.
 *
 * Two locations relative to this module's directory are tried in order:
 * `../../package.json` (layout when running from src/) and
 * `../package.json` (layout when running from dist/). The first one that
 * can be read and parsed wins. Falls back to "0.0.0" when the `version`
 * field is empty or no candidate is usable.
 *
 * @returns The version string (cached after the first call).
 */
function getSDKVersion() {
  if (cachedVersion) return cachedVersion;
  const candidates = ["../../package.json", "../package.json"].map(
    (relative) => join(__dirname$1, relative)
  );
  for (const candidate of candidates) {
    try {
      const { version } = JSON.parse(readFileSync(candidate, "utf-8"));
      cachedVersion = version || "0.0.0";
      return cachedVersion;
    } catch {
      // Unreadable or invalid file: move on to the next candidate.
    }
  }
  cachedVersion = "0.0.0";
  return cachedVersion;
}
118
+
119
+ // src/errors.ts
120
/**
 * Root of the SDK error hierarchy. Carries a machine-readable `code`
 * alongside the human-readable message.
 */
var EvaluatorError = class extends Error {
  /**
   * @param message - Human-readable description.
   * @param code - Machine-readable error code (may be undefined).
   */
  constructor(message, code) {
    super(message);
    this.code = code;
    this.name = "EvaluatorError";
    // captureStackTrace is not available on every engine, hence the guard.
    if (Error.captureStackTrace) {
      Error.captureStackTrace(this, this.constructor);
    }
  }
};

/** Invalid or incomplete SDK configuration (code "CONFIGURATION_ERROR"). */
var ConfigurationError = class extends EvaluatorError {
  constructor(message) {
    super(message, "CONFIGURATION_ERROR");
    this.name = "ConfigurationError";
  }
};

/** Caller-supplied input failed validation (code "VALIDATION_ERROR"). */
var ValidationError = class extends EvaluatorError {
  constructor(message) {
    super(message, "VALIDATION_ERROR");
    this.name = "ValidationError";
  }
};

/**
 * A failed request to a provider API.
 * Adds the HTTP `statusCode` (when known) and a `retryable` hint.
 */
var APIError = class extends EvaluatorError {
  /**
   * @param message - Human-readable description.
   * @param statusCode - HTTP status, or undefined when not applicable.
   * @param retryable - Whether retrying the request may succeed.
   * @param code - Machine-readable error code.
   */
  constructor(message, statusCode, retryable = false, code) {
    super(message, code);
    this.statusCode = statusCode;
    this.retryable = retryable;
    this.name = "APIError";
  }
};

/** Credentials were rejected; never retryable (code "AUTHENTICATION_ERROR"). */
var AuthenticationError = class extends APIError {
  constructor(message, statusCode) {
    super(message, statusCode, false, "AUTHENTICATION_ERROR");
    this.name = "AuthenticationError";
  }
};

/**
 * HTTP 429 from the provider; retryable (code "RATE_LIMIT_ERROR").
 * `retryAfter` is stored exactly as passed by the caller.
 */
var RateLimitError = class extends APIError {
  constructor(message, retryAfter) {
    super(message, 429, true, "RATE_LIMIT_ERROR");
    this.retryAfter = retryAfter;
    this.name = "RateLimitError";
  }
};

/** Transport-level failure with no HTTP status (code "NETWORK_ERROR"). */
var NetworkError = class extends APIError {
  constructor(message, retryable = true) {
    super(message, void 0, retryable, "NETWORK_ERROR");
    this.name = "NetworkError";
  }
};

/** Request timed out; reported as status 408, retryable (code "TIMEOUT_ERROR"). */
var TimeoutError = class extends APIError {
  constructor(message = "Request timed out") {
    super(message, 408, true, "TIMEOUT_ERROR");
    this.name = "TimeoutError";
  }
};
175
/**
 * Extract a message, HTTP status and error code from an arbitrary thrown value.
 *
 * For Error instances the status is taken from `statusCode`, then `status`,
 * and finally from the first standalone 4xx/5xx number found in the message
 * text. The `code` is the error's `name` unless that is the generic "Error".
 * Any other thrown value is simply stringified into `message`.
 *
 * @param error - The caught value.
 * @returns `{ message, statusCode, code }` for Errors, `{ message }` otherwise.
 */
function parseProviderError(error) {
  if (!(error instanceof Error)) {
    return { message: String(error) };
  }
  const { message } = error;
  const embeddedStatus = message.match(/\b(4\d{2}|5\d{2})\b/);
  const explicitStatus = error.statusCode ?? error.status;
  // Scraping the message is a last resort when no status property exists.
  const statusCode = explicitStatus ?? (embeddedStatus ? parseInt(embeddedStatus[1]) : void 0);
  const code = error.name === "Error" ? void 0 : error.name;
  return { message, statusCode, code };
}
191
/**
 * Convert an arbitrary provider failure into one of the SDK error classes.
 *
 * Classification, in priority order:
 *  - 404, or 400 with a "model ... not found / does not exist / invalid"
 *    message: ConfigurationError
 *  - 401 / 403: AuthenticationError
 *  - 429: RateLimitError (a "retry-after N" hint in the message is read as
 *    seconds and converted to milliseconds)
 *  - message mentions a connection failure: NetworkError
 *  - message mentions a timeout: TimeoutError
 *  - anything else: APIError, retryable only for 5xx statuses
 *
 * @param error - The caught value.
 * @param defaultMessage - Used for the generic APIError when the parsed message is empty.
 * @returns An SDK error instance (it is returned, not thrown).
 */
function wrapProviderError(error, defaultMessage = "API request failed") {
  const { message, statusCode, code } = parseProviderError(error);
  const badModelPattern = /\bmodel\b.*(not found|does not exist|invalid)/i;
  if (statusCode === 404 || (statusCode === 400 && badModelPattern.test(message))) {
    return new ConfigurationError(
      `Model not found or invalid: ${message}. Check the model ID passed to the provider.`
    );
  }
  switch (statusCode) {
    case 401:
    case 403:
      return new AuthenticationError(
        message.includes("API key") ? message : "Invalid API key",
        statusCode
      );
    case 429: {
      const hint = message.match(/retry[- ]after[:\s]+(\d+)/i);
      const retryAfter = hint ? parseInt(hint[1]) * 1e3 : void 0;
      return new RateLimitError(
        message.includes("rate limit") ? message : "Rate limit exceeded",
        retryAfter
      );
    }
  }
  // Network markers are checked before timeouts, so ETIMEDOUT counts as a network error.
  const networkMarkers = ["ECONNREFUSED", "ENOTFOUND", "ETIMEDOUT", "network", "Network"];
  if (networkMarkers.some((marker) => message.includes(marker))) {
    return new NetworkError(message);
  }
  const timeoutMarkers = ["timeout", "timed out"];
  if (timeoutMarkers.some((marker) => message.includes(marker))) {
    return new TimeoutError(message);
  }
  // 5xx errors are retryable
  const retryable = statusCode ? statusCode >= 500 : false;
  return new APIError(message || defaultMessage, statusCode, retryable, code);
}
226
+
227
+ // src/logger.ts
228
/**
 * Logger that writes to the global console, filtered by a numeric level
 * (0 = DEBUG, 1 = INFO, 2 = WARN, 3 = ERROR). A message is emitted when the
 * configured level is less than or equal to the message's own level.
 */
var ConsoleLogger = class {
  /** @param level - Minimum level to emit; defaults to 2 (WARN). */
  constructor(level = 2 /* WARN */) {
    this.level = level;
  }
  /** Emit at DEBUG (0). A falsy `context` is printed as an empty string. */
  debug(message, context) {
    if (!(this.level <= 0 /* DEBUG */)) return;
    console.debug(`[DEBUG] ${message}`, context || "");
  }
  /** Emit at INFO (1). */
  info(message, context) {
    if (!(this.level <= 1 /* INFO */)) return;
    console.info(`[INFO] ${message}`, context || "");
  }
  /** Emit at WARN (2). */
  warn(message, context) {
    if (!(this.level <= 2 /* WARN */)) return;
    console.warn(`[WARN] ${message}`, context || "");
  }
  /** Emit at ERROR (3). */
  error(message, context) {
    if (!(this.level <= 3 /* ERROR */)) return;
    console.error(`[ERROR] ${message}`, context || "");
  }
};
253
/** Logger with the same four methods as ConsoleLogger that discards every message. */
var SilentLogger = class {
  /** No-op. */
  debug() {}
  /** No-op. */
  info() {}
  /** No-op. */
  warn() {}
  /** No-op. */
  error() {}
};
263
/**
 * Pick the logger implementation to use.
 *
 * @param customLogger - Returned unchanged when provided (truthy).
 * @param level - Numeric log level; 4 (SILENT) selects SilentLogger, anything
 *   else a ConsoleLogger at that level. Defaults to 2 (WARN).
 * @returns The logger instance.
 */
function createLogger(customLogger, level = 2 /* WARN */) {
  if (customLogger) return customLogger;
  return level === 4 /* SILENT */ ? new SilentLogger() : new ConsoleLogger(level);
}
272
/**
 * LLM provider backed by the Vercel AI SDK (`generateText` from the `ai` package).
 *
 * Supports the "openai", "anthropic" and "google" provider types. The matching
 * `@ai-sdk/*` adapter is loaded lazily on first use.
 */
var VercelAIProvider = class {
  /**
   * @param config - Reads `type`, `model`, and optionally `apiKey`,
   *   `maxRetries` and `temperature`.
   * @throws {Error} If `type` is "custom" or `model` is missing/blank.
   */
  constructor(config) {
    this.config = config;
    if (config.type === "custom") {
      throw new Error(
        "VercelAIProvider does not support custom type. Use config.customProvider directly."
      );
    }
    if (!config.model || config.model.trim() === "") {
      throw new Error(
        `model is required for VercelAIProvider (type: "${config.type}"). No default is assumed.`
      );
    }
    this.model = config.model;
    this.label = `${config.type}:${config.model}`;
  }
  /** Display label in the form "<type>:<model>". */
  label;
  /** Model ID passed to the provider adapter. */
  model;
  /**
   * Generate structured output using Vercel AI SDK's generateText with output
   *
   * @param request - `{ messages, schema, temperature?, maxTokens? }`.
   * @returns `{ data, model, usage: { inputTokens, outputTokens }, latencyMs }`.
   */
  async generateStructured(request) {
    const model = await this.getModel();
    const startTime = Date.now();
    // The rest of this class uses the AI SDK v5+ result shape
    // (usage.inputTokens/outputTokens). In that API the output-token cap is
    // named `maxOutputTokens`; an option called `maxTokens` is not recognized,
    // so the caller's limit must be forwarded under the new name.
    const tokenLimit = request.maxTokens !== void 0 ? { maxOutputTokens: request.maxTokens } : {};
    const { output, usage } = await generateText({
      model,
      messages: request.messages,
      output: Output.object({ schema: request.schema }),
      temperature: request.temperature ?? 0,
      maxRetries: this.config.maxRetries ?? 0,
      ...tokenLimit
    });
    return {
      data: output,
      model: this.model,
      usage: {
        inputTokens: usage.inputTokens || 0,
        outputTokens: usage.outputTokens || 0
      },
      latencyMs: Date.now() - startTime
    };
  }
  /**
   * Generate plain text using Vercel AI SDK's generateText
   *
   * @param messages - Chat messages forwarded to the SDK.
   * @param temperature - Overrides `config.temperature`; both default to 0.
   * @returns `{ text, usage: { inputTokens, outputTokens }, latencyMs }`.
   */
  async generateText(messages, temperature) {
    const model = await this.getModel();
    const startTime = Date.now();
    const { text, usage } = await generateText({
      model,
      messages,
      temperature: temperature ?? this.config.temperature ?? 0,
      maxRetries: this.config.maxRetries ?? 0
    });
    return {
      text,
      usage: {
        inputTokens: usage.inputTokens || 0,
        outputTokens: usage.outputTokens || 0
      },
      latencyMs: Date.now() - startTime
    };
  }
  /**
   * Get the configured language model.
   * Uses dynamic imports so consumers only need to install the provider packages they use.
   *
   * @throws {Error} If the adapter package cannot be imported or the
   *   provider type is not supported.
   */
  async getModel() {
    const apiKey = this.config.apiKey;
    // When no key is configured an empty options object is passed to the adapter factory.
    const adapterOptions = apiKey ? { apiKey } : {};
    switch (this.config.type) {
      case "openai": {
        const { createOpenAI } = await import('@ai-sdk/openai').catch(() => {
          throw new Error(
            "To use the OpenAI provider, install its adapter: npm install @ai-sdk/openai"
          );
        });
        return createOpenAI(adapterOptions)(this.model);
      }
      case "anthropic": {
        const { createAnthropic } = await import('@ai-sdk/anthropic').catch(() => {
          throw new Error(
            "To use the Anthropic provider, install its adapter: npm install @ai-sdk/anthropic"
          );
        });
        return createAnthropic(adapterOptions)(this.model);
      }
      case "google": {
        const { createGoogleGenerativeAI } = await import('@ai-sdk/google').catch(() => {
          throw new Error(
            "To use the Google provider, install its adapter: npm install @ai-sdk/google"
          );
        });
        return createGoogleGenerativeAI(adapterOptions)(this.model);
      }
      default:
        throw new Error(`Unsupported provider type: ${this.config.type}`);
    }
  }
};
371
/**
 * Build the LLM provider described by `config`.
 *
 * @param config - When `type` is "custom" and `customProvider` is set, that
 *   object is returned as-is; otherwise the config is handed to VercelAIProvider.
 * @returns The provider instance.
 */
function createProvider(config) {
  const hasCustomProvider = config.type === "custom" && config.customProvider;
  return hasCustomProvider ? config.customProvider : new VercelAIProvider(config);
}
377
+
378
+ // src/evaluators/base.ts
379
/** Length bounds, in characters, used when validating text input. */
var VALIDATION_LIMITS = {
  // Shortest accepted text.
  MIN_TEXT_LENGTH: 10,
  // Longest accepted text (100K chars ≈ 25K tokens).
  MAX_TEXT_LENGTH: 100000
};
385
/** String-valued enum of the LLM providers known to the SDK. */
var Provider = {
  OpenAI: "openai",
  Google: "google",
  Anthropic: "anthropic"
};
391
/**
 * Shared base class for evaluators.
 *
 * Handles configuration validation (model override, API keys), logger and
 * telemetry setup, common input validation, provider construction, and
 * telemetry reporting. Concrete evaluators extend it and define a static
 * `metadata` block.
 */
var BaseEvaluator = class {
  /** Telemetry client; only created when telemetry is enabled. */
  telemetryClient;
  /** Logger chosen by createLogger() from `config.logger` / `config.logLevel`. */
  logger;
  /** Normalized configuration (retries, telemetry, override, API keys). */
  config;
  /**
   * Static metadata for the evaluator
   *
   * Concrete evaluators MUST define this property.
   *
   * @example
   * ```typescript
   * class MyEvaluator extends BaseEvaluator {
   *   static readonly metadata = {
   *     id: 'my-evaluator',
   *     name: 'My Evaluator',
   *     description: 'Does something useful',
   *     supportedGrades: ['3', '4', '5'],
   *     defaultProviders: [Provider.Google],
   *   };
   * }
   * ```
   */
  static metadata;
  /**
   * @throws {ConfigurationError} If the subclass has not defined static metadata
   * @throws {ConfigurationError} If modelOverride has an invalid provider or empty model
   * @throws {ConfigurationError} If a required API key is missing
   */
  constructor(config) {
    this.logger = createLogger(config.logger, config.logLevel ?? 2 /* WARN */);
    this.validateModelOverride(config);
    this.validateApiKeys(config);
    const telemetryConfig = this.normalizeTelemetryConfig(config.telemetry);
    this.config = {
      maxRetries: config.maxRetries ?? 2,
      telemetry: telemetryConfig,
      modelOverride: config.modelOverride,
      googleApiKey: config.googleApiKey,
      openaiApiKey: config.openaiApiKey,
      anthropicApiKey: config.anthropicApiKey
    };
    if (config.modelOverride) {
      this.logger.warn(
        `modelOverride is active: using ${config.modelOverride.provider}:${config.modelOverride.model} instead of the default model. Evaluation quality may differ from recommended defaults.`
      );
    }
    if (this.config.telemetry.enabled) {
      this.telemetryClient = new TelemetryClient({
        endpoint: "https://api.learningcommons.org/evaluators-telemetry/v1/events",
        partnerKey: config.partnerKey,
        clientId: generateClientId(),
        enabled: true,
        logger: this.logger
      });
    }
  }
  /**
   * Get metadata for this evaluator instance
   * @throws {ConfigurationError} If the subclass has not defined static metadata
   */
  get metadata() {
    const meta = this.constructor.metadata;
    if (!meta) {
      throw new ConfigurationError(
        `${this.constructor.name} must define a static readonly metadata block.`
      );
    }
    return meta;
  }
  /**
   * Validate modelOverride shape: provider must be a known Provider value and
   * model must be a non-empty string.
   * @throws {ConfigurationError} If the override is malformed
   */
  validateModelOverride(config) {
    if (!config.modelOverride) return;
    const validProviders = Object.values(Provider);
    if (!validProviders.includes(config.modelOverride.provider)) {
      throw new ConfigurationError(
        `Invalid provider "${config.modelOverride.provider}" in modelOverride. Valid providers are: ${validProviders.join(", ")}.`
      );
    }
    if (!config.modelOverride.model || config.modelOverride.model.trim() === "") {
      throw new ConfigurationError(
        `modelOverride.model is required. Specify the model ID for provider "${config.modelOverride.provider}".`
      );
    }
  }
  /**
   * Validate that the required API key is present.
   * When modelOverride is set, checks the override provider's key.
   * Otherwise checks the keys required by the evaluator's default providers.
   * @throws {ConfigurationError} If a required key is missing
   */
  validateApiKeys(config) {
    // Whitespace-only keys count as missing.
    const keyFor = {
      ["openai" /* OpenAI */]: config.openaiApiKey?.trim() || void 0,
      ["google" /* Google */]: config.googleApiKey?.trim() || void 0,
      ["anthropic" /* Anthropic */]: config.anthropicApiKey?.trim() || void 0
    };
    const humanName = {
      ["openai" /* OpenAI */]: "OpenAI API key",
      ["google" /* Google */]: "Google API key",
      ["anthropic" /* Anthropic */]: "Anthropic API key"
    };
    const configKey = {
      ["openai" /* OpenAI */]: "openaiApiKey",
      ["google" /* Google */]: "googleApiKey",
      ["anthropic" /* Anthropic */]: "anthropicApiKey"
    };
    if (config.modelOverride) {
      if (!keyFor[config.modelOverride.provider]) {
        throw new ConfigurationError(
          `${humanName[config.modelOverride.provider]} is required when using modelOverride with provider "${config.modelOverride.provider}". Pass ${configKey[config.modelOverride.provider]} in config.`
        );
      }
      return;
    }
    for (const provider of this.metadata.defaultProviders) {
      if (!keyFor[provider]) {
        throw new ConfigurationError(
          `${humanName[provider]} is required for ${this.metadata.name} evaluator. Pass ${configKey[provider]} in config.`
        );
      }
    }
  }
  /**
   * Normalize telemetry config to standard format
   *
   * @param telemetry - `false` disables telemetry; `true`, `undefined` or
   *   `null` select the defaults (enabled, inputs not recorded); an object may
   *   set `enabled` and/or `recordInputs` individually.
   * @returns `{ enabled, recordInputs }`
   */
  normalizeTelemetryConfig(telemetry) {
    if (telemetry === false) {
      return {
        enabled: false,
        recordInputs: false
      };
    }
    // `== null` also covers an explicit null (e.g. from a JSON config), which
    // would otherwise crash on the property reads below.
    if (telemetry === true || telemetry == null) {
      return {
        enabled: true,
        recordInputs: false
      };
    }
    return {
      enabled: telemetry.enabled ?? true,
      recordInputs: telemetry.recordInputs ?? false
    };
  }
  /**
   * Get the evaluator type identifier from metadata
   * @returns The evaluator type ID (e.g., "vocabulary", "sentence-structure")
   */
  getEvaluatorType() {
    return this.metadata.id;
  }
  /**
   * Validate text meets requirements
   * Default implementation - can be overridden by concrete evaluators
   *
   * Length limits are applied to the trimmed text.
   *
   * @throws {ValidationError} If text is not a string, is empty/whitespace,
   *   or falls outside VALIDATION_LIMITS
   */
  validateText(text) {
    // Reject non-strings up front so callers get the documented
    // ValidationError rather than a TypeError from `text.length` below.
    if (typeof text !== "string") {
      throw new ValidationError(
        `Text must be a string, received ${text === null ? "null" : typeof text}`
      );
    }
    this.logger.debug("Validating text input", {
      evaluator: this.getEvaluatorType(),
      operation: "validateText",
      textLength: text.length
    });
    const trimmedText = text.trim();
    if (!trimmedText) {
      throw new ValidationError("Text cannot be empty or contain only whitespace");
    }
    if (trimmedText.length < VALIDATION_LIMITS.MIN_TEXT_LENGTH) {
      throw new ValidationError(
        `Text is too short. Minimum length is ${VALIDATION_LIMITS.MIN_TEXT_LENGTH} characters, received ${trimmedText.length} characters`
      );
    }
    if (trimmedText.length > VALIDATION_LIMITS.MAX_TEXT_LENGTH) {
      throw new ValidationError(
        `Text is too long. Maximum length is ${VALIDATION_LIMITS.MAX_TEXT_LENGTH.toLocaleString()} characters, received ${trimmedText.length.toLocaleString()} characters`
      );
    }
  }
  /**
   * Validate grade is in supported range
   * Default implementation - can be overridden by concrete evaluators
   *
   * @param grade - Grade level to validate
   * @param validGrades - Set of valid grades for this evaluator
   * @throws {ValidationError} If grade is invalid
   */
  validateGrade(grade, validGrades) {
    this.logger.debug("Validating grade input", {
      evaluator: this.getEvaluatorType(),
      operation: "validateGrade",
      grade
    });
    if (!validGrades.has(grade)) {
      // For the error message, list "K" first and the rest numerically.
      const validList = Array.from(validGrades).sort((a, b) => {
        if (a === "K") return -1;
        if (b === "K") return 1;
        return parseInt(a, 10) - parseInt(b, 10);
      }).join(", ");
      throw new ValidationError(
        `Invalid grade "${grade}". Supported grades for this evaluator: ${validList}`
      );
    }
  }
  /**
   * Create an LLM provider, honouring modelOverride if set.
   * When override is active, the key for the override provider is resolved
   * from the matching top-level config field (e.g. anthropicApiKey for Anthropic).
   *
   * @param defaultType - Provider type used when no override is set
   * @param defaultModel - Model ID used when no override is set
   * @param defaultApiKey - API key used when no override is set
   */
  createConfiguredProvider(defaultType, defaultModel, defaultApiKey) {
    const override = this.config.modelOverride;
    if (override) {
      const apiKeyFor = {
        ["openai" /* OpenAI */]: this.config.openaiApiKey,
        ["google" /* Google */]: this.config.googleApiKey,
        ["anthropic" /* Anthropic */]: this.config.anthropicApiKey
      };
      return createProvider({
        type: override.provider,
        model: override.model,
        apiKey: apiKeyFor[override.provider],
        maxRetries: this.config.maxRetries
      });
    }
    return createProvider({
      type: defaultType,
      model: defaultModel,
      apiKey: defaultApiKey,
      maxRetries: this.config.maxRetries
    });
  }
  /**
   * Send telemetry event to analytics service
   * Common helper for all evaluators
   *
   * Does nothing when telemetry is disabled (no client was created).
   */
  async sendTelemetry(params) {
    if (!this.telemetryClient) {
      return;
    }
    await this.telemetryClient.send({
      timestamp: (/* @__PURE__ */ new Date()).toISOString(),
      sdk_version: getSDKVersion(),
      evaluator_type: this.getEvaluatorType(),
      grade: params.grade,
      status: params.status,
      error_code: params.errorCode,
      latency_ms: params.latencyMs,
      text_length_chars: params.textLength,
      provider: params.provider,
      token_usage: params.tokenUsage,
      metadata: params.metadata,
      model_override: this.config.modelOverride ? true : void 0,
      // Include input text only if recording is enabled
      input_text: this.config.telemetry.recordInputs ? params.inputText : void 0
    });
  }
};
650
/** Zod enum of the four text-complexity ratings, listed from least to most complex. */
var TextComplexityLevel = z.enum(["Slightly complex", "Moderately complex", "Very complex", "Exceedingly complex"]);
656
+
657
+ // src/schemas/vocabulary.ts
658
/**
 * Zod schema for the structured result of the vocabulary complexity
 * evaluation: four word lists (each a single string), an overall rating, and
 * the reasoning behind that rating.
 */
var VocabularyComplexitySchema = z.object({
  // Word lists
  tier_2_words: z.string().describe("List of Tier 2 words (academic words)"),
  tier_3_words: z.string().describe("List of Tier 3 words (domain-specific)"),
  archaic_words: z.string().describe("List of Archaic words"),
  other_complex_words: z.string().describe("List of Other Complex words"),
  // Overall rating and its justification
  complexity_score: TextComplexityLevel.describe("The complexity of the text vocabulary"),
  reasoning: z.string().describe("Detailed reasoning for the complexity rating")
});
668
/**
 * Convenience wrapper that returns only the Flesch-Kincaid grade of `text`.
 *
 * @param text - Text to analyse.
 * @returns The `fleschKincaidGrade` field of calculateReadabilityMetrics(text).
 */
function calculateFleschKincaidGrade(text) {
  const { fleschKincaidGrade } = calculateReadabilityMetrics(text);
  return fleschKincaidGrade;
}
671
/**
 * Compute basic readability statistics for `text`.
 *
 * Sentences and terms are counted with the `nlp` parser, syllables with
 * `syllable`, and characters as all non-whitespace characters. The
 * Flesch-Kincaid grade is
 * `0.39 * words/sentence + 11.8 * syllables/word - 15.59`, floored at 0 and
 * rounded to two decimals.
 *
 * @param text - Text to analyse.
 * @returns Counts, per-sentence/per-word averages, and `fleschKincaidGrade`.
 */
function calculateReadabilityMetrics(text) {
  const parsed = nlp(text);
  const sentenceCount = parsed.sentences().length;
  const termList = parsed.terms();
  const wordCount = termList.length;
  const characterCount = text.replace(/\s/g, "").length;
  let syllableCount = 0;
  for (const term of termList.out("array")) {
    syllableCount += syllable(term);
  }
  // Averages are 0 when there is nothing to divide by.
  const avgWordsPerSentence = sentenceCount > 0 ? wordCount / sentenceCount : 0;
  const avgSyllablesPerWord = wordCount > 0 ? syllableCount / wordCount : 0;
  const rawGrade = 0.39 * avgWordsPerSentence + 11.8 * avgSyllablesPerWord - 15.59;
  const fleschKincaidGrade = Math.round(Math.max(0, rawGrade) * 100) / 100;
  return {
    sentenceCount,
    wordCount,
    characterCount,
    syllableCount,
    avgWordsPerSentence,
    avgSyllablesPerWord,
    fleschKincaidGrade
  };
}
692
+
693
+ // src/features/sentence-features.ts
694
/**
 * Divide two numbers, yielding 0 instead of Infinity/NaN when the
 * denominator is exactly 0.
 *
 * @param numerator - Dividend.
 * @param denominator - Divisor.
 * @returns `numerator / denominator`, or 0 for a zero denominator.
 */
function safeDivision(numerator, denominator) {
  if (denominator === 0) {
    return 0;
  }
  return numerator / denominator;
}
697
/**
 * Population standard deviation (divides by N, not N - 1).
 *
 * @param values - Array of numbers.
 * @returns The standard deviation, or 0 for arrays with fewer than two values.
 */
function standardDeviation(values) {
  const n = values.length;
  if (n <= 1) return 0;
  let total = 0;
  for (const value of values) total += value;
  const mean = total / n;
  let squaredTotal = 0;
  for (const value of values) squaredTotal += (value - mean) ** 2;
  return Math.sqrt(squaredTotal / n);
}
704
/**
 * Bucket sentences by word count and report each bucket as a percentage.
 *
 * Buckets: short (<= 10 words), medium (11-20), long (21-30), very long (> 30).
 *
 * @param wordCounts - Word count of each sentence.
 * @returns The four `percent_*_sentences` values; all 0 for a missing or empty list.
 */
function categorizeSentenceLengths(wordCounts) {
  // Indices: 0 = short, 1 = medium, 2 = long, 3 = very long.
  const tally = [0, 0, 0, 0];
  const toResult = (denominator) => {
    const pct = (bucket) => denominator === null ? 0 : tally[bucket] / denominator * 100;
    return {
      percent_short_sentences: pct(0),
      percent_medium_sentences: pct(1),
      percent_long_sentences: pct(2),
      percent_very_long_sentences: pct(3)
    };
  };
  if (!wordCounts || wordCounts.length === 0) {
    return toResult(null);
  }
  for (const words of wordCounts) {
    let bucket = 3;
    if (words <= 10) bucket = 0;
    else if (words <= 20) bucket = 1;
    else if (words <= 30) bucket = 2;
    tally[bucket]++;
  }
  return toResult(wordCounts.length);
}
728
/**
 * Extend a raw sentence analysis with derived ratio/percentage features.
 *
 * Every ratio goes through safeDivision, so a zero sentence or word count
 * yields 0 rather than Infinity/NaN. Percentages are on a 0-100 scale;
 * phrase "densities" are counts per 100 words.
 *
 * @param analysis - Raw counts (`num_sentences`, `num_words`,
 *   `sentence_word_counts`, and the various `num_*` / `words_in_*` fields).
 * @returns A new object: all original fields followed by the derived features.
 */
function addEngineeredFeatures(analysis) {
  const { num_sentences: sentenceTotal, num_words: wordTotal } = analysis;
  const perSentence = (count) => safeDivision(count, sentenceTotal);
  const pctOfSentences = (count) => safeDivision(count, sentenceTotal) * 100;
  const pctOfWords = (count) => safeDivision(count, wordTotal) * 100;
  const transitionTotal = analysis.num_simple_transitions + analysis.num_sophisticated_transitions;
  return {
    ...analysis,
    // Length and its distribution
    avg_words_per_sentence: safeDivision(wordTotal, sentenceTotal),
    sentence_length_variation: standardDeviation(analysis.sentence_word_counts),
    ...categorizeSentenceLengths(analysis.sentence_word_counts),
    // Share of sentences by grammatical type
    percent_simple_sentences: pctOfSentences(analysis.num_simple_sentences),
    percent_compound_sentences: pctOfSentences(analysis.num_compound_sentences),
    percent_complex_sentences: pctOfSentences(analysis.num_complex_sentences),
    percent_compound_complex_sentences: pctOfSentences(analysis.num_compound_complex_sentences),
    percent_other_sentences: pctOfSentences(analysis.num_other_sentences),
    // Share of words by the type of sentence they appear in
    percent_words_in_simple_sentences: pctOfWords(analysis.words_in_simple_sentences),
    percent_words_in_compound_sentences: pctOfWords(analysis.words_in_compound_sentences),
    percent_words_in_complex_sentences: pctOfWords(analysis.words_in_complex_sentences),
    percent_words_in_compound_complex_sentences: pctOfWords(analysis.words_in_compound_complex_sentences),
    percent_words_in_other_sentences: pctOfWords(analysis.words_in_other_sentences),
    // Clauses and subordination
    avg_subordinates_per_sentence: perSentence(analysis.num_subordinate_clauses),
    avg_clauses_per_sentence: perSentence(analysis.num_total_clauses),
    percent_sentences_with_subordinate: pctOfSentences(analysis.num_sentences_with_subordinate),
    percent_sentences_with_multiple_subordinates: pctOfSentences(analysis.num_sentences_with_multiple_subordinates),
    percent_sentences_with_embedded_clauses: pctOfSentences(analysis.num_sentences_with_embedded_clauses),
    // Phrase density (per 100 words)
    prep_phrase_density: pctOfWords(analysis.num_prepositional_phrases),
    participle_phrase_density: pctOfWords(analysis.num_participle_phrases),
    appositive_phrase_density: pctOfWords(analysis.num_appositive_phrases),
    // Transitions
    avg_transitions_per_sentence: perSentence(transitionTotal),
    percent_sophisticated_transitions: safeDivision(analysis.num_sophisticated_transitions, transitionTotal) * 100,
    // Concepts and cleft constructions
    percent_sentences_w_one_concept: pctOfSentences(analysis.num_one_concept_sentences),
    percent_sentences_w_multi_concept: pctOfSentences(analysis.num_multi_concept_sentences),
    percent_cleft_sentences: pctOfSentences(analysis.num_cleft_sentences)
  };
}
788
/**
 * Ordered list of feature names serialized by featuresToJSON().
 * Written as one sub-array per feature family and flattened into a single
 * array of strings.
 */
var FEATURE_COLS = [
  // Foundational & Distributional
  [
    "avg_words_per_sentence",
    "sentence_length_variation",
    "percent_short_sentences",
    "percent_medium_sentences",
    "percent_long_sentences",
    "percent_very_long_sentences",
    "flesch_kincaid_grade"
  ],
  // Sentence Structure (Grammatical Type)
  [
    "percent_simple_sentences",
    "percent_compound_sentences",
    "percent_complex_sentences",
    "percent_compound_complex_sentences",
    "percent_other_sentences"
  ],
  // Word Distribution
  [
    "percent_words_in_simple_sentences",
    "percent_words_in_complex_sentences",
    "percent_words_in_compound_sentences",
    "percent_words_in_compound_complex_sentences",
    "percent_words_in_other_sentences"
  ],
  // Clausal & Subordination
  [
    "avg_subordinates_per_sentence",
    "avg_clauses_per_sentence",
    "percent_sentences_with_subordinate",
    "percent_sentences_with_multiple_subordinates",
    "percent_sentences_with_embedded_clauses"
  ],
  // Phrase Density
  ["prep_phrase_density", "participle_phrase_density", "appositive_phrase_density"],
  // Cohesion & Transitions
  ["avg_transitions_per_sentence", "percent_sophisticated_transitions"],
  // Conceptual & Other
  [
    "percent_sentences_w_one_concept",
    "percent_sentences_w_multi_concept",
    "percent_cleft_sentences",
    "max_clauses_in_any_sentence"
  ],
  // Grades 5-12
  [
    "num_sentences",
    "num_simple_sentences",
    "num_compound",
    "num_basic_complex",
    "num_advanced_complex",
    "percentage_simple",
    "percentage_compound",
    "percentage_basic_complex",
    "percentage_advanced_complex"
  ]
].flat();
838
/**
 * Serialize the FEATURE_COLS subset of `features` as pretty-printed JSON.
 *
 * Numeric values are rounded to `decimals` places and then, when `castToInt`
 * is true, rounded again to the nearest integer. Features that are missing or
 * not numbers are written as null.
 *
 * @param features - Object holding feature values keyed by column name.
 * @param decimals - Decimal places for the first rounding step (default 1).
 * @param castToInt - Whether to round the result to an integer (default true).
 * @returns JSON string with 2-space indentation, keys in FEATURE_COLS order.
 */
function featuresToJSON(features, decimals = 1, castToInt = true) {
  const entries = FEATURE_COLS.map((col) => {
    const raw = features[col];
    if (typeof raw !== "number") {
      return [col, null];
    }
    const toDecimals = Math.round(raw * Math.pow(10, decimals)) / Math.pow(10, decimals);
    return [col, castToInt ? Math.round(toDecimals) : toDecimals];
  });
  return JSON.stringify(Object.fromEntries(entries), null, 2);
}
851
/**
 * Registry of preprocessing libraries, keyed by library name. Each adapter
 * exposes `call(fnName, text)`, which invokes the named library function on
 * `text` and returns its result.
 */
var LIBRARY_ADAPTERS = {
  "text-readability": {
    /**
     * @param fnName - Name of a function on the text-readability object.
     * @param text - Text passed as the only argument.
     * @throws {Error} If no function with that name exists.
     */
    call(fnName, text) {
      const candidate = textReadability[fnName];
      if (typeof candidate === "function") {
        // Invoke with the library object as `this`.
        return candidate.call(textReadability, text);
      }
      throw new Error(`Function "${fnName}" not found in text-readability.`);
    }
  }
};
862
/**
 * Registry of post-processing transforms, keyed by transform type. Each
 * transform receives the raw value and the transform's own options object.
 */
var POST_TRANSFORMS = {
  /**
   * Round `value` to `precision` decimal places (default 0).
   */
  round(value, { precision = 0 }) {
    const scale = Math.pow(10, precision);
    const scaled = Math.round(value * scale);
    return scaled / scale;
  }
};
868
+ function runPreprocessingStep(text, impl) {
869
+ const adapter = LIBRARY_ADAPTERS[impl.library];
870
+ if (!adapter) {
871
+ const supported = Object.keys(LIBRARY_ADAPTERS).join(", ");
872
+ throw new Error(
873
+ `Unsupported preprocessing library "${impl.library}". Supported: ${supported}.`
874
+ );
875
+ }
876
+ let result = adapter.call(impl.function, text);
877
+ if (impl.post_transform) {
878
+ const transform = POST_TRANSFORMS[impl.post_transform.type];
879
+ if (!transform) {
880
+ const supported = Object.keys(POST_TRANSFORMS).join(", ");
881
+ throw new Error(
882
+ `Unsupported post_transform type "${impl.post_transform.type}". Supported: ${supported}.`
883
+ );
884
+ }
885
+ result = transform(result, impl.post_transform);
886
+ }
887
+ return result;
888
+ }
889
+
890
+ // ../../evals/prompts/vocabulary/background-knowledge.txt
891
// Prompt template asking the model for a short (1 to 3 line) background knowledge
// assumption about an educational text. It embeds one worked example and a list of
// topics per grade (K through 8). The placeholders {grade} and {text} are filled in
// by getBackgroundKnowledgePrompt. The text is sent to the model verbatim, so any
// edit to it changes runtime behaviour.
+ var background_knowledge_default = `
892
+ Review the following text, which is an educational text written for students in the following grade band: {grade}.
893
+
894
+ Your job is to give me a background knowledge assumption; that is: what topics, if any, from the text students are likely to be familiar with based on a standard progression of topics in US public school education, as well as topics, if any the student is not likely to be familiar with.
895
+
896
+ Make sure your response is concise (between 1 - 3 lines max) and is about the topics themselves, not about any other aspect of the text (e.g. flowery language, complicated sentence structure, etc.).
897
+
898
+ Here's an example:
899
+ [START EXAMPLE]
900
+ Grade Band: 11th
901
+ Text: I went to the woods because I wished to live deliberately, to front only the essential facts of life, and see if I could not
902
+ learn what it had to teach, and not, when I came to die, discover that I had not lived. I did not wish to live what was
903
+ not life, living is so dear; nor did I wish to practise resignation, unless it was quite necessary. I wanted to live deep and suck out all the marrow of life, to live so sturdily and Spartan-like as to put to rout all that was not life, to cut a broad swath and shave close, to drive life into a corner, and reduce it to its lowest terms, and, if it proved to be mean, why then to get the whole and genuine meanness of it, and publish its meanness to the world; or if it were sublime, to
904
+ know it by experience, and be able to give a true account of it in my next excursion. For most men, it appears to me,
905
+ are in a strange uncertainty about it, whether it is of the devil or of God, and have somewhat hastily concluded that it
906
+ is the chief end of man here to "glorify God and enjoy him forever."
907
+
908
+ Background Knowledge Assumption: Assume they've studied American Transcendentalists like Thoreau and Emerson, including the mid-19th-century context of nature-focused philosophy.
909
+ [END EXAMPLE]
910
+
911
+ You should assume that the student is an average US public school who is learning from common core curriculum. When you respond, just respond with the background knowledge assumption and nothing else.
912
+
913
+ You can use the following list of topics that we know are covered for each grade level, although use your best judgement if you know there are other topics out there that students are likely to have covered. And this doesn't cover higher grade levels, so you'll have to again use your judgement for, say, what background knowledge a 9th grader is likely to have:
914
+ [BEGIN TOPICS]
915
+ [
916
+ K: [
917
+ "Toys and Play", "Weather Wonders", "Trees are Alive", "Enjoying and Appreciating Trees",
918
+ "The Five Senses: How do our senses help us learn?", "Once Upon a Farm: What makes a good story?",
919
+ "America, Then and Now: How has life in America changed over time?", "The Continents: What makes the world fascinating?",
920
+ "Needs of Plants and Animals", "Pushes and Pulls", "Sunlight and Weather", "Learning and Working Together",
921
+ "How Do People Learn and Work Together?", "Where Do We Live?", "What Does it Mean to Be an American?",
922
+ "How Has Our World Changed?", "Why Do People Have Jobs?"
923
+ ],
924
+ 1: [
925
+ "Tools and Work", "A Study of the Sun, Moon, and Stars", "Birds' Amazing Bodies", "Caring for Birds",
926
+ "A World of Books: How do books change lives around the world?", "Creature Features: What can we discover about animals' unique features?",
927
+ "Powerful Forces: How do people respond to the powerful force of the wind?", "Cinderella Stories: Why do people around the world admire Cinderella?",
928
+ "Animal and Plant Defenses", "Light and Sounds", "Spinning Earth", "Our Place in the World",
929
+ "What Are the Rights and Responsibilities of Citizens?", "How Can We Describe Where We Live?",
930
+ "How Do We Celebrate Our Country?", "How Does the Past Shape Our Lives?", "Why Do People Work?"
931
+ ],
932
+ 2: [
933
+ "Schools and Community", "Fossils Tell of Earth's Changes", "The Secret World of Pollination", "Providing for Pollinators",
934
+ "A Season of Change: How does change impact people and nature?", "The American West: What was life like in the West for early Americans?",
935
+ "Civil Rights Heroes: How can people respond to injustice?", "Good Eating: How does food nourish us?",
936
+ "Plant and Animal Relationships", "Properties of Matter", "Changing Landforms", "Exploring Who We Are",
937
+ "Why Is It Important to Learn About the Past?", "How Does Geography Help Us Understand Our World?",
938
+ "How Do We Get What We Want and Need?", "Why Do We Need Government?", "How Can People Make a Difference in Our World?"
939
+ ],
940
+ "3": [
941
+ "Overcoming Learning Challenges Near and Far", "Adaptations and the Wide World of Frogs", "Exploring Literary Classics",
942
+ "Water Around the World", "Ocean/Sea Exploration", "Outer Space", "Immigration", "Art/Being an Artist",
943
+ "Balancing Forces", "Inheritance and Traits", "Environments and Survival", "Weather and Climate",
944
+ "Communities", "Why Does It Matter Where We Live?", "What Is Our Relationship With Our Environment?",
945
+ "What Makes a Community Unique?", "How Does the Past Impact the Present?", "Why Do Governments and Citizens Need Each Other?",
946
+ "How Do People in a Community Meet Their Wants and Needs?"
947
+ ],
948
+ 4: [
949
+ "Poetry", "Animal Defense Mechanisms", "The American Revolution",
950
+ "Responding to Inequality: Ratifying the 19th Amendment (covers gender and racial inequality)",
951
+ "A Great Heart: What does it mean to have a great heart, literally and figuratively?",
952
+ "Extreme Settings: How does a challenging setting or physical environment change a person?",
953
+ "American Revolution/Multiple Perspectives", "Myths/Myth Making", "Energy Conversions", "Vision and Light",
954
+ "Earth's Features", "Waves, Energy, and Information", "Regions of the United States",
955
+ "How Does America Use Its Strengths and Face Its Challenges?", "Why Have People Moved to and From the Northeast?",
956
+ "How Has the Southeast Changed Over Time?", "How Does the Midwest Reflect the Spirit of America?",
957
+ "How Does the Southwest Reflect Its Diverse Past and Unique Environment?", "What Draws People to the West?"
958
+ ],
959
+ 5: [
960
+ "Human Rights", "Biodiversity in the Rainforest", "Athlete Leaders of Social Change",
961
+ "Impact of Natural Disasters", "Cultures in Conflict: How do cultural beliefs and values guide people?",
962
+ "Word Play: How and why do writers play with words?", "A War Between Us: How did the Civil War impact people?",
963
+ "Breaking Barriers: How can sports influence individuals and societies?", "Patterns of Earth and Sky",
964
+ "Modeling Matter", "The Earth System", "Ecosystem Restoration", "U.S. History: Making a New Nation",
965
+ "How Were the Lives of Native Peoples Influenced by Where They Lived?",
966
+ "What Happened When Diverse Cultures Crossed Paths?", "What Is the Impact of People Settling in a New Place?",
967
+ "Why Would a Nation Want to Become Independent?", "What Does the Revolutionary Era Tell Us About Our Nation Today?",
968
+ "How Does the Constitution Help Us Understand What It Means to Be an American?",
969
+ "What Do the Early Years of the United States Reveal About the Character of the Nation?",
970
+ "What Was the Effect of the Civil War on U.S. Society?"
971
+ ],
972
+ 6: [
973
+ "Greek Mythology", "Critical Problems and Design Solutions", "American Indian Boarding Schools",
974
+ "Remarkable Accomplishments in Space Science", "Resilience in the Great Depression: How can enduring tremendous hardship contribute to personal transformation?",
975
+ "A Hero's Journey: What is the significance and power of the hero's journey?",
976
+ "Narrating the Unknown: How did the social and environmental factors in the unknown world of Jamestown shape its development and decline?",
977
+ "Courage in Crisis: How can the challenges of a hostile environment inspire heroism?",
978
+ "Microbiome", "Metabolism", "Metabolism Engineering", "Traits and Reproduction", "Thermal Energy",
979
+ "Ocean, Atmosphere, and Climate", "Weather Patterns", "Earth's Changing Climate",
980
+ "Earth's Changing Climate: Engineering Internship", "The First Americans (up to 1492)",
981
+ "Exploration and Colonization", "English Colonies", "American Revolution", "First Governments and the Constitution",
982
+ "The Early American Republic", "Political and Geographic Changes (1828-1850)", "Life in the North and South (1820-1860)",
983
+ "Division and Civil War (1821-1865)", "Reconstruction (1865-1896)", "The West (1858-1896)",
984
+ "New Industry and a Changing Society", "Expansion and War", "The 1920s and 1930s", "World War II",
985
+ "The Cold War", "Civil Rights and American Society", "America Since the 1970s"
986
+ ],
987
+ 7: [
988
+ "The Lost Children of Sudan (Genocide, Genocide in Sudan)", "Epidemics", "Harlem Renaissance", "Plastic Pollution",
989
+ "Identity in the Middle Ages: How does society both support and limit the development of identity?",
990
+ "Americans All: How did World War II affect individuals?", "Language and Power: What is the power of language?",
991
+ "Fever: How can times of crisis affect citizens and society?", "Geology on Mars", "Plane Motion", "Plane Motion Engineering",
992
+ "Rock Formations", "Phase Change", "Phase Change Engineering", "Chemical Reactions", "Populations and Resources",
993
+ "Matter and Energy in Ecosystems", "Early Humans and Agricultural Revolution", "Fertile Crescent",
994
+ "Ancient Egypt and Kush", "The Israelites", "Ancient Greece", "Ancient South Asia", "Early China, Korea, and Japan",
995
+ "Ancient Rome", "Rise of Christian Kingdoms", "The Americas", "Medieval Europe", "The Rise of Islamic Empires",
996
+ "China in the Middle Ages", "Korea and Japan in the Middle Ages", "African Civilizations", "New Ways of Thinking",
997
+ "Age of Exploration and Trade", "Revolutions and Empires", "The Modern World"
998
+ ],
999
+ 8: [
1000
+ "Folklore of Latin America", "Food Choices", "The Holocaust", "Japanese American Internment",
1001
+ "The Poetics and Power of Storytelling: What is the power of storytelling?",
1002
+ "The Great War: How do literature and art illuminate the effects of World War I?", "What Is Love?",
1003
+ "Teens as Change Agents: How do people effect social change?", "Harnessing Human Energy",
1004
+ "Force and Motion", "Force and Motion Engineering", "Magnetic Fields", "Light Waves", "Earth, Moon, and Sun",
1005
+ "Natural Selection", "Natural Selection Engineering", "Evolutionary History", "The World in Spatial Terms",
1006
+ "Places and Regions", "Physical Geography", "Population Geography", "Economic Geography",
1007
+ "Political Geography", "Human-Environment Geography", "What is Economics?", "Markets, Money, and Businesses",
1008
+ "Government and the Economy", "The Global Economy"
1009
+ ]
1010
+ ]
1011
+ [END TOPICS]
1012
+
1013
+ Here is the text:
1014
+ [BEGIN TEXT]
1015
+ {text}
1016
+ [END TEXT]
1017
+ `;
1018
+
1019
+ // src/prompts/vocabulary/background-knowledge.ts
1020
+ function getBackgroundKnowledgePrompt(text, grade) {
1021
+ return background_knowledge_default.replaceAll("{grade}", grade).replaceAll("{text}", text);
1022
+ }
1023
+
1024
+ // ../../evals/prompts/vocabulary/grades-3-4-system.txt
1025
// System prompt for the vocabulary-complexity task at grades 3 and 4; getSystemPrompt
// returns it for those two grades. It embeds the four-level rubric, Flesch-Kincaid
// reference ranges, interpretation guidelines and the expected final-analysis format.
// The text is sent to the model verbatim, so any edit to it changes runtime behaviour.
+ var grades_3_4_system_default = "\nYou are an expert curriculum designer. Your job is to rate the complexity of a text's vocabulary relative to the grade level.\n\nYou will be given a rubric (with levels from least to most complex: slightly complex, moderately complex, very complex, exceedingly complex) as well as guidelines for interpreting the rubric.\nIMPORTANT: You should only pay attention to the vocabulary. Do not evaluate any other element of the text's complexity (e.g. sentence structure, meaning, etc.)\n\n**Resource 1: Qualitative Text Complexity rubric (SAP)**\n1. **Level 1: Slightly complex**\n * Original Definition: Vocabulary that is almost entirely not complex: contemporary, conversational, and/or familiar. A very low proportion of complex words (archaic, subject-specific, academic) is OK -- i.e. doesn't need to be 0.\n * Summary definition: Overall, vocabulary is easy to understand and does not impede comprehension of the bulk of the text (including main idea and supporting claims). 1-2 quick pauses for processing by the student are ok here!\n2. **Level 2: Moderately complex**\n * Original Definition: Vocabulary that is mostly not complex: contemporary, conversational, and/or familiar. A low proportion of complex words (archaic, subject-specific, academic) is OK\n * Summary definition: Overall, vocabulary generally allows students to comprehend the bulk of the text with little difficulty, though there may be occasional pauses for clarification. Several quick pauses or occasional prolonged pauses may occur.\n3. **Level 3: Very complex**\n * Original Definition: Vocabulary that is often complex: unfamiliar, archaic, subject-specific, and/or overly academic\n * Summary definition: Overall, vocabulary often presents challenges that may slow down comprehension but does not completely block the comprehension of the bulk of the text.\n4. 
**Level 4: Exceedingly complex**\n * Original Definition: Vocabulary that is mostly complex: unfamiliar, archaic, subject-specific, and/or overly academic. May be ambiguous or purposefully misleading.\n * Summary definition: Overall, vocabulary is so complex that it makes comprehension of the bulk of the text very challenging and requires careful effort to interpret.\n\n**Resource 2: Flesch-Kincaid Grade Level**\nUse the Flesch-Kincaid (FK) Grade Level as light guidance of the approximate grade level based on readability. The metric alone does not provide final information of vocabulary complexity, but a ballpark of the difficulty of the entire text.\n* grade 2-3: 1.98-5.34\n* grade 4-5: 4.51-7.73\n* grade 6-8: 6.51-10.34\n* grade 9-10: 8.32-12.12\n* grade 11-College: 10.34-14.2\n\n**Guidelines for Interpretation and Reasoning**\n\nYour reasoning is the most critical part of your analysis. It's not enough to simply count complex words. You must analyze their impact on a student at the specified grade level. Use the following principles to guide your judgment:\n\n1. **Density and Cumulative Effect:** Do not just count complex words; evaluate their concentration. A short text with a high density of challenging Tier 2 words (e.g., `peculiar`, `mischievous`, `courageous` for a 4th grader) can be more overwhelming than a longer text with a few scattered Tier 3 words. A constant barrage of unfamiliar words can elevate complexity from `very` to `exceedingly`.\n2. **Contextual Scaffolding:** Assess how the text supports new vocabulary.\n * Are new, complex terms explicitly defined or explained with simple examples (e.g., \"volume... to see if it is big enough to hold a liter of food\")?\n * Is the surrounding language simple and conversational, making the meaning of new words easier to infer?\n * Strong scaffolding can lower the complexity rating. A text with many Tier 3 words that are well-explained might only be `moderately complex`.\n3. **Abstract vs. 
Concrete Vocabulary:** Differentiate between words for abstract concepts and words for concrete things. A text built on abstract Tier 2 words (e.g., `relationships`, `performance`, `non-physical`) can be more challenging than a text that introduces Tier 3 labels for concrete things or people (e.g., `Sumerians`, `polonium`).\n4. **Conceptual Load:** Consider the cognitive load of the vocabulary. A list of many new, multi-syllabic, conceptually-heavy terms (e.g., `Paleolithic`, `Mesolithic`, `Neolithic` for a 3rd grader) can be `very complex` even if the terms are briefly defined, because the student must process multiple new concepts at once.\n5. **Calibrating the Top Levels:** Be precise in your use of `very complex` vs. `exceedingly complex`.\n * **Very complex:** The vocabulary creates significant hurdles and slows the reader down, but the main ideas of the text are still accessible with effort.\n * **Exceedingly complex:** The vocabulary is so dense, technical, or abstract that it acts as a barrier, making it nearly impossible for the target student to grasp the bulk of the text's meaning without extensive outside help. Reserve this for texts saturated with advanced terminology.\n6. **Consider Background Knowledge:** Pay close attention to the provided `student_background_knowledge`. Do not classify a word as complex if the student is likely to be familiar with it (e.g., 'oxygen' for a 3rd grader who has learned about the human body).\n\n**Final Analysis Format**\n\nProvide these information as your final analysis:\n1. **Complex vocabulary:**\n * Tier 2 words: Words that are commonly used in academic settings and more complex than colloquial, or everyday language and often have multiple meanings.\n * Tier 3 words: Overly academic or domain-specific words.\n * Archaic words: Words, or uses of words that are not commonly used in modern conversational language. 
E.g., \"The jury retired to deliberate on their verdict.\" The use of \"retire\" to mean withdrawing to a private place is an archaic use.\n * Other complex words: All other words that can increase complexity of the text (e.g., idioms, unfamiliar proper nouns that function as vocabulary).\n2. **Vocabulary complexity:** one of: slightly complex, moderately complex, very complex, exceedingly complex\n3. **Your reasoning of the complexity:** A detailed explanation of your rating, referencing the principles above.\n";
1026
+
1027
+ // ../../evals/prompts/vocabulary/other-grades-system.txt
1028
// Fallback system prompt for the vocabulary-complexity task: getSystemPrompt returns
// it for any grade value other than the strings 3 and 4. It asks the model to reason
// out loud first and then answer with an integer from 1 to 4.
// The text is sent to the model verbatim, so any edit to it changes runtime behaviour.
+ var other_grades_system_default = "\nYou are an expert curriculum designer. Your job involves reading text snippets intended for students in K-12 and evaluating the complexity of the vocabulary in each text.\n\nYou will be given a rubric (with options 1, 2, 3, 4) as well as guidelines for interpreting the rubric.\n\nIMPORTANT: You should only pay attention to the vocabulary. Do not evaluate any other element of the text's complexity (e.g. sentence structure, meainng, etc.)\nIMPORTANT: Rely on the supplied rubric and annotation guidelines along. Do not introduce any new crtieria for evaluating the complexity of a text's vocabulary.\n\nPlease first reason out loud about the vocabulary complexity of the text and then provide an answer between 1 and 4 (whole numbers only). Provide the answer as an integer (not a float).\n";
1029
+
1030
+ // src/prompts/vocabulary/system.ts
1031
+ function getSystemPrompt(grade) {
1032
+ if (grade === "3" || grade === "4") {
1033
+ return grades_3_4_system_default;
1034
+ }
1035
+ return other_grades_system_default;
1036
+ }
1037
+
1038
+ // ../../evals/prompts/vocabulary/grades-3-4-user.txt
1039
// User-message template for the grades 3-4 vocabulary-complexity task (per the
// source file name above). It contains the placeholders {student_grade_level},
// {student_background_knowledge}, {fk_level} and {text}.
// NOTE(review): presumably substituted by the caller before sending; confirm.
+ var grades_3_4_user_default = "\nBelow is the text you need to evaluate. Let's think step by step in order to predict the output of the vocabulary complexity task.\n\n- It is intended for grade {student_grade_level}.\n\n- You can assume the student has the following background knowledge about the text \u2014 this background knowledge influences which words from the text are familiar versus unfamiliar for the student: {student_background_knowledge}\n\n- Text Flesch-Kincaid grade level: {fk_level}\n\n- Text to evaluate: [BEGIN TEXT]\n{text}\n[END TEXT]\n";
1040
+
1041
+ // ../../evals/prompts/vocabulary/other-grades-user.txt
1042
+ var other_grades_user_default = `
1043
+ Your job is to rate the complexity of a text's vocabulary (relative to the intended level of the text) according to a rubric and annotation guide. Stick to the rubric and annotation guide exactly \u2014 do not introduce any additional criteria or lenses for judging the complexity of the text.
1044
+
1045
+ [BEGIN ANNOTATION GUIDE AND RUBRIC]
1046
+ Instructions
1047
+ For the following task, please assume that:
1048
+ - The student is on grade level and proficient in all core content areas, including reading fluency, comprehension, science, & social studies. (example).
1049
+ - The student is moving through a common progression of topics (detailed here).
1050
+ - The student is fluent in speaking English.
1051
+ - The student has an "average" amount of background knowledge on topics not commonly covered in curriculum.
1052
+ - The student will use this material for independent reading/work, without direct instruction.
1053
+ - The text is reasonable for the given grade level.
1054
+
1055
+ Please do not consider the presence of figurative language when scoring Vocabulary. For example: with a phrase like "kicked the bucket," consider only the qualities of the words themselves ("kicked", "the" and "bucket").
1056
+
1057
+ Please do be sure to consider:
1058
+ - all of the different types of vocabulary (listed below)
1059
+ - the overall proportion of complex words in the text - including repeated complex words.
1060
+ - the resulting holistic complexity of the vocabulary (described in the Summary section below).
1061
+
1062
+ Level 1:
1063
+ Rubric: Vocabulary that is almost entirely not complex: contemporary, conversational, and/or familiar. That said, a very low proportion of complex words (archaic, subject-specific, academic) is OK -- i.e. doesn't need to be 0.
1064
+
1065
+ Level 2:
1066
+ Rubric: Vocabulary that is mostly not complex: contemporary, conversational, and/or familiar. A low proportion of complex words (archaic, subject-specific, academic) is OK, but if it's very low, the text is probably level 1.
1067
+
1068
+ Level 3:
1069
+ Rubric: Vocabulary that is often complex: unfamiliar, archaic, subject-specific, and/or overly academic
1070
+
1071
+ Level 4:
1072
+ Rubric: Vocabulary that is mostly complex: unfamiliar, archaic, subject-specific, and/or overly academic. May be ambiguous or purposefully misleading
1073
+
1074
+ And here are some relevant definitions:
1075
+ - Conversational: Everyday language.
1076
+ - Familiar: Words that the student is likely to have seen/heard, from everyday life or their curriculum. Reminder: assume an "average" level of background knowledge.
1077
+ - Unfamiliar: Words the student has probably not heard, or are being used in an unfamiliar way.
1078
+ - For ex: 4th graders are familiar with the word "table" but may not be familiar with the use of the word with respect to data ("a table of data").
1079
+ - Note:
1080
+ - Words with in-line definitions (via appositives, or because they can be easily inferred from other parts of the text) should be evaluated as less unfamiliar.
1081
+ - For ex: "The pharaoh, a powerful ruler of ancient Egypt, was buried in a grand tomb."
1082
+ - The word "pharaoh" might be unfamiliar or subject-specific, but since is defined within the text, you can consider it a more familiar word.
1083
+ - Unfamiliar proper nouns:
1084
+ - A person's name, even if unfamiliar, generally does not add to complexity.
1085
+ - Other unfamiliar proper nouns (eg locations, organizations) do add to complexity.
1086
+
1087
+ - Subject-specific: Words that are specific to a subject or field of study that are essential for understanding concepts and engaging with the content.
1088
+ - Overly-academic: Words that are excessively formal, complex, or specialized.
1089
+ - For ex: "The agrarian societal structure of the Neolithic Revolution precipitated a paradigm shift in agriculture"
1090
+ - Archaic: A word that was common in the past but is now rarely/almost never used. Could also be a word used in an archaic way.
1091
+ - For ex: "After a long day of court proceedings, the jury 'retired' to deliberate on their verdict."
1092
+ - The word "retire" meaning to stop working may be familiar to a student, but "retire" meaning "withdrawing to a private place" is an archaic use.
1093
+
1094
+
1095
+ Examples
1096
+ The student is on-grade-level:
1097
+ - Consider a 6th grade passage about earth systems. Per NGSS standards, students are introduced to earth systems starting in 2nd grade. They encounter words like: wind, water, river, lake, solids, and liquids. For our rating purposes, we would assume most students following 2nd have encountered these words. In 5th grade, they dive more fully into earth systems concepts, learning vocabulary words like geosphere, sediment, biosphere, atmosphere, ecosystems, organisms and climate. While rating, we would consider the words listed in the NGSS standards as more familiar following that grade level. If the same passage were intended for 3rd graders, though, then the subject-specific vocabulary is likely to be unfamiliar.
1098
+
1099
+ Figurative Language
1100
+ - Kicked the bucket.
1101
+ - The pen is mightier than the sword.
1102
+ - The classroom was a zoo.
1103
+ - He ran faster than the speed of light.
1104
+ [END ANNOTATION GUIDE AND RUBRIC]
1105
+
1106
+ Here are a couple examples of texts that have already been scored along with justification for their scores, which you can use as exemplars:
1107
+ [BEGIN EXAMPLES]
1108
+
1109
+ *** EXAMPLE 1 ***
1110
+ The following text was intended for grade level 11 and received a complexity level of 1.
1111
+
1112
+ Here is the background knowledge assumption for that text: N/A
1113
+
1114
+ Here is the text:
1115
+ // START TEXT //
1116
+ "In a recent lecture, "Is Nothing Sacred?", Salman Rushdie, one of the most censored authors of our time, talked about the importance of books. He grew up in a household in India where books were as sacred as bread. If anyone in the household dropped a piece of bread or a book, the person not only picked it up, but also kissed the object by way of apologizing for clumsy disrespect.
1117
+
1118
+ He goes on to say that he had kissed many books before he had kissed a girl. Bread and books were for his household, and for many like his, food for the body and the soul. This image of the kissing of the book one had accidentally dropped made an impression on me. It speaks to the love and respect many people have for them.
1119
+
1120
+ I grew up in a small town in New Mexico, and we had very few books in our household. The first one I remember reading was my catechism book. Before I went to school to learn English, my mother taught me catechism in Spanish.
1121
+
1122
+ I remember the questions and answers I had to learn, and I remember the well-thumbed, frayed volume which was sacred to me.
1123
+
1124
+ Growing up with few books in the house created in me a desire and a need for them. When I started school, I remember visiting the one room library of our town and standing in front of the dusty shelves. In reality there were only a few shelves and not over a thousand books, but I wanted to read them all. There was food for my soul in the books, that much I realized."
1125
+ // END TEXT //
1126
+
1127
+ Here is the reasoning for that complexity level:
1128
+ // START REASONING //
1129
+ This text is a 1 for vocabulary, because the vocabulary that is used is familiar and accessible for a proficient 11th grader. Most of the words used in the text are very common everyday vocabulary for describing growing up, family life, and the importance of reading. A few examples of these very common words are: small town, book, school, learn, food, kissed, image, respect, love, speaks. There are many more in the text. In this text there are only a few "juicier" or more complex words, you can think of those as words that are less familiar, have a more abstract or nuanced meaning, or carry a very large concept. Less commonly spoken words that were used in the text were: frayed, volume, censored, clumsy, sacred. These are still well within reach of a proficient 11th grader, and would still be considered familiar, because they will have encountered them in past reading or academic studies. In the text there are a couple of words that are outliers, but they are not essential to the understanding of the larger text. One of these words or hyphenated compound phrase is well-frayed. A compound phrase is a phrase consisting of multiple words that work together to create a specific meaning or idea, often acting as a single unit in a sentence. If the meaning of individual words is familiar, it is typically quite easy for proficient readers to generalize the larger meaning that the author is implying with their word choice. In this case, proficient students will be accustomed to the phrase well, with the secondary meaning of very, rather than a description of positivity or health; and they will be accustomed to the use frayed, as in worn, aged, or damaged from use. Making the leap to identify the meaning of "well-frayed" as a book that is very used, will take only moments for a proficient 11th grader. 
Another word that stands out in the text is the word catechism, which might be new for many students based on their personal background or location, but a full understanding of what a catechism book contains is not essential for understanding the paragraph or whole text. The reader can make it through using minimum context clues to know that the catechism must be something important to his family. The type of book he learned to read before going to school is not critical for comprehension, it's enough to understand that reading was so important in his family, his mother started instruction before he even started school. Additionally, it's important to know that having one unknown word for an 11th grade reading, does not merit a rating higher than one.
1130
+
1131
+ It is worth noting that another reason this text is a 1, is that the content or topic of the passage is so familiar and covered extensively in K-12 education, i.e. reading is important, loving books, growing up; that coupled with the simple vocabulary choices, getting to the meaning of the overall text, and even the paragraphs, would be incredibly easy for a proficient 11th grader.
1132
+ // END REASONING //
1133
+ *** EXAMPLE 2 ***
1134
+ The following text was intended for grade level 5 and received a complexity level of 2.
1135
+
1136
+ Here is the background knowledge assumption for that text: Background Knowledge Assumption: Students are likely familiar with the concept of natural disasters, including hurricanes, and basic atmospheric concepts like high and low pressure from their studies on weather and climate. They may not be familiar with the specific formation processes of hurricanes or the global terminology differences (hurricane, typhoon, cyclone).
1137
+
1138
+ Here is the text:
1139
+ // START TEXT //
1140
+ Great whirling storms roar out of the oceans in many parts of the world. They are called by several names\u2014hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms.
1141
+
1142
+ Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes.
1143
+ // END TEXT //
1144
+
1145
+ Here is the reasoning for that complexity level:
1146
+ // START REASONING //
1147
+ I scored this a 2 because of the density of subject-specific vocabulary related to weather and climate, which is often covered in lower grade levels. This adds to the complexity above a 1, but it is not a level 3 because of the familiarity with the topic, which implies some familiarity with the vocabulary as well. The specific formation process and the vocabulary used to explain the processes are also subject-specfiic but not famliar, which would make the second paragraph a level 3 in the rubric language, but when considering the language used in the overall SUMMARY below the rubric, this new content and vocabulary would cause quick pauses and/or occasional prolonged pauses but would not cause the reader to slow down to due to challenging overall comprehension of the key ideas and supporting claims. This is especially the case because the second paragraph builds upon prior knowledge and familiar vocabulary use, so it is not entirely new information and vocabulary. While there is subject-specific vocabulary used, overly academic vocabulary is NOT used and is more conversational in nature, such as "great whiring storms" and "born" / "giving birth" to storm (although this is the way storms are described!) rather than more technical terms which made comprehension easier due to the accessibility of the vocabulary (even if used in other contexts before reading this text). Words such as "a lot" and "bigger" are more conversational, and while technical, unfamiliar words are provided, such as "hurricane," "typhoon," and "cyclone," knowing and understanding their differences is not necessary to grasp the main idea. The processes by which they are formed are what need to be retained while reading the entire text, and familiarity with the bulk of the vocabulary used would allow for that to happen without too much struggle to make meaning of it. 
Additionally, the text does not contain any archaic vocabulary or ambiguous words, which prevents it from reaching a rating of 4, although it is not necessary that they text have such vocabulary to meet a level 4, the frequent inclusion of such vocabulary makes it more likely to land at least a 3 or 4.
1148
+ // END REASONING //
1149
+
1150
+ *** EXAMPLE 3 ***
1151
+ The following text was intended for grade level 6 and received a complexity level of 3.
1152
+
1153
+ Here is the background knowledge assumption for that text: Background Knowledge Assumption: Students are likely familiar with basic Earth science concepts such as rocks, minerals, and fossils, as well as natural processes like volcanic eruptions and earthquakes. They may not be familiar with more advanced topics like plate tectonics or the specific branches of geology such as mineralogy, petrology, and seismology.
1154
+
1155
+ Here is the text:
1156
+ // START TEXT //
1157
+ Geology is the scientific study of Earth. Geologists study the planet\u2014its formation, its internal structure, its materials, its chemical and physical processes, and its history. Mountains, valleys, plains, sea floors, minerals, rocks, fossils, and the processes that create and destroy each of these are all the domain of the geologist. Geology is divided into two broad categories of study: physical geology and historical geology.
1158
+
1159
+ Physical geology is concerned with the processes occurring on or below the surface of Earth and the materials on which they operate. These processes include volcanic eruptions, landslides, earthquakes, and floods. Materials include rocks, air, seawater, soils, and sediment. Physical geology further divides into more specific branches, each of which deals with its own part of Earth's materials, landforms, and processes. Mineralogy and petrology investigate the composition and origin of minerals and rocks. Volcanologists study lava, rocks, and gases on live, dormant, and extinct volcanoes. Seismologists use instruments to monitor and predict earthquakes and volcanic eruptions.
1160
+
1161
+ Historical geology is concerned with the chronology of events, both physical and biological, that have taken place in Earth's history. Paleontologists study fossils (remains of ancient life) for evidence of the evolution of life on Earth. Fossils not only relate evolution, but also speak of the environment in which the organism lived. Corals in rocks at the top of the Grand Canyon in Arizona, for example, show a shallow sea flooded the area around 290 million years ago. In addition, by determining the ages and types of rocks around the world, geologists piece together continental and oceanic history over the past few billion years. Plate tectonics (the study of the movement of the sections of Earth's crust) adds to Earth's story with details of the changing configuration of the continents and oceans.
1162
+ // END TEXT //
1163
+
1164
+ Here is the reasoning for that complexity level:
1165
+ // START REASONING //
1166
+ To determine the complexity rating of this text based on the vocabulary present, I used the annotation guide, scoring rubric, and examples to set the expectations for rating. During the first read of the text, I "bolded" and categorized the more challenging vocabulary words according to the following complexity groupings: archaic, unfamiliar, archaic, subject-specific, and/or overly academic. On the second read, I considered the main idea or "gist" that students need to acquire understanding of. I then referenced the previously mentioned tools\u2013annotation guide, scoring rubric, and examples to remind myself of the expectations for rating. I agreed that readers would have familiarity with basic concepts of geology; however, I also considered the definitions provided for words such as Geology, Geologists, Physical Geology, Historical Geology, Mineralogy, and Petrology. I considered how students might pause for clarification and for how long. After reviewing the Annotation Guide while considering, I narrowed the rating down because the definitions provided throughout the text of more complex words should make the meaning of the text more accessible for readers, which is why although the words are subject-specific, I rated this text as a 3 instead of a 2-less complex or a 4\u2013more complex. I read the text one final time to ensure clarity around my rating, scored and wrote the justification.
1167
+ // END REASONING //
1168
+ [END EXAMPLES]
1169
+
1170
+ Below is the text you need to evaluate. It is intended for grade {student_grade_level}.
1171
+
1172
+ As you read the text, you can assume the student has the following background knowledge about the text \u2014 this background knowledge influences which words from the text are familiar versus unfamiliar for the student: {student_background_knowledge}
1173
+
1174
+ [BEGIN TEXT]
1175
+ {text}
1176
+ [END TEXT]
1177
+
1178
+ In your response, when specifying the level of complexity, be sure to use only a single integer (e.g. 2) and don't include any other text (e.g. don't say "level 2").
1179
+ `;
1180
+
1181
+ // src/prompts/vocabulary/user.ts
1182
/**
 * Build the user prompt for the vocabulary complexity stage.
 *
 * Grades "3" and "4" use `grades_3_4_user_default`; every other grade uses
 * `other_grades_user_default`. Every occurrence of the placeholders
 * `{student_grade_level}`, `{student_background_knowledge}`, `{fk_level}` and
 * `{text}` in the chosen template is filled in.
 *
 * @param text - Passage being evaluated; inserted verbatim.
 * @param studentGradeLevel - Target grade as a string (e.g. "5").
 * @param studentBackgroundKnowledge - Background-knowledge assumption; inserted verbatim.
 * @param fkLevel - Flesch-Kincaid grade level; rendered with `toString()`.
 * @returns The template with all four placeholders substituted.
 */
function getUserPrompt(text, studentGradeLevel, studentBackgroundKnowledge, fkLevel) {
  const template = studentGradeLevel === "3" || studentGradeLevel === "4" ? grades_3_4_user_default : other_grades_user_default;
  const values = {
    student_grade_level: studentGradeLevel,
    student_background_knowledge: studentBackgroundKnowledge,
    fk_level: fkLevel.toString(),
    text
  };
  // One pass over the template with a function replacer:
  //  - the result of a function replacer is inserted literally, so "$$", "$&",
  //    "$`" and "$'" inside the passage are not treated as replacement patterns;
  //  - inserted values are never re-scanned, so a value that happens to contain
  //    "{text}" or "{fk_level}" cannot be rewritten by a later substitution.
  return template.replace(
    /\{(student_grade_level|student_background_knowledge|fk_level|text)\}/g,
    (_placeholder, key) => String(values[key])
  );
}
1186
+
1187
+ // src/evaluators/vocabulary.ts
1188
/**
 * Two-stage vocabulary complexity evaluator.
 *
 * Stage 1 asks the background-knowledge provider what a student at the target
 * grade is assumed to know about the text. Stage 2 feeds the text, that
 * assumption and the Flesch-Kincaid grade to a complexity provider chosen by
 * grade ("3"/"4" vs. everything else) and returns its structured verdict.
 */
var VocabularyEvaluator = class _VocabularyEvaluator extends BaseEvaluator {
  static metadata = {
    id: "vocabulary",
    name: "Vocabulary",
    description: "Evaluates vocabulary complexity of educational texts relative to grade level",
    supportedGrades: ["3", "4", "5", "6", "7", "8", "9", "10", "11", "12"],
    defaultProviders: ["google", "openai"]
  };
  grades34ComplexityProvider;
  otherGradesComplexityProvider;
  backgroundKnowledgeProvider;
  /**
   * @param config - Evaluator configuration; `googleApiKey` and `openaiApiKey`
   *   are forwarded to the providers created here.
   */
  constructor(config) {
    super(config);
    // Stage 2 provider for grades 3-4.
    this.grades34ComplexityProvider = this.createConfiguredProvider(
      "google",
      "gemini-2.5-pro",
      config.googleApiKey
    );
    // Stage 2 provider for every other grade.
    this.otherGradesComplexityProvider = this.createConfiguredProvider(
      "openai",
      "gpt-4.1-2025-04-14",
      config.openaiApiKey
    );
    // Stage 1 provider, shared by all grades.
    this.backgroundKnowledgeProvider = this.createConfiguredProvider(
      "openai",
      "gpt-4o-2024-11-20",
      config.openaiApiKey
    );
  }
  /**
   * Evaluate vocabulary complexity for a given text and grade level
   *
   * @param text - The text to evaluate
   * @param grade - The target grade level (3-12)
   * @returns Evaluation result with complexity score and detailed analysis
   * @throws {ValidationError} If text is empty, too short/long, or grade is invalid
   * @throws {ConfigurationError} If modelOverride specifies a model ID that the provider rejects
   * @throws {APIError} If LLM API calls fail (includes AuthenticationError, RateLimitError, NetworkError, TimeoutError)
   */
  async evaluate(text, grade) {
    this.logger.info("Starting vocabulary evaluation", {
      evaluator: "vocabulary",
      operation: "evaluate",
      grade,
      textLength: text.length
    });
    const startTime = Date.now();
    const stageDetails = [];
    const usesGrades34Provider = grade === "3" || grade === "4";
    const complexityProviderLabel = usesGrades34Provider ? this.grades34ComplexityProvider.label : this.otherGradesComplexityProvider.label;
    const backgroundProviderLabel = this.backgroundKnowledgeProvider.label;
    // With a model override only the background provider's label is reported;
    // otherwise both stage labels are joined with "+".
    const modelLabel = this.config.modelOverride ? backgroundProviderLabel : `${backgroundProviderLabel}+${complexityProviderLabel}`;
    // Appends one finished stage (latency + token usage) to the telemetry trail.
    const recordStage = (stage, provider, stageResponse) => {
      stageDetails.push({
        stage,
        provider,
        latency_ms: stageResponse.latencyMs,
        token_usage: {
          input_tokens: stageResponse.usage.inputTokens,
          output_tokens: stageResponse.usage.outputTokens
        }
      });
    };
    // Token totals across the stages recorded so far; missing counts add 0.
    const sumTokenUsage = () => ({
      input_tokens: stageDetails.reduce((sum, s) => sum + (s.token_usage?.input_tokens || 0), 0),
      output_tokens: stageDetails.reduce((sum, s) => sum + (s.token_usage?.output_tokens || 0), 0)
    });
    try {
      this.validateText(text);
      this.validateGrade(grade, new Set(_VocabularyEvaluator.metadata.supportedGrades));
      this.logger.debug("Stage 1: Generating background knowledge", {
        evaluator: "vocabulary",
        operation: "background_knowledge"
      });
      const bgResponse = await this.getBackgroundKnowledgeAssumption(text, grade);
      recordStage("background_knowledge", backgroundProviderLabel, bgResponse);
      const fkLevel = calculateFleschKincaidGrade(text);
      const complexityResponse = await this.evaluateComplexity(
        text,
        grade,
        bgResponse.knowledge.assumption,
        fkLevel
      );
      recordStage("complexity_evaluation", complexityProviderLabel, complexityResponse);
      const latencyMs = Date.now() - startTime;
      const totalTokenUsage = sumTokenUsage();
      const verdict = complexityResponse.data;
      const result = {
        score: verdict.complexity_score,
        reasoning: verdict.reasoning,
        metadata: {
          model: modelLabel,
          processingTimeMs: latencyMs
        },
        _internal: verdict
      };
      // Telemetry is best-effort: a rejected send must not fail the evaluation.
      this.sendTelemetry({
        status: "success",
        latencyMs,
        textLength: text.length,
        grade,
        provider: modelLabel,
        tokenUsage: totalTokenUsage,
        metadata: {
          stage_details: stageDetails
        },
        inputText: text
      }).catch(() => {
      });
      this.logger.info("Vocabulary evaluation completed successfully", {
        evaluator: "vocabulary",
        operation: "evaluate",
        grade,
        score: result.score,
        processingTimeMs: latencyMs
      });
      return result;
    } catch (error) {
      const latencyMs = Date.now() - startTime;
      const hasCompletedStages = stageDetails.length > 0;
      this.logger.error("Vocabulary evaluation failed", {
        evaluator: "vocabulary",
        operation: "evaluate",
        grade,
        error: error instanceof Error ? error : undefined,
        processingTimeMs: latencyMs,
        completedStages: stageDetails.length
      });
      // Report whatever usage was accumulated before the failure, if any.
      const totalTokenUsage = hasCompletedStages ? sumTokenUsage() : undefined;
      this.sendTelemetry({
        status: "error",
        latencyMs,
        textLength: text.length,
        grade,
        provider: modelLabel,
        tokenUsage: totalTokenUsage,
        errorCode: error instanceof Error ? error.name : "UnknownError",
        metadata: hasCompletedStages ? { stage_details: stageDetails } : undefined,
        inputText: text
      }).catch(() => {
      });
      // Validation errors are rethrown untouched; everything else is wrapped.
      if (error instanceof ValidationError) {
        throw error;
      }
      throw wrapProviderError(error, "Vocabulary evaluation failed");
    }
  }
  /**
   * Stage 1: Generate background knowledge assumption
   *
   * Sends the background-knowledge prompt for `text` and `grade` as a single
   * user message at temperature 0 and returns the trimmed response text.
   *
   * @returns `{ knowledge: { assumption, grade }, usage, latencyMs }`
   */
  async getBackgroundKnowledgeAssumption(text, grade) {
    const messages = [{ role: "user", content: getBackgroundKnowledgePrompt(text, grade) }];
    // temperature = 0 for consistency
    const response = await this.backgroundKnowledgeProvider.generateText(messages, 0);
    const assumption = response.text.trim();
    return {
      knowledge: { assumption, grade },
      usage: response.usage,
      latencyMs: response.latencyMs
    };
  }
  /**
   * Stage 2: Evaluate vocabulary complexity
   *
   * Builds the system and user prompts, picks the provider by grade
   * ("3"/"4" use `grades34ComplexityProvider`, all other grades use
   * `otherGradesComplexityProvider`) and requests a structured response
   * validated against `VocabularyComplexitySchema` at temperature 0.
   *
   * @returns `{ data, usage, latencyMs }` taken from the provider response
   */
  async evaluateComplexity(text, grade, backgroundKnowledge, fkLevel) {
    const systemPrompt = getSystemPrompt(grade);
    const userPrompt = getUserPrompt(text, grade, backgroundKnowledge, fkLevel);
    let provider = this.otherGradesComplexityProvider;
    if (grade === "3" || grade === "4") {
      provider = this.grades34ComplexityProvider;
    }
    const { data, usage, latencyMs } = await provider.generateStructured({
      messages: [
        { role: "system", content: systemPrompt },
        { role: "user", content: userPrompt }
      ],
      schema: VocabularyComplexitySchema,
      temperature: 0
    });
    return { data, usage, latencyMs };
  }
};
1386
/**
 * Structured-output schema for the sentence-structure analysis stage.
 * Field order is significant (it is the order the schema presents to the model),
 * so fields are listed exactly as before; only the construction is factored out.
 */
var SentenceAnalysisSchema = (() => {
  // Integer field, optionally carrying a description.
  const count = (description) => {
    const integer = z.number().int();
    return description === undefined ? integer : integer.describe(description);
  };
  // Non-integer numeric field with a description.
  const measure = (description) => z.number().describe(description);
  return z.object({
    reasoning: z.string().describe("Step-by-step reasoning for the analysis"),
    // Foundational
    num_sentences: count("Total number of sentences in the text"),
    num_words: count("Total number of words in the text"),
    flesch_kincaid_grade: measure("Flesch-Kincaid Grade Level number"),
    // Sentence Type
    num_simple_sentences: count("Number of simple sentences"),
    num_compound_sentences: count("Number of compound sentences"),
    num_complex_sentences: count("Number of complex sentences"),
    num_compound_complex_sentences: count("Number of compound-complex sentences"),
    num_other_sentences: count("Number of other sentence types"),
    // Subordination
    num_independent_clauses: count(),
    num_subordinate_clauses: count(),
    num_total_clauses: count(),
    num_sentences_with_subordinate: count(),
    num_sentences_with_multiple_subordinates: count(),
    num_sentences_with_embedded_clauses: count(),
    // Informational Phrases
    num_prepositional_phrases: count(),
    num_participle_phrases: count(),
    num_appositive_phrases: count(),
    // Cohesion
    num_simple_transitions: count(),
    num_sophisticated_transitions: count(),
    // Sentence Type Density
    words_in_simple_sentences: count(),
    words_in_compound_sentences: count(),
    words_in_complex_sentences: count(),
    words_in_compound_complex_sentences: count(),
    words_in_other_sentences: count(),
    // Additional Features
    sentence_word_counts: z.array(count()),
    num_one_concept_sentences: count(),
    num_multi_concept_sentences: count(),
    num_cleft_sentences: count(),
    max_clauses_in_any_sentence: count(),
    // Grades 5-12 specific
    num_compound: count("Number of compound sentences"),
    num_basic_complex: count("Number of basic complex sentences"),
    num_advanced_complex: count("Number of advanced complex sentences"),
    percentage_simple: measure("Percentage of simple sentences"),
    percentage_compound: measure("Percentage of compound sentences"),
    percentage_basic_complex: measure("Percentage of basic complex sentences"),
    percentage_advanced_complex: measure("Percentage of advanced complex sentences")
  });
})();
/**
 * Structured-output schema for the complexity classification stage:
 * free-text reasoning followed by a `TextComplexityLevel` answer.
 */
var ComplexityClassificationSchema = z.object({
  reasoning: z.string().describe("Detailed pedagogically appropriate reasoning"),
  answer: TextComplexityLevel
});
1437
+
1438
+ // ../../evals/prompts/sentence-structure/analysis-system.txt
1439
+ var analysis_system_default = "You are an expert in grammar and literacy.";
1440
+
1441
+ // ../../evals/prompts/sentence-structure/analysis-user.txt
1442
+ var analysis_user_default = `
1443
+ # Task
1444
+ I am going to give you a text, and I need you to look through the text sentence-by-sentence to perform a comprehensive grammatical analysis. Use the computational counts as a reference; they can be incorrect in ambiguous cases.
1445
+
1446
+ # Definitions
1447
+ * Sentences: Count a complete grammatical unit ending in a terminal punctuation mark.
1448
+ * Words: Count any sequence of characters separated by a space as one word. Treat hyphenated words (e.g., "state-of-the-art") and numbers (e.g., "2025") as single words.
1449
+ * Independent Clauses: Clauses that can stand alone as a complete sentence.
1450
+ * Subordinate Clauses: Clauses that are dependent on the main clause and cannot stand alone as a complete sentence.
1451
+ * Simple Sentences: Sentences with one independent clause and no subordinate clauses.
1452
+ * Compound Sentences: Sentences with two or more independent clauses and no subordinate clauses.
1453
+ * Complex Sentences: Sentences with one independent clause and at least one subordinate clause.
1454
+ * Compound-Complex Sentences: Sentences with two or more independent clauses and at least one subordinate clause.
1455
+ * Other / Non-Canonical Sentences: Sentences that cannot be reliably classified as simple, compound, complex, or compound-complex (e.g., sentence fragments, run-ons, elliptical responses, headlines, imperatives lacking an explicit subject, or stylized dialogue tags).
1456
+ * Subordinate Clauses: Clauses that are dependent on the main clause and cannot stand alone as a complete sentence.
1457
+ * Embedded Clauses: Clauses that are nested within another clause.
1458
+ * Prepositional Phrases: Phrases that begin with a preposition and end with a noun phrase.
1459
+ * Participle Phrases: Phrases that begin with a participle and end with a noun phrase.
1460
+ * Appositive Phrases: Phrases that rename or identify a noun phrase.
1461
+ * Simple Transitions: Basic coordinating conjunctions and chronological adverbs. Examples: 'and', 'but', 'or', 'so', 'then', 'next', 'first'.
1462
+ * Sophisticated Transitions: Conjunctive adverbs and phrases signaling logical relationships. Examples: 'however', 'therefore', 'consequently', 'as a result', 'for example', 'although'.
1463
+ * One-Concept Sentence: A sentence with ZERO subordinate clauses AND ZERO transition words/phrases (neither simple nor sophisticated).
1464
+ * Multi-Concept Sentence: Any sentence that has \u22651 subordinate clause OR \u22651 transition word/phrase (or both).
1465
+ * Basic Complex Sentences: Sentences with exactly one independent clause and at one dependent (subordinate) clause.
1466
+ * Advanced Complex Sentences: Sentences with two or more of any of those following (can include a mix, doesn't have to be two of the same type) subordinate phrases, clauses, transition words, or any other meaningful "interruptions" to the flow of the sentence (like not-only-but-also constructions, dashes, semicolons, and lengthy appositives). A sentence can be advanced complex if it has just one subordinate phrase or clause alongside a transition phrase, like: "For example, the British favored trade with Hong Kong, assuming favorable trade conditions.
1467
+
1468
+ # Computational Counts
1469
+ Use these as reference, your internal heuristics can be more reliable.
1470
+ {ground_truth_counts}
1471
+
1472
+ # Text to Analyze
1473
+ [BEGIN TEXT]
1474
+ {text}
1475
+ [END TEXT]
1476
+
1477
+ IMPORTANT: Your response should be a single JSON object with the following structure. Do not produce anything outside of the JSON object.
1478
+
1479
+ {format_instructions}
1480
+ `;
1481
+
1482
+ // src/prompts/sentence-structure/analysis.ts
1483
/**
 * System prompt for the sentence-structure analysis stage.
 *
 * @returns The constant `analysis_system_default`, unchanged.
 */
function getSystemPromptAnalysis() {
  const systemPrompt = analysis_system_default;
  return systemPrompt;
}
1486
/**
 * Build the user prompt for the sentence-structure analysis stage.
 *
 * Fills `{text}` and `{ground_truth_counts}` in `analysis_user_default` and
 * removes `{format_instructions}` (it is replaced with an empty string).
 *
 * @param text - Passage to analyze; inserted verbatim.
 * @param groundTruthCounts - Pre-computed counts, already rendered as text.
 * @returns The filled-in prompt.
 */
function getUserPromptAnalysis(text, groundTruthCounts) {
  const values = {
    text,
    ground_truth_counts: groundTruthCounts,
    format_instructions: ""
  };
  // Single pass with a function replacer: inserted values are taken literally
  // ("$$", "$&" etc. in the passage are not expanded) and are never re-scanned,
  // so a passage containing "{format_instructions}" cannot absorb the
  // substitution meant for the template's own placeholder.
  return analysis_user_default.replace(
    /\{(text|ground_truth_counts|format_instructions)\}/g,
    (_placeholder, key) => String(values[key])
  );
}
1489
+
1490
+ // ../../evals/prompts/sentence-structure/complexity-system.txt
1491
+ var complexity_system_default = "You are an expert in grammar and literacy, and understand K-12 and Qualitative Text Complexity rubric (SAP).";
1492
+
1493
+ // ../../evals/prompts/sentence-structure/complexity-user.txt
1494
+ var complexity_user_default = '\nYour task is to perform a text complexity analysis for a Grade {grade} student. You will be given a text excerpt and a set of quantitative sentence-level statistics for that text.\n\nYou must integrate both the qualitative aspects of the text and the quantitative statistics to make your final judgment. Do not rely on the numbers alone.\n\n1. Read the TEXT EXCERPT to understand its topic, conceptual load, and overall structure.\n2. Review the TEXT STATISTICS as a guide for complexity level.\n3. Synthesize your findings in your reasoning. Explain how the structure (qualitative) interact with the text statistics (quantitative) to determine the complexity. For example, a text with simple sentences might still be complex if the topic is very dense or abstract.\n\nYour final answer must be one of ["Slightly Complex," "Moderately Complex," "Very Complex", "Exceedingly Complex"].\n\n# GRADE {grade} RUBRIC\n{rubric}\n\n# TEXT EXCERPT\n[BEGIN TEXT]\n{excerpt}\n[END TEXT]\n\n# TEXT STATISTICS\n{sentence_features}\n\n# OUTPUT FORMAT\n{format_instructions}\n';
1495
+
1496
+ // ../../evals/prompts/sentence-structure/rubric-grade-3.txt
1497
+ var rubric_grade_3_default = '\n **Instructions for Analysis:** First, evaluate if the text meets the criteria for "Slightly Complex" or "Exceedingly Complex". If it does not fit into these categories, then decide between "Moderately Complex" and "Very Complex".\n\n **Slightly Complex:**\n * **Description:** The text consists of simple, straightforward language and sentence structures.\n * **Statistical Guidelines:** The text is likely "Slightly Complex" if it meets at least TWO of the following criteria:\n * **Sentence Type:** Primarily simple sentences. (`percent_simple_sentences` is typically > 60%).\n * **Sentence Length:** Short sentences. (`avg_sentence_length` is typically < 12 words).\n * **Subordination:** Very low use of clauses. (`percent_sentences_with_subordinate` is typically < 25%).\n\n **Moderately Complex:**\n * **Description:** The text shows a mix of simple and more complex sentences, introducing some variety in structure without being overly demanding.\n * **Statistical Guidelines:** If the text is not "Slightly Complex", consider "Moderately Complex" if it generally aligns with these ranges:\n * **Sentence Type:** A balanced mix of sentence types. (`percent_simple_sentences` is typically between 40% and 60%).\n * **Sentence Length:** Medium length sentences. (`avg_sentence_length` is typically between 12 and 16 words).\n * **Subordination:** A moderate use of clauses. (`percent_sentences_with_subordinate` is typically between 25% and 45%).\n\n **Very Complex:**\n * **Description:** The text features more elaborate sentences with multiple clauses and ideas, requiring more effort from the reader to parse. This is often the default category for grade-level text that isn\'t simple or exceptionally difficult.\n * **Statistical Guidelines:** If the text is more complex than "Moderately" but does not meet the "Exceedingly" criteria, it is likely "Very Complex". Key indicators include:\n * **Sentence Type:** Complex structures are common. 
(`percent_simple_sentences` is a minority, typically < 40%).\n * **Sentence Length:** Longer sentences are frequent. (`avg_sentence_length` is typically between 16 and 19 words).\n * **Subordination:** Subordinate clauses are a key feature. (`percent_sentences_with_subordinate` is typically > 45%).\n\n **Exceedingly Complex:**\n * **Description:** The text is dense with very long, intricate sentences and a high degree of subordination, making it exceptionally challenging for this grade level.\n * **Statistical Guidelines:** The text is "Exceedingly Complex" if it shows an extreme combination of sentence length and structural density. It should meet at least **TWO** of the following criteria, including at least **ONE** from the "Structural Density" group.\n * **Structural Density Indicators:**\n * High Subordination: `percent_sentences_with_subordinate` is extensive (typically > 50%).\n * Multiple Subordinates: `percent_sentences_with_multiple_subordinates` is consistently present (typically > 12%).\n * High Syntactic Complexity: `percent_compound_complex_sentences` is significant (typically > 15%).\n * **Length Indicators:**\n * Extreme Sentence Length: `avg_sentence_length` is very long (typically > 19 words).\n * Low Simplicity: `percent_simple_sentences` is very low (typically < 30%).\n * Concentrated Length: `percent_very_long_sentences` is notable (typically > 10%).\n';
1498
+
1499
+ // ../../evals/prompts/sentence-structure/rubric-grade-4.txt
1500
+ var rubric_grade_4_default = '\n **Instructions for Analysis:** First, evaluate if the text meets the criteria for "Slightly Complex" or "Exceedingly Complex". If it does not fit into these categories, then decide between "Moderately Complex" and "Very Complex".\n\n **Slightly Complex:**\n * **Description:** The text uses clear, direct language with basic sentence structures appropriate for developing readers.\n * **Statistical Guidelines:** The text is likely "Slightly Complex" if it meets at least TWO of the following criteria:\n * **Sentence Type:** Dominated by simple sentences. (`percent_simple_sentences` is typically > 55%).\n * **Sentence Length:** Short to medium sentences. (`avg_sentence_length` is typically < 13 words).\n * **Subordination:** Infrequent use of clauses. (`percent_sentences_with_subordinate` is typically < 30%).\n\n **Moderately Complex:**\n * **Description:** The text contains a variety of sentence structures, including compound and complex sentences, but remains accessible.\n * **Statistical Guidelines:** If the text is not "Slightly Complex", consider "Moderately Complex" if it generally aligns with these ranges:\n * **Sentence Type:** A healthy mix of sentence types. (`percent_simple_sentences` is typically between 40% and 55%).\n * **Sentence Length:** Medium length sentences. (`avg_sentence_length` is typically between 13 and 17 words).\n * **Subordination:** A moderate number of clauses. (`percent_sentences_with_subordinate` is typically between 30% and 50%).\n\n **Very Complex:**\n * **Description:** The text is characterized by longer sentences and the regular use of dependent clauses, requiring readers to track multiple ideas. This is the default for challenging, on-grade-level texts.\n * **Statistical Guidelines:** If the text is more complex than "Moderately" but does not meet the "Exceedingly" criteria, it is likely "Very Complex". Key indicators include:\n * **Sentence Type:** Simple sentences are a clear minority. 
(`percent_simple_sentences` is typically < 40%).\n * **Sentence Length:** Sentences are consistently long. (`avg_sentence_length` is typically between 17 and 22 words).\n * **Subordination:** Subordination is a major feature. (`percent_sentences_with_subordinate` is typically > 50%).\n * **Multiple Subordination:** Sentences with multiple clauses appear more often. (`percent_sentences_with_multiple_subordinates` is typically > 8%).\n\n **Exceedingly Complex:**\n * **Description:** The text\'s structure is highly sophisticated and dense, marked by extensive use of embedded clauses and long, flowing sentences that are well above grade-level expectations.\n * **Statistical Guidelines:** A text is "Exceedingly Complex" if its structure is highly sophisticated and dense. It should meet at least **TWO** of the following criteria, including at least **ONE** from the "Structural Density" group.\n * **Structural Density Indicators:**\n * High Subordination: `percent_sentences_with_subordinate` is very high (typically > 60%).\n * Multiple Subordinates: `percent_sentences_with_multiple_subordinates` is high and consistent (typically > 15%).\n * High Syntactic Complexity: `percent_compound_complex_sentences` is a notable feature (typically > 20%).\n * **Length Indicators:**\n * Extreme Sentence Length: `avg_sentence_length` is exceptionally long (typically > 22 words).\n * Low Simplicity: `percent_simple_sentences` is very low (typically < 25%).\n * Concentrated Length: `percent_very_long_sentences` is significant (typically > 15%).\n';
1501
+
1502
+ // ../../evals/prompts/sentence-structure/rubric-grades-5-12.txt
1503
+ var rubric_grades_5_12_default = "\n **Slightly Complex:** A text is in the Slightly Complex bucket if it has at least 50% simple sentences. If it doesn't, the text is a higher level of complexity. If the % of simple sentences is >= 50% and the % of compound sentences is >= 20%, the text is Moderately Complex, otherwise, the text is Slightly Complex. Slightly Complex texts NEVER have advanced complex sentences \u2014 the presence of an advanced complex sentence always leads to a higher level of complexity than Slightly.\n **For Moderately Complex:** These texts can take on any distribution of sentence types as long as there aren't more than 2 advanced complex sentences and as long as there aren't so many simple sentences that the text becomes Slightly Complex. That means Moderately Complex texts may have many simple sentences (although not so many that the text is Slightly Complex), compound sentences, and/or basic complex sentences. It's also possible for a moderately complex text to contain one or two advanced complex sentences, as long as there aren't more than 2. If there are more than 2, then the text is either Very or Exceedingly complex.\n **Very Complex:** These texts contain 3 or more advanced complex sentences (unless the percentage of advanced complex sentences is >= 65)%, in which case the text becomes Exceedingly Complex). They may still contain many simple, compound, and basic complex sentences, but a text is not Very Complex unless there are 3 or more advanced complex sentences.\n **Exceedingly Complex:** These texts have 65%+ of their sentences being advanced complex sentences.\n";
1504
+
1505
+ // src/prompts/sentence-structure/complexity.ts
1506
/**
 * Accessor for the system prompt of the complexity-classification stage.
 *
 * @returns The embedded `complexity_system_default` prompt text, unmodified.
 */
function getSystemPromptComplexity() {
  const systemPrompt = complexity_system_default;
  return systemPrompt;
}
1509
/**
 * Selects the sentence-structure rubric for a grade.
 *
 * Only the exact strings "3" and "4" have dedicated rubrics; any other value
 * falls through to the shared grades 5-12 rubric.
 *
 * @param grade - Grade identifier, compared with strict equality against "3" and "4".
 * @returns The rubric text for that grade.
 */
function getRubricForGrade(grade) {
  switch (grade) {
    case "3":
      return rubric_grade_3_default;
    case "4":
      return rubric_grade_4_default;
    default:
      return rubric_grades_5_12_default;
  }
}
1518
+ function getUserPromptComplexity(sentenceFeatures, grade, excerpt) {
1519
+ const rubric = getRubricForGrade(grade);
1520
+ return complexity_user_default.replace("{sentence_features}", sentenceFeatures).replace("{grade}", grade).replace("{rubric}", rubric).replace("{excerpt}", excerpt).replace("{format_instructions}", "");
1521
+ }
1522
+
1523
+ // src/evaluators/sentence-structure.ts
1524
/**
 * Maps a complexity label onto its canonical display form.
 *
 * Matching is case-insensitive and treats underscores, hyphens and runs of
 * whitespace as a single space, so "very_complex", "Very-Complex" and
 * "  very   complex " all resolve to "Very complex". "extremely complex" is
 * accepted as an alias of "Exceedingly complex".
 *
 * A Map is used for the lookup so that names inherited from Object.prototype
 * (e.g. "constructor") can never be returned as if they were labels.
 *
 * @param label - Raw label; non-string and empty values yield null.
 * @returns The canonical label, or null when the input is not recognized.
 */
function normalizeLabel(label) {
  if (typeof label !== "string") {
    return null;
  }
  const canonicalLabels = new Map([
    ["slightly complex", "Slightly complex"],
    ["moderately complex", "Moderately complex"],
    ["very complex", "Very complex"],
    ["exceedingly complex", "Exceedingly complex"],
    ["extremely complex", "Exceedingly complex"]
  ]);
  const lookupKey = label.toLowerCase().replace(/[\s_-]+/g, " ").trim();
  return canonicalLabels.get(lookupKey) ?? null;
}
1538
/**
 * Evaluator that rates sentence-structure complexity in two LLM stages:
 * a grammatical analysis of the text ("sentence_analysis") followed by a
 * rubric-based classification ("complexity_classification").
 */
var SentenceStructureEvaluator = class _SentenceStructureEvaluator extends BaseEvaluator {
  static metadata = {
    id: "sentence-structure",
    name: "Sentence Structure",
    description: "Evaluates sentence structure complexity based on grammatical features",
    supportedGrades: ["3", "4", "5", "6", "7", "8", "9", "10", "11", "12"],
    defaultProviders: ["openai" /* OpenAI */]
  };
  provider;
  constructor(config) {
    super(config);
    this.provider = this.createConfiguredProvider("openai" /* OpenAI */, "gpt-4o", config.openaiApiKey);
  }
  /**
   * Evaluate sentence structure complexity for a given text and grade level
   *
   * @param text - The text to evaluate
   * @param grade - The target grade level (3-12)
   * @returns Evaluation result with complexity score and detailed analysis
   * @throws {ValidationError} If text is empty, too short/long, or grade is invalid
   * @throws {ConfigurationError} If modelOverride specifies a model ID that the provider rejects
   * @throws {APIError} If LLM API calls fail (includes AuthenticationError, RateLimitError, NetworkError, TimeoutError)
   */
  async evaluate(text, grade) {
    // Read the length defensively: a null/undefined `text` must not crash here
    // (outside the try) or in the catch block below. It is handed to
    // validateText instead — presumably that rejects it; confirm in BaseEvaluator.
    const textLength = text?.length ?? 0;
    this.logger.info("Starting sentence structure evaluation", {
      evaluator: "sentence-structure",
      operation: "evaluate",
      grade,
      textLength
    });
    const startTime = Date.now();
    const stageDetails = [];
    // Appends the latency/token record for one completed stage.
    const recordStage = (stage, response) => {
      stageDetails.push({
        stage,
        provider: this.provider.label,
        latency_ms: response.latencyMs,
        token_usage: {
          input_tokens: response.usage.inputTokens,
          output_tokens: response.usage.outputTokens
        }
      });
    };
    // Sums one token counter across every stage recorded so far.
    const sumTokens = (field) => stageDetails.reduce((total, detail) => total + (detail.token_usage?.[field] || 0), 0);
    const totalTokens = () => ({
      input_tokens: sumTokens("input_tokens"),
      output_tokens: sumTokens("output_tokens")
    });
    try {
      this.validateText(text);
      this.validateGrade(grade, new Set(_SentenceStructureEvaluator.metadata.supportedGrades));
      this.logger.debug("Stage 1: Analyzing sentence structure", {
        evaluator: "sentence-structure",
        operation: "sentence_analysis"
      });
      const analysisResponse = await this.analyzeSentenceStructure(text);
      recordStage("sentence_analysis", analysisResponse);
      const features = addEngineeredFeatures(analysisResponse.data);
      this.logger.debug("Stage 2: Classifying complexity", {
        evaluator: "sentence-structure",
        operation: "complexity_classification"
      });
      const complexityResponse = await this.classifyComplexity(features, grade, text);
      recordStage("complexity_classification", complexityResponse);
      const latencyMs = Date.now() - startTime;
      const result = {
        score: complexityResponse.data.answer,
        reasoning: complexityResponse.data.reasoning,
        metadata: {
          model: this.provider.label,
          processingTimeMs: latencyMs
        },
        _internal: {
          sentenceAnalysis: analysisResponse.data,
          features,
          complexity: complexityResponse.data
        }
      };
      // Telemetry is best-effort: a rejected send must never fail the evaluation.
      this.sendTelemetry({
        status: "success",
        latencyMs,
        textLength,
        grade,
        provider: this.provider.label,
        tokenUsage: totalTokens(),
        metadata: {
          stage_details: stageDetails
        },
        inputText: text
      }).catch(() => {
      });
      this.logger.info("Sentence structure evaluation completed successfully", {
        evaluator: "sentence-structure",
        operation: "evaluate",
        grade,
        score: result.score,
        processingTimeMs: latencyMs
      });
      return result;
    } catch (error) {
      const latencyMs = Date.now() - startTime;
      const anyStageCompleted = stageDetails.length > 0;
      this.logger.error("Sentence structure evaluation failed", {
        evaluator: "sentence-structure",
        operation: "evaluate",
        grade,
        error: error instanceof Error ? error : undefined,
        processingTimeMs: latencyMs,
        completedStages: stageDetails.length
      });
      // Report whatever was consumed by the stages that finished before the failure.
      this.sendTelemetry({
        status: "error",
        latencyMs,
        textLength,
        grade,
        provider: this.provider.label,
        tokenUsage: anyStageCompleted ? totalTokens() : undefined,
        errorCode: error instanceof Error ? error.name : "UnknownError",
        metadata: anyStageCompleted ? { stage_details: stageDetails } : undefined,
        inputText: text
      }).catch(() => {
      });
      // Validation failures are surfaced as-is; everything else is wrapped.
      if (error instanceof ValidationError) {
        throw error;
      }
      throw wrapProviderError(error, "Sentence structure evaluation failed");
    }
  }
  /**
   * Stage 1: Analyze sentence grammatical structure
   *
   * Computes readability counts locally, embeds them in the analysis prompt and
   * asks the provider for output matching SentenceAnalysisSchema.
   *
   * @param text - The text to analyze
   * @returns The structured analysis plus the call's token usage and latency
   */
  async analyzeSentenceStructure(text) {
    const metrics = calculateReadabilityMetrics(text);
    const gtCountsStr = [
      `num_sentences: ${metrics.sentenceCount}`,
      `num_words: ${metrics.wordCount}`,
      `num_char: ${metrics.characterCount}`,
      `num_syllable: ${metrics.syllableCount}`,
      `flesch_kincaid_grade: ${metrics.fleschKincaidGrade}`
    ].join("\n");
    const { data, usage, latencyMs } = await this.provider.generateStructured({
      messages: [
        { role: "system", content: getSystemPromptAnalysis() },
        { role: "user", content: getUserPromptAnalysis(text, gtCountsStr) }
      ],
      schema: SentenceAnalysisSchema,
      temperature: 0
    });
    return { data, usage, latencyMs };
  }
  /**
   * Stage 2: Classify sentence structure complexity
   *
   * Uses engineered features and the grade-specific rubric to classify the
   * complexity level, then normalizes the returned label.
   *
   * @param features - Engineered features derived from the stage 1 analysis
   * @param grade - Target grade, used to pick the rubric
   * @param excerpt - The text being evaluated
   * @returns The classification (with a normalized `answer`), token usage and latency
   * @throws {Error} If the returned label cannot be normalized
   */
  async classifyComplexity(features, grade, excerpt) {
    const featuresJSON = featuresToJSON(features, 1, true);
    const response = await this.provider.generateStructured({
      messages: [
        { role: "system", content: getSystemPromptComplexity() },
        { role: "user", content: getUserPromptComplexity(featuresJSON, grade, excerpt) }
      ],
      schema: ComplexityClassificationSchema,
      temperature: 0
    });
    const normalizedAnswer = normalizeLabel(response.data.answer);
    if (!normalizedAnswer) {
      throw new Error(
        `Failed to normalize complexity label. Received unexpected value: "${response.data.answer}". Expected one of: Slightly Complex, Moderately Complex, Very Complex, Exceedingly Complex, Extremely Complex.`
      );
    }
    return {
      data: {
        ...response.data,
        answer: normalizedAnswer
      },
      usage: response.usage,
      latencyMs: response.latencyMs
    };
  }
};
1734
// Grade bands accepted as answers by the grade-level appropriateness evaluator.
var GradeBand = z.enum(["K-1", "2-3", "4-5", "6-8", "9-10", "11-CCR"]);
// Structured output requested from the grade-level appropriateness LLM call.
var GradeLevelAppropriatenessSchema = (() => {
  const reasoning = z.string().describe(
    "Your reasoning for your answer in numbered bullet points for 4 steps with a 4th bullet point for synthesis."
  );
  const grade = GradeBand.describe("The appropriate grade level for the text");
  const alternative_grade = GradeBand.describe("An alternative grade level for the text");
  const scaffolding_needed = z.string().describe(
    "Scaffolding needed for the text to be appropriate for the alternative grade"
  );
  return z.object({ reasoning, grade, alternative_grade, scaffolding_needed });
})();
1743
+
1744
+ // ../../evals/prompts/grade-level-appropriateness/system.txt
1745
// System prompt for the grade-level appropriateness evaluator, one array entry
// per line of the embedded text (empty entries are the blank lines).
var system_default = [
  "",
  "You are an expert in English literature education for K-12.",
  "Your job is to help evaluate the grade level appropriateness of a given text.",
  "",
  "You will be given a text and you should determine which grade level the text is appropriate for (grade levels include: K-1, 2-3, 4-5, 6-8, 9-10, 11-CCR)",
  "",
  "IMPORTANT: You should pay attention to the vocabulary used, topics of the text and readability of text.",
  "",
  "Please first reason out loud about the vocabulary complexity of the text and then provide an answer between grade level options: K-1, 2-3, 4-5, 6-8, 9-10, 11-CCR.",
  "",
  ""
].join("\n");
1746
+
1747
+ // ../../evals/prompts/grade-level-appropriateness/user.txt
1748
+ var user_default = '\nUse these steps to determine appropriate grade level for a text:\n1. Calculate word count and Flesch-Kincaid Grade Level of the text, and generate a grade band.\nHere are the bands guideline for word count\n\n2-3: 200-800 words\n4-5: 200-800 words\n6-8: 400-1000 words\n9-10: 500-1500 words\n11-12: 1501 words and more\n\nHere is the formula for Flesch-Kincaid Grade Level:\nFlesch-Kincaid Grade Level = 0.39 * (total words / total sentences) + 11.8 * (total syllables / total words) - 15.59\n\n\n2. Determine the qualitative complexity using this text complexity rubric:\nTEXT STRUCTURE\n\nExceedingly Complex\n \u2022 Deep, intricate, often ambiguous connections between many ideas/processes/events\n \u2022 Organization is intricate or discipline-specific\n \u2022 Text features are essential for understanding\n \u2022 Graphics are intricate, extensive, and integral to meaning; may convey unique information\n\nVery Complex\n \u2022 Expanded ideas/processes/events with implicit or subtle connections\n \u2022 Organization may have multiple pathways or discipline-specific traits\n \u2022 Text features directly enhance understanding\n \u2022 Graphics support or are integral to understanding\n\nModerately Complex\n \u2022 Some implicit/subtle connections between ideas/events\n \u2022 Organization is evident and generally sequential or chronological\n \u2022 Text features enhance understanding\n \u2022 Graphics are mostly supplementary\n\nSlightly Complex\n \u2022 Explicit and clear connections between ideas/events\n \u2022 Organization is chronological, sequential, or predictable\n \u2022 Text features help navigation but are not essential\n \u2022 Graphics are simple, not necessary, but may assist understanding\n\n\u2E3B\n\nLANGUAGE FEATURES\n\nExceedingly Complex\n \u2022 Dense, abstract, ironic, and/or figurative language\n \u2022 Complex, unfamiliar, archaic, subject-specific, or ambiguous vocabulary\n \u2022 Mainly complex sentences with multiple 
subordinate clauses and transitions\n\nVery Complex\n \u2022 Fairly complex; some abstract, ironic, and/or figurative language\n \u2022 Some unfamiliar, archaic, or overly academic vocabulary\n \u2022 Many complex sentences with subordinate phrases/clauses\n\nModerately Complex\n \u2022 Mostly explicit language with some complex meaning\n \u2022 Mostly familiar and conversational vocabulary\n \u2022 Primarily simple and compound sentences, with some complex ones\n\nSlightly Complex\n \u2022 Explicit, literal, straightforward language\n \u2022 Contemporary, familiar, conversational vocabulary\n \u2022 Mainly simple sentences\n\n\u2E3B\n\nPURPOSE\n\nExceedingly Complex\n \u2022 Subtle, intricate, and difficult to determine\n \u2022 Includes many theoretical or abstract elements\n\nVery Complex\n \u2022 Implicit or subtle, fairly easy to infer\n \u2022 More theoretical or abstract than concrete\n\nModerately Complex\n \u2022 Implied but easy to identify based on context or source\n\nSlightly Complex\n \u2022 Explicitly stated, clear, concrete, and narrowly focused\n\n\u2E3B\n\nKNOWLEDGE DEMANDS\n\nExceedingly Complex\n \u2022 Requires extensive discipline-specific or theoretical knowledge\n \u2022 Many references/allusions to other texts or ideas\n\nVery Complex\n \u2022 Requires moderate discipline-specific knowledge\n \u2022 Some references/allusions to other texts or ideas\n\nModerately Complex\n \u2022 Requires common knowledge and some discipline-specific knowledge\n \u2022 Few references/allusions\n\nSlightly Complex\n \u2022 Requires everyday, practical knowledge\n \u2022 No references/allusions\n\n3. Background knowledge:\nAt which grade level would student have enough background knowledge to understand the text?\n\n4. Use your judgement of the above three steps. 
First use the quantitative signal to get first signal of the appropriate grade level range, then use qualitative analysis to refine your decisions and consider if student at such grade will have enough background knowledge to arrive at a final grade level band. Also consider if the text can be for a lower grade with additional scaffolding.\n\n<begin of text to evaluate>\n<text>{text}</text>\n<end of text to evaluate>\n\nWhen providing your response, first think out loud of your reasoning and then provide your answer from one of the grade band options above. Your reasoning and answer needs to be in JSON format. Strictly follow the following format for your response.\n\nYour final answer should be in the "grade" property for the target grade band for the text aimed for independent reading. If there is alternative appropriate grade students can read and comprehend with scaffold (eg. picture, graph, additional context, etc) or for read-aloud purposes for lower grade, provide it in the "alternative_grade" property and provide the types of scaffolding in the "scaffolding_needed" property.\n\nIn your reasoning, provide numbered bullet points for each of the analyses in each of the 3 steps. At the end, give me the 4th bullet point called "synthesis" to summarize your analysis from the above 3 steps that help you arrive at the final decision.\n\n{format_instructions}\n';
1749
+
1750
+ // src/prompts/grade-level-appropriateness/index.ts
1751
/**
 * Accessor for the grade-level appropriateness system prompt.
 *
 * @returns The embedded `system_default` prompt text, unmodified.
 */
function getSystemPrompt2() {
  const systemPrompt = system_default;
  return systemPrompt;
}
1754
+ function getUserPrompt2(text) {
1755
+ return user_default.replace("{text}", text).replace("{format_instructions}", "");
1756
+ }
1757
+
1758
+ // src/evaluators/grade-level-appropriateness.ts
1759
/**
 * Single-stage evaluator that asks the configured provider which grade band a
 * text is appropriate for, along with an alternative band and scaffolding notes.
 */
var GradeLevelAppropriatenessEvaluator = class extends BaseEvaluator {
  static metadata = {
    id: "grade-level-appropriateness",
    name: "Grade Level Appropriateness",
    description: "Determines appropriate grade level for text with scaffolding recommendations",
    supportedGrades: [],
    // No grade parameter required - evaluates what grade the text is appropriate for
    defaultProviders: ["google" /* Google */]
  };
  provider;
  constructor(config) {
    super(config);
    this.provider = this.createConfiguredProvider(
      "google" /* Google */,
      "gemini-2.5-pro",
      config.googleApiKey
    );
  }
  /**
   * Evaluate grade level appropriateness for a given text
   *
   * @param text - The text to evaluate
   * @returns Evaluation result with grade recommendations and scaffolding suggestions
   * @throws {ValidationError} If text is empty or too short/long
   * @throws {ConfigurationError} If modelOverride specifies a model ID that the provider rejects
   * @throws {APIError} If LLM API calls fail (includes AuthenticationError, RateLimitError, NetworkError, TimeoutError)
   */
  async evaluate(text) {
    // Read the length defensively: a null/undefined `text` must not crash here
    // (outside the try) or in the catch block below. It is handed to
    // validateText instead — presumably that rejects it; confirm in BaseEvaluator.
    const textLength = text?.length ?? 0;
    this.logger.info("Starting grade level appropriateness evaluation", {
      evaluator: "grade-level-appropriateness",
      operation: "evaluate",
      textLength
    });
    const startTime = Date.now();
    try {
      this.validateText(text);
      this.logger.debug("Evaluating grade level appropriateness", {
        evaluator: "grade-level-appropriateness",
        operation: "grade_evaluation"
      });
      const response = await this.provider.generateStructured({
        messages: [
          { role: "system", content: getSystemPrompt2() },
          { role: "user", content: getUserPrompt2(text) }
        ],
        schema: GradeLevelAppropriatenessSchema,
        temperature: 0.25
      });
      const latencyMs = Date.now() - startTime;
      const { grade, reasoning } = response.data;
      const result = {
        score: grade,
        reasoning,
        metadata: {
          model: this.provider.label,
          processingTimeMs: latencyMs
        },
        _internal: response.data
      };
      // Telemetry is best-effort: a rejected send must never fail the evaluation.
      this.sendTelemetry({
        status: "success",
        latencyMs,
        textLength,
        provider: this.provider.label,
        tokenUsage: {
          input_tokens: response.usage.inputTokens,
          output_tokens: response.usage.outputTokens
        },
        // No metadata.stage_details for single-stage evaluator
        inputText: text
      }).catch(() => {
      });
      this.logger.info("Grade level appropriateness evaluation completed successfully", {
        evaluator: "grade-level-appropriateness",
        operation: "evaluate",
        grade: result.score,
        processingTimeMs: latencyMs
      });
      return result;
    } catch (error) {
      const latencyMs = Date.now() - startTime;
      this.logger.error("Grade level appropriateness evaluation failed", {
        evaluator: "grade-level-appropriateness",
        operation: "evaluate",
        error: error instanceof Error ? error : undefined,
        processingTimeMs: latencyMs
      });
      this.sendTelemetry({
        status: "error",
        latencyMs,
        textLength,
        provider: this.provider.label,
        errorCode: error instanceof Error ? error.name : "UnknownError",
        inputText: text
      }).catch(() => {
      });
      // Validation failures are surfaced as-is; everything else is wrapped.
      if (error instanceof ValidationError) {
        throw error;
      }
      throw wrapProviderError(error, "Grade level appropriateness evaluation failed");
    }
  }
};
1863
// Structured output requested from the subject-matter-knowledge LLM call.
var SmkOutputSchema = (() => {
  // Shorthand for a described free-text field.
  const textField = (description) => z.string().describe(description);
  const identified_topics = z.array(z.string()).describe("List of major subjects/concepts found in the text.");
  const curriculum_check = textField("Whether the topics are standard K-8 or specialized high school level.");
  const assumptions_and_scaffolding = textField("What the author assumes the reader knows vs. what is explained.");
  const friction_analysis = textField("Whether difficulty comes from vocabulary/structure or actual knowledge demands.");
  const complexity_score = TextComplexityLevel.describe("The subject matter knowledge complexity level of the text");
  const reasoning = textField("A brief synthesis of why the text fits the chosen complexity level.");
  return z.object({
    identified_topics,
    curriculum_check,
    assumptions_and_scaffolding,
    friction_analysis,
    complexity_score,
    reasoning
  });
})();
1871
+
1872
+ // ../../evals/prompts/subject-matter-knowledge/system.txt
1873
+ var system_default2 = `
1874
+ To perform the task of evaluating text complexity based on Subject Matter Knowledge (SMK), strictly adhere to the following instructions.
1875
+ Role
1876
+ You are an expert K-12 Literacy Pedagogue and Text Complexity Evaluator. Your specific focus is analyzing Subject Matter Knowledge (SMK) demands according to the Common Core Qualitative Text Complexity Rubric.
1877
+ Objective
1878
+ Analyze a provided text relative to a target grade_level. You must determine the extent of background knowledge required to comprehend the text. You must distinguish between Common/Standard knowledge (generally lower/moderate complexity) and Specialized/Theoretical knowledge (generally higher complexity).
1879
+ Input Data
1880
+ text: The passage to analyze.
1881
+ grade_level: The target student grade (integer).
1882
+ fk_score: Flesch-Kincaid Grade Level. Note: Use this only as a loose proxy for sentence structure. Do not let a high FK score artificially inflate the Subject Matter Knowledge score if the concepts remain simple.
1883
+
1884
+ 1. The Rubric: Subject Matter Knowledge (SMK)
1885
+ 1. Slightly Complex
1886
+ Scope: Everyday, practical knowledge, and Introduction to Skills.
1887
+ Concept Type: Concrete, directly observable, and familiar.
1888
+ Key Indicator: "How-to" texts involving familiar objects (e.g., drawing a cupboard, playing a game, family life). Even if specific terms (like "scale" or "measure") are used, if the application is on a common object, it remains Slightly Complex.
1889
+ 2. Moderately Complex
1890
+ Scope: Common Discipline-Specific Knowledge or Narrative History.
1891
+ Definition: Topics widely introduced in K-8 curricula (Basic American History, Geography, Earth Science, Biology).
1892
+ Key Characteristic: The text bridges concrete descriptions with abstract themes (e.g., using farming to discuss justice), OR narrates historical events via sensory details.
1893
+ Spatial Reasoning: Texts requiring mental manipulation of maps/routes are generally Moderate, unless the object is a familiar household item (see Slightly Complex).
1894
+ 3. Very Complex
1895
+ Scope: Specialized Discipline-Specific, Engineering Mechanics, or Political Theory.
1896
+ Definition: Topics characteristic of High School (9-12) curricula requiring abstract mental models.
1897
+ Key Characteristic: Requires understanding mechanisms (how physics works/propulsion), chemical composition, or undefined political stakes (specific treaties, alliances, or secularization without context).
1898
+ 4. Exceedingly Complex
1899
+ Scope: Professional or Academic knowledge.
1900
+
1901
+ 2. The Expert Mental Model (Decision Logic)
1902
+ Use these refined rules to categorize cases.
1903
+ Rule A: The "Layers of Meaning" Check
1904
+ Concrete -> Abstract (Moderate): The text describes concrete things (farming) to argue an abstract point (justice, rights).
1905
+ Concrete -> Concrete (Slightly): The text describes concrete things (lines, paper) to achieve a concrete result (drawing a cupboard). Do not over-rank practical instructions.
1906
+ Rule B: The Science & Engineering Boundary
1907
+ Observational (Moderate): Habitats, Water Cycle, observable traits, simple definitions.
1908
+ Mechanistic/Theoretical (Very): Engineering mechanics (how propulsion works via reaction), Instrumentation (using a spectroscope), or Chemical/Atomic theory.
1909
+ Test: Does the text explain how a machine functions using physical principles? If yes, it is Very Complex.
1910
+ Rule C: The History/Social Studies Boundary
1911
+ General/Narrative (Moderate):
1912
+ Sensory: Battle descriptions focusing on sights/sounds (flashes, smoke).
1913
+ Standard Topics: Immigration, Slavery, Government, Geography. Lists of nationalities or religions are "Common Knowledge" for Grades 6-8.
1914
+ Political/Contextual (Very):
1915
+ Implicit Context: Texts assuming knowledge of specific political factions, treaties, or the causes of events without explanation (e.g., "The Allies," "The Front," "The secularization of the clergy").
1916
+ Test: If the reader must know why two groups are fighting or the specific political history of a revolution to understand the text, it is Very Complex.
1917
+ Rule D: The "Technical vs. Practical" Trap
1918
+ Scenario: A text teaches a technical skill (e.g., Technical Drawing/Technology) but applies it to a familiar object (a cupboard).
1919
+ Decision: Slightly Complex.
1920
+ Reasoning: Do not confuse "Technical Vocabulary" (scale, thick lines) with "Theoretical Complexity." If the underlying concept is familiar (furniture), the SMK load is low.
1921
+
1922
+ 3. Critical Calibration Examples
1923
+ Text: "Make a rough sketch... How many shelves should the cupboard have?" (Grade 2) -> Slightly Complex.
1924
+ Reasoning: (Rule D/Rule A) Although it mentions "scale" and "technology," the task is concrete and relies on everyday knowledge.
1925
+ Text: "Hydraulic propulsion works by sucking water at the bow and forcing it sternward." (Grade 10) -> Very Complex.
1926
+ Reasoning: (Rule B) Explains a mechanism using physics principles.
1927
+ Text: "The Allies fight the enemy's cavalry; we remember the hospitality to priests during the Revolution." (Grade 6) -> Very Complex.
1928
+ Reasoning: (Rule C) Assumes undefined knowledge of WWI alliances and the specific political history of the French Revolution.
1929
+ Text: "Immigrants from Poland, Italy, and Russia arrived. Most were Catholic or Orthodox." (Grade 7) -> Moderately Complex.
1930
+ Reasoning: (Rule C) Standard K-8 topic. Lists of nationalities are content vocabulary, not specialized theory.
1931
+
1932
+ 4. Output Format
1933
+ Return your analysis in a valid JSON object. Do not include markdown formatting.
1934
+ Keys:
1935
+ - identified_topics: List[str] identifying the core subjects.
1936
+ - curriculum_check: String explaining if the topics are "Standard/General" (typical for K-8) or "Specialized/High School" (typical for 9-12).
1937
+ - assumptions_and_scaffolding: String analyzing what the author assumes the reader knows vs what is explained.
1938
+ - friction_analysis: String discussing the gap between Concrete description and Abstract meaning.
1939
+ - complexity_score: String (One of: slightly_complex, moderately_complex, very_complex, exceedingly_complex).
1940
+ - reasoning: String synthesizing the decision.
1941
+
1942
+ `;
1943
+
1944
+ // ../../evals/prompts/subject-matter-knowledge/user.txt
1945
// User prompt template for the subject-matter-knowledge evaluator, one array
// entry per line; {text}, {grade} and {fk_score} are filled by getUserPrompt3.
var user_default2 = [
  "Analyze:",
  "Text: {text}",
  "Grade: {grade}",
  "FK Score: {fk_score}"
].join("\n");
1946
+
1947
+ // src/prompts/subject-matter-knowledge/index.ts
1948
/**
 * Accessor for the subject-matter-knowledge system prompt.
 *
 * @returns The embedded `system_default2` prompt text, unmodified.
 */
function getSystemPrompt3() {
  const systemPrompt = system_default2;
  return systemPrompt;
}
1951
+ function getUserPrompt3(text, grade, fkScore) {
1952
+ return user_default2.replaceAll("{text}", text).replaceAll("{grade}", grade).replaceAll("{fk_score}", fkScore.toString());
1953
+ }
1954
+
1955
+ // src/evaluators/smk.ts
1956
/**
 * Evaluator for Subject Matter Knowledge (SMK): rates the background-knowledge
 * demands of a text for a target grade with a single LLM call that also
 * receives the text's Flesch-Kincaid grade.
 */
var SmkEvaluator = class _SmkEvaluator extends BaseEvaluator {
  static metadata = {
    id: "subject-matter-knowledge",
    name: "Subject Matter Knowledge",
    description: "Evaluates background knowledge demands of educational texts relative to grade level",
    supportedGrades: ["3", "4", "5", "6", "7", "8", "9", "10", "11", "12"],
    defaultProviders: ["google" /* Google */]
  };
  provider;
  constructor(config) {
    super(config);
    this.provider = this.createConfiguredProvider(
      "google" /* Google */,
      "gemini-3-flash-preview",
      config.googleApiKey
    );
  }
  /**
   * Evaluate subject matter knowledge complexity for a given text and grade level
   *
   * @param text - The text to evaluate
   * @param grade - The target grade level (3-12)
   * @returns Evaluation result with complexity score and detailed analysis
   * @throws {ValidationError} If text is empty, too short/long, or grade is invalid
   * @throws {ConfigurationError} If modelOverride specifies a model ID that the provider rejects
   * @throws {APIError} If LLM API calls fail (includes AuthenticationError, RateLimitError, NetworkError, TimeoutError)
   */
  async evaluate(text, grade) {
    // Read the length defensively: a null/undefined `text` must not crash here
    // (outside the try) or in the catch block below. It is handed to
    // validateText instead — presumably that rejects it; confirm in BaseEvaluator.
    const textLength = text?.length ?? 0;
    this.logger.info("Starting SMK evaluation", {
      evaluator: "subject-matter-knowledge",
      operation: "evaluate",
      grade,
      textLength
    });
    const startTime = Date.now();
    const stageDetails = [];
    // Sums one token counter across every stage recorded so far.
    const sumTokens = (field) => stageDetails.reduce((total, detail) => total + (detail.token_usage?.[field] || 0), 0);
    const totalTokens = () => ({
      input_tokens: sumTokens("input_tokens"),
      output_tokens: sumTokens("output_tokens")
    });
    try {
      this.validateText(text);
      this.validateGrade(grade, new Set(_SmkEvaluator.metadata.supportedGrades));
      this.logger.debug("Evaluating subject matter knowledge complexity", {
        evaluator: "subject-matter-knowledge",
        operation: "smk_evaluation"
      });
      const fkScore = calculateFleschKincaidGrade(text);
      const response = await this.evaluateSmk(text, grade, fkScore);
      stageDetails.push({
        stage: "smk_evaluation",
        provider: this.provider.label,
        latency_ms: response.latencyMs,
        token_usage: {
          input_tokens: response.usage.inputTokens,
          output_tokens: response.usage.outputTokens
        }
      });
      const latencyMs = Date.now() - startTime;
      const result = {
        score: response.data.complexity_score,
        reasoning: response.data.reasoning,
        metadata: {
          model: this.provider.label,
          processingTimeMs: latencyMs
        },
        _internal: response.data
      };
      // Telemetry is best-effort: a rejected send must never fail the evaluation.
      this.sendTelemetry({
        status: "success",
        latencyMs,
        textLength,
        grade,
        provider: this.provider.label,
        tokenUsage: totalTokens(),
        metadata: {
          stage_details: stageDetails
        },
        inputText: text
      }).catch(() => {
      });
      this.logger.info("SMK evaluation completed successfully", {
        evaluator: "subject-matter-knowledge",
        operation: "evaluate",
        grade,
        score: result.score,
        processingTimeMs: latencyMs
      });
      return result;
    } catch (error) {
      const latencyMs = Date.now() - startTime;
      const anyStageCompleted = stageDetails.length > 0;
      this.logger.error("SMK evaluation failed", {
        evaluator: "subject-matter-knowledge",
        operation: "evaluate",
        grade,
        error: error instanceof Error ? error : undefined,
        processingTimeMs: latencyMs,
        completedStages: stageDetails.length
      });
      this.sendTelemetry({
        status: "error",
        latencyMs,
        textLength,
        grade,
        provider: this.provider.label,
        tokenUsage: anyStageCompleted ? totalTokens() : undefined,
        errorCode: error instanceof Error ? error.name : "UnknownError",
        metadata: anyStageCompleted ? { stage_details: stageDetails } : undefined,
        inputText: text
      }).catch(() => {
      });
      // Validation failures are surfaced as-is; everything else is wrapped.
      if (error instanceof ValidationError) {
        throw error;
      }
      throw wrapProviderError(error, "SMK evaluation failed");
    }
  }
  /**
   * Run the SMK evaluation LLM call
   *
   * @param text - The text to evaluate
   * @param grade - The target grade level
   * @param fkScore - Flesch-Kincaid grade of the text, passed into the prompt
   * @returns The structured SMK output plus the call's token usage and latency
   */
  async evaluateSmk(text, grade, fkScore) {
    const { data, usage, latencyMs } = await this.provider.generateStructured({
      messages: [
        { role: "system", content: getSystemPrompt3() },
        { role: "user", content: getUserPrompt3(text, grade, fkScore) }
      ],
      schema: SmkOutputSchema,
      temperature: 0
    });
    return { data, usage, latencyMs };
  }
};
2096
// Structured-output schema for the Conventionality LLM call.
// Field order is kept as-is; each field carries the description handed to the model.
var ConventionalityOutputSchema = (() => {
  // Shorthand for a string field with an attached description.
  const describedString = (description) => z.string().describe(description);
  return z.object({
    conventionality_features: z.array(z.string()).describe("The specific language features driving the complexity (e.g., literal narrative, concrete actions, sustained irony, abstract qualities) with direct quotes from the text."),
    grade_context: describedString("How the conventionality demands compare to general expectations for the provided target grade."),
    instructional_insights: describedString("Actionable pedagogical suggestions for scaffolding the conventionality features in the classroom."),
    complexity_score: TextComplexityLevel.describe("The conventionality complexity level of the text"),
    reasoning: describedString("A detailed explanation of the rating, citing specific features in the text and referencing the expert guardrails.")
  });
})();
2103
+
2104
+ // ../../evals/prompts/conventionality/system.txt
2105
+ var system_default3 = `Role
2106
+ You are an expert reading teacher and text complexity evaluator. Your task is to evaluate the "Conventionality" of a text and assign it a complexity level based on a 4-point scale, carefully factoring in the target grade level.
2107
+
2108
+ Objective
2109
+ Measure how explicit, literal, and straightforward the text's meaning is, versus how abstract, ironic, figurative, or archaic it is. Focus on the hiddenness of the meaning, the use of conceptual framing, the reliance on abstract reasoning, and the familiarity of the expression for the target grade.
2110
+
2111
+ Complexity Levels
2112
+ - Slightly Complex: Explicit, literal, straightforward, easy to understand. Meaning is entirely on the surface. The language is concrete, and the meaning is clear and procedural, mostly referring to observable materials and actions. Contains no symbolic or ironic language, and conceptual interpretation is not required. Contains limited figurative language that is common and easy to comprehend at the target grade level.
2113
+ - Moderately Complex: Largely explicit and easy to understand with some occasions for more complex meaning. May contain a noticeable amount of archaic/dated phrasing, formal historical prose, vocabulary demands, background knowledge requirements, or expressions that are less familiar to the target grade level, which might make the text feel vague or slightly challenging.
2114
+ - Very Complex: Fairly complex; contains sustained abstract language, conceptual framing, rhetorical idealization, ironic comparisons, or central metaphors that drive the meaning of the text. Addresses concepts, beliefs, and abstract qualities rather than just concrete objects. The tone or underlying message requires interpretation, even if the surface message is clear.
2115
+ - Exceedingly Complex: Dense and complex; contains considerable abstract, ironic, and/or figurative language. Meaning is heavily hidden, deeply conceptual, or relies heavily on complex rhetorical devices.
2116
+
2117
+ Essential Evaluation Rules
2118
+ 1. Concrete & Procedural Texts: Texts that are highly concrete, clear, and procedural (e.g., describing observable materials, mechanical processes, or physical actions) should typically be rated "Slightly Complex."
2119
+
2120
+ 2. Grade-Level Anchoring and Vague Narratives: Always consider the target grade. A literal historical narrative that might be straightforward for older students can be "Moderately Complex" for younger students (e.g., 4th graders) if it involves less familiar expressions, older contexts (e.g., wagon loads, traveling by horseback), vocabulary demands, and background knowledge requirements that make the text feel vague or slightly demanding for that age group.
2121
+
2122
+ 3. Rhetorical Idealization and Abstract Qualities: If an entire argument or narrative is built around abstract qualities (e.g., national character, bravery, liberty) and uses repeated figurative language or personification to portray a subject in a certain idealized way, rate the text as "Very Complex." Even if the figurative language is easy to interpret, the need to interpret the rhetorical tone and sustained abstract focus elevates the complexity beyond level two.
2123
+
2124
+ 4. Common Idioms and Grade-Level Appropriateness: Do NOT elevate a text to "Moderately Complex" simply because it contains a few common idiomatic expressions. If these expressions are widely known and easy for the target grade to understand without making the text feel vague, the text remains "Slightly Complex."
2125
+
2126
+ 5. Conversational and Hypothetical Framing: Using a second-person conversational hook (e.g., "Imagine you are...") to explain a concept is a standard, literal device for engaging readers. It does not constitute complex conceptual framing.
2127
+
2128
+ 6. Sustained vs. Occasional Impact: If abstract language, figurative phrasing, irony, or conceptual framing is sustained throughout the text and central to the argument/meaning, the text is Very Complex. Reserve Moderately Complex for texts where the explicit meaning dominates but the expression, vocabulary, or archaic language provides a moderate conventionality challenge.
2129
+
2130
+ 7. Central Metaphors and Conceptual Framing: When an author uses a central metaphor to explain a concept or uses figurative phrasing to explain how things "work," this abstract reasoning drives the meaning, elevating the text to Very Complex.
2131
+
2132
+ 8. Irony and Abstract Comparisons: Texts that rely on sustained irony, especially through comparative arguments, are inherently Very Complex for younger students.
2133
+
2134
+ 9. Isolate Conventionality from Vocabulary: Do not inflate the Conventionality score just because the text uses archaic, dated, or highly academic vocabulary.
2135
+
2136
+ Input Format
2137
+ You will receive:
2138
+ - text: The passage to evaluate.
2139
+ - grade_level: The target student grade level.
2140
+ - fk_score: The Flesch-Kincaid readability score.
2141
+
2142
+ Output Format
2143
+ Provide a JSON object containing ONLY the following keys:
2144
+ - complexity_score: (String) One of the 4 scale levels exactly as formatted: 'slightly_complex', 'moderately_complex', 'very_complex', or 'exceedingly_complex'.
2145
+ - reasoning: (String) A detailed explanation of the rating, citing specific features in the text and referencing the expert guardrails (e.g., noting if the text relies on abstract qualities/rhetorical idealization, if vocabulary/background knowledge demands make a literal text vague for the grade level, or if it is strictly concrete/procedural).
2146
+ - conventionality_features: (List of Strings) The specific language features driving the complexity (e.g., literal narrative, concrete actions, less familiar expressions, sustained irony, abstract qualities, rhetorical idealization, archaic phrasing) with direct quotes from the text.
2147
+ - grade_context: (String) How the conventionality demands compare to general expectations for the provided target grade.
2148
+ - instructional_insights: (String) Actionable pedagogical suggestions for scaffolding the conventionality features in the classroom.`;
2149
+
2150
+ // ../../evals/prompts/conventionality/user.txt
2151
+ var user_default3 = "Analyze:\nText: {text}\nGrade: {grade}\nFK Score: {fk_score}";
2152
+
2153
+ // src/prompts/conventionality/index.ts
2154
/**
 * Return the Conventionality system prompt.
 *
 * The prompt has no placeholders, so the bundled template is returned as-is.
 */
function getSystemPrompt4() {
  const systemPrompt = system_default3;
  return systemPrompt;
}
2157
/**
 * Build the user prompt for the Conventionality evaluation.
 *
 * The `{text}`, `{grade}` and `{fk_score}` placeholders of the user template
 * are filled in a single pass through a replacer function, so that
 *  - `$` sequences inside the inserted values (`$$`, `$&`, `$1`, ...) are kept
 *    literally instead of being expanded as replacement patterns, and
 *  - placeholder-looking text inside an inserted value (for example a passage
 *    that itself contains "{grade}") is never substituted a second time.
 *
 * @param text - The passage to evaluate
 * @param grade - The target grade level
 * @param fkScore - The Flesch-Kincaid score; inserted via `toString()`
 * @returns The user prompt with every placeholder filled in
 */
function getUserPrompt4(text, grade, fkScore) {
  const values = {
    "{text}": text,
    "{grade}": grade,
    "{fk_score}": fkScore.toString()
  };
  // A function replacer returns its value verbatim (no `$` pattern expansion).
  return user_default3.replace(/\{(?:text|grade|fk_score)\}/g, (placeholder) => values[placeholder]);
}
2160
+
2161
+ // src/evaluators/conventionality.ts
2162
/**
 * Evaluator for the Conventionality dimension of text complexity.
 *
 * Performs one structured LLM call through the configured Google provider and
 * reports the outcome (success or failure) via logging and telemetry.
 */
var ConventionalityEvaluator = class _ConventionalityEvaluator extends BaseEvaluator {
  static metadata = {
    id: "conventionality",
    name: "Conventionality",
    description: "Evaluates how explicit, literal, and straightforward a text's meaning is relative to grade level",
    supportedGrades: ["3", "4", "5", "6", "7", "8", "9", "10", "11", "12"],
    defaultProviders: ["google" /* Google */]
  };
  provider;
  constructor(config) {
    super(config);
    const providerKind = "google" /* Google */;
    const defaultModel = "gemini-3-flash-preview";
    this.provider = this.createConfiguredProvider(providerKind, defaultModel, config.googleApiKey);
  }
  /**
   * Evaluate conventionality complexity for a given text and grade level
   *
   * @param text - The text to evaluate
   * @param grade - The target grade level (3-12)
   * @returns Evaluation result with complexity score and detailed analysis
   * @throws {ValidationError} If text is empty, too short/long, or grade is invalid
   * @throws {ConfigurationError} If modelOverride specifies a model ID that the provider rejects
   * @throws {APIError} If LLM API calls fail (includes AuthenticationError, RateLimitError, NetworkError, TimeoutError)
   */
  async evaluate(text, grade) {
    const evaluatorId = "conventionality";
    const stages = [];
    // Sum one token counter across all recorded stages; a missing counter counts as 0.
    const sumTokens = (field) => stages.reduce((acc, stage) => acc + (stage.token_usage?.[field] || 0), 0);
    this.logger.info("Starting Conventionality evaluation", {
      evaluator: evaluatorId,
      operation: "evaluate",
      grade,
      textLength: text.length
    });
    const startedAt = Date.now();
    try {
      this.validateText(text);
      const allowedGrades = new Set(_ConventionalityEvaluator.metadata.supportedGrades);
      this.validateGrade(grade, allowedGrades);
      this.logger.debug("Evaluating conventionality complexity", {
        evaluator: evaluatorId,
        operation: "conventionality_evaluation"
      });
      const fkScore = calculateFleschKincaidGrade(text);
      const llmResponse = await this.evaluateConventionality(text, grade, fkScore);
      const stageRecord = {
        stage: "conventionality_evaluation",
        provider: this.provider.label,
        latency_ms: llmResponse.latencyMs,
        token_usage: {
          input_tokens: llmResponse.usage.inputTokens,
          output_tokens: llmResponse.usage.outputTokens
        }
      };
      stages.push(stageRecord);
      const latencyMs = Date.now() - startedAt;
      const totalTokenUsage = {
        input_tokens: sumTokens("input_tokens"),
        output_tokens: sumTokens("output_tokens")
      };
      const result = {
        score: llmResponse.data.complexity_score,
        reasoning: llmResponse.data.reasoning,
        metadata: {
          model: this.provider.label,
          processingTimeMs: latencyMs
        },
        _internal: llmResponse.data
      };
      // Fire-and-forget: a telemetry failure must not affect the evaluation result.
      this.sendTelemetry({
        status: "success",
        latencyMs,
        textLength: text.length,
        grade,
        provider: this.provider.label,
        tokenUsage: totalTokenUsage,
        metadata: { stage_details: stages },
        inputText: text
      }).catch(() => {
      });
      this.logger.info("Conventionality evaluation completed successfully", {
        evaluator: evaluatorId,
        operation: "evaluate",
        grade,
        score: result.score,
        processingTimeMs: latencyMs
      });
      return result;
    } catch (error) {
      const latencyMs = Date.now() - startedAt;
      const isError = error instanceof Error;
      this.logger.error("Conventionality evaluation failed", {
        evaluator: evaluatorId,
        operation: "evaluate",
        grade,
        error: isError ? error : void 0,
        processingTimeMs: latencyMs,
        completedStages: stages.length
      });
      // Token usage and stage details are only reported when at least one stage completed.
      const hasStages = stages.length > 0;
      let totalTokenUsage = void 0;
      if (hasStages) {
        totalTokenUsage = {
          input_tokens: sumTokens("input_tokens"),
          output_tokens: sumTokens("output_tokens")
        };
      }
      this.sendTelemetry({
        status: "error",
        latencyMs,
        textLength: text.length,
        grade,
        provider: this.provider.label,
        tokenUsage: totalTokenUsage,
        errorCode: isError ? error.name : "UnknownError",
        metadata: hasStages ? { stage_details: stages } : void 0,
        inputText: text
      }).catch(() => {
      });
      // Validation errors are rethrown unchanged; everything else is wrapped.
      if (!(error instanceof ValidationError)) {
        throw wrapProviderError(error, "Conventionality evaluation failed");
      }
      throw error;
    }
  }
  /**
   * Run the Conventionality evaluation LLM call
   */
  async evaluateConventionality(text, grade, fkScore) {
    const messages = [
      { role: "system", content: getSystemPrompt4() },
      { role: "user", content: getUserPrompt4(text, grade, fkScore) }
    ];
    const llmResult = await this.provider.generateStructured({
      messages,
      schema: ConventionalityOutputSchema,
      temperature: 0
    });
    const { data, usage, latencyMs } = llmResult;
    return { data, usage, latencyMs };
  }
};
2302
+ var PurposeOutputSchema = z.object({ "complexity_score": z.enum(["slightly_complex", "moderately_complex", "very_complex", "exceedingly_complex", "more_context_needed"]).describe("The Purpose complexity level for the target grade."), "reasoning": z.string().describe("A high-level summary of why the text is at this complexity level for the target grade."), "details": z.object({ "detailed_summary": z.array(z.object({ "factor": z.string().describe("The specific text complexity factor identified."), "description": z.string().describe("How this factor manifests in the text."), "effect_on_complexity_dimension": z.string().describe("How this factor affects the reader's ability to understand the text's specific complexity dimension.") }).strict()).describe("Individual complexity factors with descriptions and their effects."), "adjustment_and_scaffolding": z.array(z.object({ "scaffolding_need": z.string().describe("The complexity factor that requires scaffolding."), "suggestion": z.string().describe("A specific instructional strategy to support students with this factor.") }).strict()).describe("Scaffolding strategies to make the text accessible at the target grade."), "recommended_use_cases": z.array(z.object({ "opportunity": z.string().describe("An instructional opportunity related to the text."), "suggestion": z.string().describe("A specific way to leverage this text for that instructional purpose.") }).strict()).describe("Additional instructional opportunities for using this text.") }).strict().describe("Practical instructional details including scaffolding strategies and recommended use cases.") }).strict();
2303
+
2304
+ // ../../evals/prompts/purpose/system.txt
2305
+ var system_default4 = '\n Role\n You are an expert reading assessment evaluator. Your task is to determine the Text Complexity of a given passage based exclusively on the Purpose dimension of the qualitative measures rubric.\n\n Task Details\n You will be provided with an informational or literary `text`, along with its `grade_level` and `fk_score` (Flesch-Kincaid). You must analyze the text and determine how difficult it is for a reader to identify the author\'s purpose. \n\n Crucially, you must distinguish between the text\'s *topic* (what it is about) and its *purpose* (why the author wrote it). \n\n Rubric: Purpose Complexity\n Exceedingly Complex: Subtle and intricate, difficult to determine; includes many theoretical or abstract elements.\n Very Complex: Implicit or subtle but fairly easy to infer; more theoretical or abstract than concrete.\n Moderately Complex: Implied but easy to identify based upon context or source.\n Slightly Complex: Explicitly stated, clear, concrete, narrowly focused.\n More Context Needed: The text is a fragment or lacks necessary introductory context, making the true purpose impossible to determine accurately without external background knowledge.\n\n Expert Rules for Evaluating Purpose\n Based on expert consensus and historical grading corrections, you must apply the following heuristics:\n\n 1. The "Slightly Complex" Benchmark (Straightforward and Explicit)\n A text is Slightly Complex if its purpose is explicitly stated or if its informative intent is straightforward, clear, concrete, and directly answers what the text is immediately about. If the text opens by clearly identifying a concrete topic (e.g., "Pins are made of either brass or iron wire") and rigidly follows through by explaining factual, practical information or a process (like manufacturing steps or geographic facts), the purpose is considered explicit and straightforward. It does *not* require a literal statement like "The purpose of this text is to..." 
as long as the delivery of information is direct, clear, and unadorned by persuasive elements or complex framing.\n\n 2. Moderately Complex via Guiding Questions & Inquiry Formats\n If a text begins with a general introduction and uses guiding questions (e.g., "Have you ever wondered how clouds are formed?") to transition into an explanation, the purpose is implied rather than explicitly stated upfront. Because the reader must recognize the question as the pivot point for the author\'s intent, it is Moderately Complex.\n\n 3. Moderately Complex via Multiple Distinct Informational Goals\n If a text covers a broad topic but jumps between several distinct scientific or informational objectives without an overarching framing device or explicit thesis (e.g., talking about measuring ice sheets, then mapping, then finding meteorites), the reader must synthesize these diverse facts to recognize the broader purpose, making it Moderately Complex.\n\n 4. Moderately Complex via Arguments Disguised as Information\n If an author is arguing a specific point, correcting a misconception, or defending a stance, but the text could initially be mistaken by students as purely informative factual text, it is Moderately Complex. The reader must infer the persuasive intent or argumentative purpose beneath the informative tone.\n\n 5. "More Context Needed" for Fragments\n If a text is a fragment missing a crucial introduction or context, and identifying the author\'s purpose beyond a simple surface-level description would be exceptionally difficult for a reader in the target grade level without that external background, score it as `more_context_needed`. 
\n\n Output Format\n Provide your evaluation in the following structure:\n reasoning:\n - Surface Analysis: Identify if the text clearly identifies its topic and delivers straightforward facts, or if it utilizes structural cues, titles, or direct thesis statements.\n - Subtlety & Framing: Is the informative purpose straightforward and concrete? Does it use guiding questions? Is it an argument disguised as pure information? Are there multiple distinct informational goals requiring synthesis?\n - Context Check: Is this text a fragment missing crucial context that obscures the deeper purpose for the target grade level?\n - Rubric Alignment: Explain how the text aligns with the specific language of the rubric, explicitly referencing the expert rules above. Justify why it isn\'t one level simpler or more complex.\n\n answer:\n - complexity_score: (slightly_complex, moderately_complex, very_complex, exceedingly_complex, more_context_needed)\n - reasoning: A brief summary of your final decision.\n - details: Structured breakdown of PurposeDetails including detailed_summary, adjustment_and_scaffolding, and recommended_use_cases.\n';
2306
+
2307
+ // ../../evals/prompts/purpose/user.txt
2308
+ var user_default4 = "Analyze:\nText: {text}\nGrade: {grade_level}\nFK Score: {fk_score}";
2309
+
2310
+ // ../../evals/prompts/purpose/config.json
2311
+ var config_default = {
2312
+ evaluator: {
2313
+ id: "literacy.gla.purpose",
2314
+ name: "Purpose Dimension Text Complexity Evaluator",
2315
+ description: "Evaluates the Purpose dimension of qualitative text complexity for K-12 reading assessment, producing a 5-level rubric rating with structured pedagogical detail."
2316
+ },
2317
+ preprocessing: [
2318
+ {
2319
+ id: "fk_score",
2320
+ kind: "flesch_kincaid_grade",
2321
+ description: "Compute the Flesch-Kincaid Grade Level for the input text and bind it to {fk_score} in the prompt.",
2322
+ input: "text",
2323
+ output: "fk_score",
2324
+ implementation: {
2325
+ python: {
2326
+ library: "textstat",
2327
+ function: "flesch_kincaid_grade",
2328
+ post_transform: {
2329
+ type: "round",
2330
+ precision: 2
2331
+ }
2332
+ },
2333
+ typescript: {
2334
+ library: "text-readability",
2335
+ function: "fleschKincaidGrade",
2336
+ post_transform: {
2337
+ type: "round",
2338
+ precision: 2
2339
+ }
2340
+ }
2341
+ }
2342
+ }
2343
+ ],
2344
+ steps: [
2345
+ {
2346
+ id: "evaluate_purpose",
2347
+ description: "Single-call LLM step that produces the EvaluatorOutput JSON.",
2348
+ prompt: {
2349
+ type: "chat",
2350
+ messages: [
2351
+ {
2352
+ role: "system",
2353
+ source_path: "system.txt",
2354
+ sha256: "745b95b7d54dc845b99363c9d3360355381883c22a5f6a0f305d7349cae38a54"
2355
+ },
2356
+ {
2357
+ role: "user",
2358
+ source_path: "user.txt",
2359
+ sha256: "cd8e6347db1a55d104e34436f8f66e833bd6583645d4786a554aaefdd26479b2"
2360
+ }
2361
+ ],
2362
+ placeholders: {
2363
+ text: {
2364
+ required: true,
2365
+ source: "input"
2366
+ },
2367
+ grade_level: {
2368
+ required: true,
2369
+ source: "input"
2370
+ },
2371
+ fk_score: {
2372
+ required: true,
2373
+ source: "preprocessing.fk_score"
2374
+ }
2375
+ }
2376
+ },
2377
+ model: {
2378
+ provider: "google",
2379
+ name: "gemini-3-flash-preview"
2380
+ },
2381
+ generation: {
2382
+ temperature: 0
2383
+ },
2384
+ parser: {
2385
+ kind: "structured_output"
2386
+ },
2387
+ output_binding: "formatted_output"
2388
+ }
2389
+ ]};
2390
+
2391
+ // src/prompts/purpose/index.ts
2392
// The LLM step id is "evaluate_" followed by the last dot-separated segment of the evaluator id.
var STEP_ID = "evaluate_" + config_default.evaluator.id.split(".").pop();
var _step = config_default.steps.find((candidate) => candidate.id === STEP_ID);
if (!_step) {
  // Fail at module load when the config does not define the expected step.
  throw new Error('Step "' + STEP_ID + '" not found in purpose config.json');
}
// Names of the placeholders the step's prompt declares.
var PLACEHOLDER_KEYS = Object.keys(_step.prompt.placeholders);
2396
/**
 * Fill `{key}` placeholders in a prompt template.
 *
 * Only keys that are listed in PLACEHOLDER_KEYS and present in `inputs` are
 * substituted; every other `{...}` sequence is left untouched.
 *
 * All substitutions happen in one pass through a replacer function, so that
 *  - `$` sequences inside a value (`$$`, `$&`, `$1`, ...) are inserted
 *    literally instead of being expanded as replacement patterns, and
 *  - placeholder-looking text inside an inserted value (for example a passage
 *    containing "{grade_level}") is not substituted again by a later key.
 *
 * @param template - Prompt template containing `{key}` placeholders
 * @param inputs - Mapping from placeholder key to the value to insert
 * @returns The template with the applicable placeholders replaced
 */
function applyPlaceholders(template, inputs) {
  const activeKeys = PLACEHOLDER_KEYS.filter((key) => key in inputs);
  if (activeKeys.length === 0) {
    return template;
  }
  // Keys are matched literally, so escape any regex metacharacters they contain.
  const escapeForRegExp = (value) => value.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const pattern = new RegExp(`\\{(?:${activeKeys.map(escapeForRegExp).join("|")})\\}`, "g");
  // Strip the surrounding braces from the match to recover the key.
  return template.replace(pattern, (match) => inputs[match.slice(1, -1)]);
}
2402
/**
 * Render the Purpose system prompt by applying the placeholder inputs
 * to the bundled system template.
 */
function getSystemPrompt5(inputs) {
  const renderedSystemPrompt = applyPlaceholders(system_default4, inputs);
  return renderedSystemPrompt;
}
2405
/**
 * Render the Purpose user prompt by applying the placeholder inputs
 * to the bundled user template.
 */
function getUserPrompt5(inputs) {
  const renderedUserPrompt = applyPlaceholders(user_default4, inputs);
  return renderedUserPrompt;
}
2408
+
2409
+ // ../../evals/prompts/purpose/input_schema.json
2410
+ var input_schema_default = {
2411
+ properties: {
2412
+ grade_level: {
2413
+ minimum: 3,
2414
+ maximum: 12}
2415
+ }
2416
+ };
2417
+
2418
+ // src/evaluators/purpose.ts
2419
// The LLM step id is "evaluate_" followed by the last dot-separated segment of the evaluator id.
var STEP_ID2 = "evaluate_" + config_default.evaluator.id.split(".").pop();
var _step2 = config_default.steps.find((candidate) => candidate.id === STEP_ID2);
if (!_step2) {
  // Fail at module load when the config does not define the expected step.
  throw new Error('Step "' + STEP_ID2 + '" not found in purpose config.json');
}
var STEP = _step2;
// Inclusive grade bounds come from the input schema's grade_level constraints.
var GRADE_MIN = input_schema_default.properties.grade_level.minimum;
var GRADE_MAX = input_schema_default.properties.grade_level.maximum;
// Every supported grade as a string, from GRADE_MIN through GRADE_MAX.
var SUPPORTED_GRADES = [];
for (let gradeValue = GRADE_MIN; gradeValue <= GRADE_MAX; gradeValue += 1) {
  SUPPORTED_GRADES.push(String(gradeValue));
}
// Maps the model's complexity_score value to the label returned as `score`.
var COMPLEXITY_SCORE_DISPLAY = {
  slightly_complex: "Slightly complex",
  moderately_complex: "Moderately complex",
  very_complex: "Very complex",
  exceedingly_complex: "Exceedingly complex",
  more_context_needed: "More context needed"
};
2433
/**
 * Evaluator for the Purpose dimension of text complexity.
 *
 * Metadata, model name and temperature are read from the bundled purpose
 * config. Each evaluation performs one structured LLM call and reports the
 * outcome (success or failure) via logging and telemetry.
 */
var PurposeEvaluator = class _PurposeEvaluator extends BaseEvaluator {
  static metadata = {
    id: config_default.evaluator.id,
    name: config_default.evaluator.name,
    description: config_default.evaluator.description,
    supportedGrades: SUPPORTED_GRADES,
    defaultProviders: ["google" /* Google */]
  };
  static TEMPERATURE = STEP.generation.temperature;
  /**
   * Compute the Flesch-Kincaid score by running the `fk_score` preprocessing
   * step from the purpose config with its TypeScript implementation settings.
   *
   * @param text - The text to score
   * @throws {Error} If the config has no `fk_score` preprocessing step
   */
  static computeFkScore(text) {
    const fkStep = config_default.preprocessing.find((p) => p.id === "fk_score");
    if (!fkStep) throw new Error("fk_score preprocessing step not found in purpose config.json");
    return runPreprocessingStep(text, fkStep.implementation.typescript);
  }
  provider;
  constructor(config) {
    super(config);
    this.provider = this.createConfiguredProvider(
      "google" /* Google */,
      STEP.model.name,
      config.googleApiKey
    );
  }
  /**
   * Evaluate purpose complexity for a given text and grade level
   *
   * @param text - The text to evaluate
   * @param grade - The target grade level (3-12); a string or a number
   * @returns Evaluation result with complexity score and detailed analysis
   * @throws {ValidationError} If text is empty, too short/long, or grade is invalid
   * @throws {ConfigurationError} If modelOverride specifies a model ID that the provider rejects
   * @throws {APIError} If LLM API calls fail (includes AuthenticationError, RateLimitError, NetworkError, TimeoutError)
   */
  async evaluate(text, grade) {
    this.logger.info("Starting Purpose evaluation", {
      evaluator: _PurposeEvaluator.metadata.id,
      operation: "evaluate",
      grade,
      textLength: text.length
    });
    const startTime = Date.now();
    const stageDetails = [];
    try {
      this.validateText(text);
      const gradeNum = this.parseAndValidateGrade(grade);
      const fkScore = _PurposeEvaluator.computeFkScore(text);
      const inputs = {
        text,
        grade_level: String(gradeNum),
        fk_score: String(fkScore)
      };
      const response = await this.callLLM(inputs);
      const latencyMs = Date.now() - startTime;
      const tokenUsage = {
        input_tokens: response.usage.inputTokens,
        output_tokens: response.usage.outputTokens
      };
      stageDetails.push({
        stage: STEP.id,
        provider: this.provider.label,
        latency_ms: response.latencyMs,
        token_usage: tokenUsage
      });
      const result = {
        // The model returns a snake_case level; callers receive the display label.
        score: COMPLEXITY_SCORE_DISPLAY[response.data.complexity_score],
        reasoning: response.data.reasoning,
        metadata: {
          model: this.provider.label,
          processingTimeMs: latencyMs
        },
        _internal: response.data
      };
      // Fire-and-forget: a telemetry failure must not affect the evaluation result.
      this.sendTelemetry({
        status: "success",
        latencyMs,
        textLength: text.length,
        grade: String(gradeNum),
        provider: this.provider.label,
        tokenUsage,
        metadata: { stage_details: stageDetails },
        inputText: text
      }).catch(() => void 0);
      this.logger.info("Purpose evaluation completed successfully", {
        evaluator: _PurposeEvaluator.metadata.id,
        operation: "evaluate",
        grade: gradeNum,
        score: result.score,
        processingTimeMs: latencyMs
      });
      return result;
    } catch (error) {
      const latencyMs = Date.now() - startTime;
      this.logger.error("Purpose evaluation failed", {
        evaluator: _PurposeEvaluator.metadata.id,
        operation: "evaluate",
        grade,
        error: error instanceof Error ? error : void 0,
        processingTimeMs: latencyMs,
        completedStages: stageDetails.length
      });
      // Token usage and stage details are only reported when a stage completed.
      const tokenUsage = stageDetails.length > 0 ? {
        input_tokens: stageDetails.reduce((s, d) => s + (d.token_usage?.input_tokens ?? 0), 0),
        output_tokens: stageDetails.reduce((s, d) => s + (d.token_usage?.output_tokens ?? 0), 0)
      } : void 0;
      this.sendTelemetry({
        status: "error",
        latencyMs,
        textLength: text.length,
        grade: String(grade),
        provider: this.provider.label,
        tokenUsage,
        errorCode: error instanceof Error ? error.name : "UnknownError",
        metadata: stageDetails.length > 0 ? { stage_details: stageDetails } : void 0,
        inputText: text
      }).catch(() => void 0);
      // Validation errors are rethrown unchanged; everything else is wrapped.
      if (error instanceof ValidationError) throw error;
      throw wrapProviderError(error, "Purpose evaluation failed");
    }
  }
  /**
   * Convert the caller-supplied grade to a number and check that it is an
   * integer within [GRADE_MIN, GRADE_MAX].
   *
   * Accepts both strings and numbers. Any other value (null, undefined, ...)
   * is rejected with a ValidationError instead of failing with a TypeError
   * on `.trim()`, which would otherwise be reported as a provider failure.
   *
   * @param grade - The grade as passed to `evaluate`
   * @returns The grade as an integer
   * @throws {ValidationError} If the grade is not an integer in the supported range
   */
  parseAndValidateGrade(grade) {
    let num;
    if (typeof grade === "number") {
      num = grade;
    } else {
      const trimmed = String(grade).trim();
      // Number("") is 0; treat a blank grade as invalid regardless of GRADE_MIN.
      num = trimmed === "" ? NaN : Number(trimmed);
    }
    if (!Number.isInteger(num) || num < GRADE_MIN || num > GRADE_MAX) {
      throw new ValidationError(
        `Invalid grade "${grade}". Purpose evaluator supports integer grades ${GRADE_MIN}\u2013${GRADE_MAX}.`
      );
    }
    return num;
  }
  /**
   * Run the Purpose evaluation LLM call with the rendered system and user prompts.
   *
   * @param inputs - Placeholder values (`text`, `grade_level`, `fk_score`) for the prompts
   * @returns The structured data, token usage and latency reported by the provider
   */
  async callLLM(inputs) {
    const response = await this.provider.generateStructured({
      messages: [
        { role: "system", content: getSystemPrompt5(inputs) },
        { role: "user", content: getUserPrompt5(inputs) }
      ],
      schema: PurposeOutputSchema,
      temperature: _PurposeEvaluator.TEMPERATURE
    });
    return { data: response.data, usage: response.usage, latencyMs: response.latencyMs };
  }
};
2572
+
2573
+ // src/batch/evaluator.ts
2574
// Lookup from evaluator id (each class's static metadata.id) to the evaluator class.
var EVALUATOR_MAP = /* @__PURE__ */ new Map(
  [
    GradeLevelAppropriatenessEvaluator,
    SmkEvaluator,
    VocabularyEvaluator,
    SentenceStructureEvaluator,
    ConventionalityEvaluator,
    PurposeEvaluator
  ].map((EvaluatorClass) => [EvaluatorClass.metadata.id, EvaluatorClass])
);
2582
// Predefined evaluator bundles that can be run together as one batch.
var EVALUATOR_GROUPS = [
  {
    id: "text-complexity",
    name: "Text Complexity Analysis",
    description: "Evaluates all dimensions of the Qualitative Text Complexity rubric",
    // Ids are read from each class's static metadata, in the listed order.
    evaluatorIds: [
      GradeLevelAppropriatenessEvaluator,
      SmkEvaluator,
      VocabularyEvaluator,
      SentenceStructureEvaluator,
      ConventionalityEvaluator,
      PurposeEvaluator
    ].map((EvaluatorClass) => EvaluatorClass.metadata.id),
    requiresGoogleKey: true,
    requiresOpenAIKey: true,
    maxInputRows: 50
  }
];
2600
/**
 * List the available evaluator groups.
 *
 * Returns a new array on each call, so callers can add or remove entries
 * without affecting the module-level list. The group objects themselves are
 * shared, not copied.
 */
function getAvailableGroups() {
  const groupsCopy = EVALUATOR_GROUPS.slice();
  return groupsCopy;
}
2603
+ var BatchEvaluator = class {
2604
+ config;
2605
+ limit;
2606
+ evaluatorInstances = /* @__PURE__ */ new Map();
2607
+ isCancelled = false;
2608
+ completedResults = [];
2609
+ constructor(config) {
2610
+ this.config = {
2611
+ concurrency: 3,
2612
+ maxRetries: 2,
2613
+ telemetry: false,
2614
+ ...config
2615
+ };
2616
+ this.limit = pLimit(this.config.concurrency);
2617
+ }
2618
+ /**
2619
+ * Cancel ongoing evaluation.
2620
+ * Returns partial results collected so far.
2621
+ */
2622
+ cancel() {
2623
+ this.isCancelled = true;
2624
+ return [...this.completedResults];
2625
+ }
2626
+ /**
2627
+ * Initialize evaluator instances for the given IDs
2628
+ */
2629
+ initializeEvaluators(evaluatorIds) {
2630
+ for (const id of evaluatorIds) {
2631
+ if (this.evaluatorInstances.has(id)) continue;
2632
+ const EvaluatorClass = EVALUATOR_MAP.get(id);
2633
+ if (!EvaluatorClass) {
2634
+ throw new Error(`Unknown evaluator: ${id}`);
2635
+ }
2636
+ const evaluator = new EvaluatorClass({
2637
+ googleApiKey: this.config.googleApiKey,
2638
+ openaiApiKey: this.config.openaiApiKey,
2639
+ maxRetries: this.config.maxRetries,
2640
+ telemetry: this.config.telemetry
2641
+ });
2642
+ this.evaluatorInstances.set(id, evaluator);
2643
+ }
2644
+ }
2645
+ /**
2646
+ * Create tasks from inputs and evaluator IDs
2647
+ */
2648
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
2649
+ createTasks(inputs, evaluatorIds) {
2650
+ const tasks = [];
2651
+ for (const input of inputs) {
2652
+ for (const evaluatorId of evaluatorIds) {
2653
+ tasks.push({
2654
+ text: input.text,
2655
+ grade: input.grade,
2656
+ evaluatorId,
2657
+ rowIndex: input.rowIndex,
2658
+ originalRow: input.originalRow
2659
+ });
2660
+ }
2661
+ }
2662
+ return tasks;
2663
+ }
2664
+ /**
2665
+ * Execute a single evaluation task
2666
+ */
2667
+ async executeTask(task, onProgress) {
2668
+ if (this.isCancelled) {
2669
+ const batchResult = {
2670
+ rowIndex: task.rowIndex,
2671
+ text: task.text,
2672
+ grade: task.grade,
2673
+ evaluatorId: task.evaluatorId,
2674
+ status: "error",
2675
+ error: "Cancelled by user",
2676
+ processingTimeMs: 0,
2677
+ originalRow: task.originalRow
2678
+ };
2679
+ return batchResult;
2680
+ }
2681
+ const startTime = Date.now();
2682
+ const evaluator = this.evaluatorInstances.get(task.evaluatorId);
2683
+ if (!evaluator) {
2684
+ const batchResult = {
2685
+ rowIndex: task.rowIndex,
2686
+ text: task.text,
2687
+ grade: task.grade,
2688
+ evaluatorId: task.evaluatorId,
2689
+ status: "error",
2690
+ error: `Evaluator not initialized: ${task.evaluatorId}`,
2691
+ processingTimeMs: 0,
2692
+ originalRow: task.originalRow
2693
+ };
2694
+ this.completedResults.push(batchResult);
2695
+ if (onProgress) onProgress(batchResult);
2696
+ return batchResult;
2697
+ }
2698
+ try {
2699
+ const result = await evaluator.evaluate(task.text, task.grade);
2700
+ const batchResult = {
2701
+ rowIndex: task.rowIndex,
2702
+ text: task.text,
2703
+ grade: task.grade,
2704
+ evaluatorId: task.evaluatorId,
2705
+ status: "success",
2706
+ score: result.score,
2707
+ reasoning: result.reasoning,
2708
+ processingTimeMs: Date.now() - startTime,
2709
+ originalRow: task.originalRow
2710
+ };
2711
+ this.completedResults.push(batchResult);
2712
+ if (onProgress) onProgress(batchResult);
2713
+ return batchResult;
2714
+ } catch (error) {
2715
+ const batchResult = {
2716
+ rowIndex: task.rowIndex,
2717
+ text: task.text,
2718
+ grade: task.grade,
2719
+ evaluatorId: task.evaluatorId,
2720
+ status: "error",
2721
+ error: error instanceof Error ? error.message : String(error),
2722
+ processingTimeMs: Date.now() - startTime,
2723
+ originalRow: task.originalRow
2724
+ };
2725
+ this.completedResults.push(batchResult);
2726
+ if (onProgress) onProgress(batchResult);
2727
+ return batchResult;
2728
+ }
2729
+ }
2730
+ /**
2731
+ * Calculate summary statistics
2732
+ */
2733
+ calculateSummary(results, durationMs) {
2734
+ const summary = {
2735
+ totalTasks: results.length,
2736
+ successful: results.filter((r) => r.status === "success").length,
2737
+ failed: results.filter((r) => r.status === "error").length,
2738
+ durationMs,
2739
+ resultsPerEvaluator: {}
2740
+ };
2741
+ const evaluatorIds = Array.from(new Set(results.map((r) => r.evaluatorId)));
2742
+ for (const id of evaluatorIds) {
2743
+ const evalResults = results.filter((r) => r.evaluatorId === id);
2744
+ summary.resultsPerEvaluator[id] = {
2745
+ successful: evalResults.filter((r) => r.status === "success").length,
2746
+ failed: evalResults.filter((r) => r.status === "error").length
2747
+ };
2748
+ }
2749
+ return summary;
2750
+ }
2751
+ /**
2752
+ * Run batch evaluation for an evaluator group.
2753
+ *
2754
+ * @param inputs - Array of input rows
2755
+ * @param groupId - The evaluator group to run (see getAvailableGroups())
2756
+ * @param onProgress - Optional callback invoked after each task completes
2757
+ * @returns Batch evaluation results and summary
2758
+ */
2759
+ async evaluate(inputs, groupId, onProgress) {
2760
+ const startTime = Date.now();
2761
+ const group = EVALUATOR_GROUPS.find((g) => g.id === groupId);
2762
+ if (!group) {
2763
+ throw new Error(
2764
+ `Unknown evaluator group: "${groupId}". Available: ${EVALUATOR_GROUPS.map((g) => g.id).join(", ")}`
2765
+ );
2766
+ }
2767
+ if (inputs.length > group.maxInputRows) {
2768
+ throw new Error(
2769
+ `Input exceeds limit for "${group.id}": ${inputs.length} rows (max ${group.maxInputRows}). Split into smaller batches.`
2770
+ );
2771
+ }
2772
+ this.isCancelled = false;
2773
+ this.completedResults = [];
2774
+ this.initializeEvaluators(group.evaluatorIds);
2775
+ const tasks = this.createTasks(inputs, group.evaluatorIds);
2776
+ const settledResults = await Promise.allSettled(
2777
+ tasks.map((task) => this.limit(() => this.executeTask(task, onProgress)))
2778
+ );
2779
+ const results = settledResults.filter((r) => r.status === "fulfilled").map((r) => r.value);
2780
+ const durationMs = Date.now() - startTime;
2781
+ const summary = this.calculateSummary(results, durationMs);
2782
+ return { results, summary };
2783
+ }
2784
+ };
2785
+ function findColumn(row, columnName) {
2786
+ const normalizedTarget = columnName.toLowerCase().trim();
2787
+ for (const key of Object.keys(row)) {
2788
+ if (key.toLowerCase().trim() === normalizedTarget) {
2789
+ return key;
2790
+ }
2791
+ }
2792
+ return void 0;
2793
+ }
2794
+ function parseCSV(csvPath) {
2795
+ if (!fs2.existsSync(csvPath)) {
2796
+ throw new Error(`CSV file not found: ${csvPath}`);
2797
+ }
2798
+ const records = parse(fs2.readFileSync(csvPath, "utf-8"), {
2799
+ columns: true,
2800
+ skip_empty_lines: true,
2801
+ trim: true
2802
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
2803
+ });
2804
+ if (records.length === 0) {
2805
+ throw new Error("CSV file is empty");
2806
+ }
2807
+ const firstRow = records[0];
2808
+ const textColumn = findColumn(firstRow, "text");
2809
+ const gradeColumn = findColumn(firstRow, "grade");
2810
+ if (!textColumn) {
2811
+ throw new Error('CSV must have a "text" column (case-insensitive)');
2812
+ }
2813
+ if (!gradeColumn) {
2814
+ throw new Error('CSV must have a "grade" column (case-insensitive)');
2815
+ }
2816
+ const inputs = [];
2817
+ for (let i = 0; i < records.length; i++) {
2818
+ const row = records[i];
2819
+ const text = row[textColumn];
2820
+ const grade = row[gradeColumn];
2821
+ if (!text || !grade) {
2822
+ console.warn(`Warning: skipping row ${i + 2} \u2014 missing text or grade`);
2823
+ continue;
2824
+ }
2825
+ inputs.push({
2826
+ text: String(text).trim(),
2827
+ grade: String(grade).trim(),
2828
+ rowIndex: i + 2,
2829
+ // 1-based, offset by 1 for the header row
2830
+ originalRow: row
2831
+ });
2832
+ }
2833
+ return inputs;
2834
+ }
2835
+
2836
+ // src/batch/report-template.html
2837
+ var report_template_default = `<!DOCTYPE html>
2838
+ <html lang="en">
2839
+ <head>
2840
+ <meta charset="UTF-8">
2841
+ <meta name="viewport" content="width=device-width, initial-scale=1">
2842
+ <title>Evaluation Report</title>
2843
+ <style>
2844
+ :root {
2845
+ --primary: #242423;
2846
+ --secondary: #6B6A64;
2847
+ --informational: #125B3A;
2848
+ --informational-bg: #E0F5EC;
2849
+ --border: #e2e8f0;
2850
+ --neutral-bg: #f1f5f9;
2851
+ --neutral-muted: #64748b;
2852
+ --on-band: #177A4D;
2853
+ --within-reach: #B79F15;
2854
+ --off-target: #C4352D;
2855
+ --card-bg: rgba(255,255,255,0.6);
2856
+ --card-shadow: 0 4px 10px 0 rgba(36,36,35,0.1);
2857
+ --radius: 8px;
2858
+ }
2859
+
2860
+ * { box-sizing: border-box; margin: 0; padding: 0; }
2861
+
2862
+ body {
2863
+ font-family: system-ui, -apple-system, sans-serif;
2864
+ background: #F4F6F8;
2865
+ color: var(--primary);
2866
+ font-size: 14px;
2867
+ line-height: 1.5;
2868
+ }
2869
+
2870
+ .container { max-width: 1200px; margin: 0 auto; padding: 32px 40px; }
2871
+
2872
+ /* \u2500\u2500 Header \u2500\u2500 */
2873
+ header.report-header { margin-bottom: 24px; }
2874
+ header.report-header h1 { font-size: 20px; font-weight: 600; margin: 0 0 4px 0; color: var(--primary); }
2875
+ header.report-header .subtitle { font-size: 14px; color: var(--secondary); }
2876
+
2877
+ /* \u2500\u2500 Cards \u2500\u2500 */
2878
+ .card { background: var(--card-bg); border-radius: var(--radius); box-shadow: var(--card-shadow); margin-bottom: 20px; overflow: hidden; }
2879
+ .card-body { padding: 16px 20px; }
2880
+ .card-label { font-size: 11px; font-weight: 600; text-transform: uppercase; letter-spacing: 0.06em; color: var(--secondary); margin-bottom: 16px; }
2881
+ .card-label-sm { font-size: 11px; text-transform: uppercase; letter-spacing: 0.05em; color: var(--secondary); }
2882
+
2883
+ /* \u2500\u2500 Tabs \u2500\u2500 */
2884
+ .tabs { display: flex; border-bottom: 1px solid var(--border); }
2885
+ .tab-btn { padding: 12px 20px; font-size: 14px; font-weight: 500; background: none; border: none; border-bottom: 2px solid transparent; margin-bottom: -1px; cursor: pointer; color: var(--secondary); transition: color 0.15s, border-color 0.15s; }
2886
+ .tab-btn:hover { color: var(--primary); }
2887
+ .tab-btn.active { color: var(--primary); border-bottom-color: var(--on-band); }
2888
+ .tab-panel { display: none; }
2889
+ .tab-panel.active { display: block; }
2890
+ .tab-content { padding: 20px 28px; }
2891
+
2892
+ /* \u2500\u2500 Layout grids \u2500\u2500 */
2893
+ .grid-2 { display: grid; grid-template-columns: 1fr 1fr; gap: 20px; margin-bottom: 20px; }
2894
+ .grid-3 { display: grid; grid-template-columns: repeat(3, 1fr); gap: 16px; margin-bottom: 20px; }
2895
+ .grid-4 { display: grid; grid-template-columns: repeat(2, 1fr); gap: 20px 32px; }
2896
+ .flex-col { display: flex; flex-direction: column; gap: 4px; }
2897
+
2898
+ /* \u2500\u2500 Tags \u2500\u2500 */
2899
+ .tag { display: inline-block; padding: 4px 10px; border-radius: 6px; font-size: 12px; font-weight: 600; }
2900
+ .tag-on-band { background: var(--on-band); color: #fff; }
2901
+ .tag-within-reach { background: var(--within-reach); color: #fff; }
2902
+ .tag-off-target { background: var(--off-target); color: #fff; }
2903
+ .tag-informational{ background: var(--neutral-bg); color: var(--neutral-muted); }
2904
+
2905
+ /* \u2500\u2500 GLA stat cards \u2500\u2500 */
2906
+ .gla-card .top-stripe { height: 4px; }
2907
+ .gla-card.on-band .top-stripe { background: var(--on-band); }
2908
+ .gla-card.within-reach .top-stripe { background: var(--within-reach); }
2909
+ .gla-card.off-target .top-stripe { background: var(--off-target); }
2910
+ .gla-card.on-band .card-label-sm { color: var(--on-band); }
2911
+ .gla-card.within-reach .card-label-sm { color: var(--within-reach); }
2912
+ .gla-card.off-target .card-label-sm { color: var(--off-target); }
2913
+ .gla-card .big-num { font-size: 28px; font-weight: 700; margin: 6px 0; }
2914
+ .gla-card.on-band .big-num { color: var(--on-band); }
2915
+ .gla-card.within-reach .big-num { color: var(--within-reach); }
2916
+ .gla-card.off-target .big-num { color: var(--off-target); }
2917
+ .gla-card .desc { font-size: 13px; color: var(--secondary); }
2918
+
2919
+ /* \u2500\u2500 Complexity dimension summary bars \u2500\u2500 */
2920
+ .cx-dim-track { height: 10px; background: var(--border); border-radius: 99px; overflow: hidden; }
2921
+ .cx-dim-fill { height: 100%; border-radius: 99px; background: var(--neutral-muted); transition: width 0.4s ease; }
2922
+
2923
+ /* \u2500\u2500 Insights \u2500\u2500 */
2924
+ .number-icon { display: inline-flex; align-items: center; justify-content: center; width: 28px; height: 28px; border-radius: 50%; background: var(--primary); color: #fff; font-size: 14px; font-weight: 600; flex-shrink: 0; }
2925
+ .insight-row { display: flex; align-items: flex-start; gap: 12px; margin-bottom: 10px; }
2926
+ .insight-text { flex: 1; font-size: 14px; color: var(--primary); }
2927
+ .disclaimer { font-size: 13px; color: var(--secondary); margin-top: 12px; }
2928
+
2929
+ /* \u2500\u2500 Grade level distribution (CSS bars) \u2500\u2500 */
2930
+ .dist-chart { margin-top: 8px; }
2931
+ .dist-row { display: flex; align-items: center; margin-bottom: 10px; }
2932
+ .dist-row .band { width: 60px; font-size: 12px; font-weight: 600; color: var(--primary); flex-shrink: 0; }
2933
+ .dist-row .bars { flex: 1; height: 24px; display: flex; }
2934
+ .dist-row .bar-on { background: var(--on-band); }
2935
+ .dist-row .bar-wr { background: var(--within-reach); }
2936
+ .dist-row .bar-off { background: var(--off-target); }
2937
+ .dist-row .bars > .bar-seg:first-child { border-radius: 4px 0 0 4px; }
2938
+ .dist-row .bars > .bar-seg:last-child { border-radius: 0 4px 4px 4px; }
2939
+ .dist-row .bars > .bar-seg.bar-off:last-child { border-radius: 0 4px 4px 0; }
2940
+ .dist-row .bars > .bar-seg.bar-off:only-child { border-radius: 4px; }
2941
+ .dist-row .bars > .bar-seg:only-child { border-radius: 4px; }
2942
+ .dist-legend { display: flex; justify-content: center; gap: 24px; margin-top: 16px; font-size: 12px; color: var(--primary); }
2943
+ .dist-legend span { display: inline-flex; align-items: center; gap: 6px; }
2944
+ .dist-legend .dot { width: 10px; height: 10px; border-radius: 2px; flex-shrink: 0; }
2945
+
2946
+ /* \u2500\u2500 Bar segment tooltips \u2500\u2500 */
2947
+ .bar-seg { position: relative; cursor: default; }
2948
+ .bar-seg::after {
2949
+ content: attr(data-tip);
2950
+ position: absolute;
2951
+ bottom: calc(100% + 6px);
2952
+ left: 50%;
2953
+ transform: translateX(-50%);
2954
+ background: var(--primary);
2955
+ color: #fff;
2956
+ padding: 5px 9px;
2957
+ border-radius: 5px;
2958
+ font-size: 12px;
2959
+ white-space: nowrap;
2960
+ pointer-events: none;
2961
+ opacity: 0;
2962
+ transition: opacity 0.12s;
2963
+ z-index: 20;
2964
+ }
2965
+ .bar-seg::before {
2966
+ content: '';
2967
+ position: absolute;
2968
+ bottom: calc(100% + 1px);
2969
+ left: 50%;
2970
+ transform: translateX(-50%);
2971
+ border: 5px solid transparent;
2972
+ border-top-color: var(--primary);
2973
+ opacity: 0;
2974
+ transition: opacity 0.12s;
2975
+ pointer-events: none;
2976
+ z-index: 20;
2977
+ }
2978
+ .bar-seg:hover::after,
2979
+ .bar-seg:hover::before { opacity: 1; }
2980
+
2981
+ /* \u2500\u2500 Heatmap \u2500\u2500 */
2982
+ .heatmap-table { width: 100%; border-collapse: collapse; font-size: 13px; }
2983
+ .heatmap-table th { padding: 10px 16px; text-align: left; font-size: 11px; font-weight: 700; text-transform: uppercase; letter-spacing: 0.06em; color: var(--neutral-muted); border-bottom: 2px solid var(--border); }
2984
+ .heatmap-table td { padding: 10px 16px; border-bottom: 1px solid var(--border); }
2985
+ .heatmap-table td:first-child { font-weight: 600; color: var(--primary); }
2986
+ .heatmap-table .cell-num { text-align: center; }
2987
+ .heatmap-table tr:last-child td { border-bottom: none; }
2988
+ .heatmap-cell { display: inline-block; padding: 5px 12px; border-radius: 6px; font-size: 13px; font-weight: 600; background: var(--neutral-bg); color: var(--neutral-muted); }
2989
+
2990
+ /* \u2500\u2500 Full results table \u2500\u2500 */
2991
+ .results-scroll { overflow-x: auto; overflow-y: auto; max-height: 600px; }
2992
+ table.data-table { width: 100%; border-collapse: collapse; font-size: 13px; min-width: 100%; white-space: nowrap; }
2993
+ table.data-table th {
2994
+ padding: 10px 16px; text-align: left; font-size: 12px; font-weight: 600;
2995
+ color: var(--primary); background: var(--neutral-bg);
2996
+ border-bottom: 2px solid var(--border);
2997
+ position: sticky; top: 0; z-index: 3; white-space: nowrap;
2998
+ }
2999
+ table.data-table td {
3000
+ padding: 10px 16px; border-bottom: 1px solid var(--border);
3001
+ vertical-align: top; white-space: normal;
3002
+ color: var(--primary);
3003
+ }
3004
+ table.data-table tr:last-child td { border-bottom: none; }
3005
+ table.data-table tbody tr:hover td { background: rgba(0,0,0,0.02); }
3006
+ table.data-table tbody tr:hover td.frozen { background: var(--neutral-bg); }
3007
+ /* Frozen (sticky-left) columns */
3008
+ table.data-table th.frozen,
3009
+ table.data-table td.frozen { position: sticky; background: var(--neutral-bg); z-index: 2; }
3010
+ table.data-table td.frozen { background: #fff; }
3011
+ table.data-table th.frozen { z-index: 4; }
3012
+ table.data-table th.frozen-last,
3013
+ table.data-table td.frozen-last { border-right: 2px solid var(--border); }
3014
+ table.data-table th.group-start,
3015
+ table.data-table td.group-start { border-left: 2px solid var(--border); }
3016
+ /* Text cell \u2014 3-line clamp, full text on hover */
3017
+ table.data-table .cell-text { cursor: help; }
3018
+ table.data-table .cell-text-inner {
3019
+ display: -webkit-box;
3020
+ -webkit-line-clamp: 3;
3021
+ -webkit-box-orient: vertical;
3022
+ overflow: hidden;
3023
+ white-space: normal;
3024
+ }
3025
+ /* Reasoning cells \u2014 2-line clamp, full text on hover */
3026
+ table.data-table .cell-reasoning { font-size: 12px; color: var(--secondary); cursor: help; }
3027
+ table.data-table .cell-reasoning-inner {
3028
+ display: -webkit-box;
3029
+ -webkit-line-clamp: 2;
3030
+ -webkit-box-orient: vertical;
3031
+ overflow: hidden;
3032
+ white-space: normal;
3033
+ }
3034
+
3035
+ /* \u2500\u2500 Empty / no-data \u2500\u2500 */
3036
+ .no-data { padding: 40px; text-align: center; color: var(--secondary); font-size: 15px; }
3037
+ </style>
3038
+ </head>
3039
+ <body>
3040
+
3041
+ <div class="container">
3042
+ <header class="report-header" id="app-header"></header>
3043
+ <div class="card" style="margin-bottom:0; overflow:visible;">
3044
+ <div class="tabs" id="tab-bar"></div>
3045
+ <div id="tab-summary" class="tab-panel active"></div>
3046
+ <div id="tab-results" class="tab-panel"></div>
3047
+ </div>
3048
+ </div>
3049
+
3050
+ <script>
3051
+ // ---------------------------------------------------------------------------
3052
+ // MOCK DATA \u2014 used when the template is opened directly in a browser so
3053
+ // designers can see a realistic preview without running the CLI.
3054
+ // This constant is never referenced when real data has been injected.
3055
+ // ---------------------------------------------------------------------------
3056
+ const MOCK_REPORT_DATA = {
3057
+ meta: {
3058
+ reportId: 'sample_content_batch_20260301T1430',
3059
+ generatedAt: 'Mar 1, 2026 2:30 PM',
3060
+ csvPath: '/Users/designer/Documents/interventionhelper_content_batch_03-01.csv',
3061
+ evaluatorIds: ['grade-level-appropriateness', 'subject-matter-knowledge', 'vocabulary', 'sentence-structure', 'conventionality'],
3062
+ evaluatorNames: ['Grade Level Appropriateness', 'Subject Matter Knowledge', 'Vocabulary', 'Sentence Structure', 'Conventionality'],
3063
+ totalRows: 300,
3064
+ processedRows: 287,
3065
+ erroredRows: 13,
3066
+ },
3067
+ gradeLevelStats: {
3068
+ onBand: 172, adjacent: 85, offTarget: 30,
3069
+ onBandPct: 60, adjacentPct: 30, offTargetPct: 10,
3070
+ hasData: true,
3071
+ },
3072
+ complexityStats: [
3073
+ {
3074
+ evaluatorId: 'subject-matter-knowledge', name: 'Subject Matter Knowledge',
3075
+ average: 2.6, label: 'Moderately complex',
3076
+ distribution: [28, 98, 110, 51],
3077
+ },
3078
+ {
3079
+ evaluatorId: 'vocabulary', name: 'Vocabulary',
3080
+ average: 2.4, label: 'Moderately complex',
3081
+ distribution: [45, 120, 95, 27],
3082
+ },
3083
+ {
3084
+ evaluatorId: 'sentence-structure', name: 'Sentence Structure',
3085
+ average: 1.9, label: 'Slightly complex',
3086
+ distribution: [88, 105, 72, 22],
3087
+ },
3088
+ {
3089
+ evaluatorId: 'conventionality', name: 'Conventionality',
3090
+ average: 1.7, label: 'Slightly complex',
3091
+ distribution: [112, 118, 45, 12],
3092
+ },
3093
+ ],
3094
+ gradeBandDistribution: {
3095
+ bands: ['K-1', '2-3', '4-5', '6-8', '9-10', '11-CCR'],
3096
+ data: [
3097
+ { onBand: 0, adjacent: 0, offTarget: 0, total: 0 },
3098
+ { onBand: 32, adjacent: 18, offTarget: 5, total: 55 },
3099
+ { onBand: 58, adjacent: 22, offTarget: 8, total: 88 },
3100
+ { onBand: 48, adjacent: 25, offTarget: 10, total: 83 },
3101
+ { onBand: 22, adjacent: 14, offTarget: 5, total: 41 },
3102
+ { onBand: 12, adjacent: 6, offTarget: 2, total: 20 },
3103
+ ],
3104
+ },
3105
+ complexityHeatmap: {
3106
+ bands: ['K-1', '2-3', '4-5', '6-8', '9-10', '11-CCR'],
3107
+ evaluators: ['Subject Matter Knowledge', 'Vocabulary', 'Sentence Structure', 'Conventionality'],
3108
+ evaluatorIds: ['subject-matter-knowledge', 'vocabulary', 'sentence-structure', 'conventionality'],
3109
+ values: [
3110
+ [null, null, null, null],
3111
+ [1.8, 1.6, 1.4, 1.3 ],
3112
+ [2.3, 2.1, 1.8, 1.7 ],
3113
+ [2.7, 2.5, 2.2, 2.0 ],
3114
+ [3.1, 2.9, 2.6, 2.4 ],
3115
+ [3.4, 3.2, 2.8, 2.6 ],
3116
+ ],
3117
+ },
3118
+ insights: [
3119
+ 'Review texts marked as Off Target \u2014 they may need content revision or grade-level adjustment before distribution.',
3120
+ 'Texts evaluated as Adjacent may benefit from light scaffolding strategies such as vocabulary pre-teaching.',
3121
+ 'Higher grade bands tend to show greater text complexity. Consider whether complexity aligns with instructional goals.',
3122
+ ],
3123
+ fullResults: {
3124
+ originalColumns: ['row_id', 'text', 'grade', 'source'],
3125
+ hasGLA: true,
3126
+ complexityEvaluators: [
3127
+ { evaluatorId: 'subject-matter-knowledge', name: 'Subject Matter Knowledge', prefix: 'subject_matter_knowledge' },
3128
+ { evaluatorId: 'vocabulary', name: 'Vocabulary', prefix: 'vocabulary' },
3129
+ { evaluatorId: 'sentence-structure', name: 'Sentence Structure', prefix: 'sentence_structure' },
3130
+ { evaluatorId: 'conventionality', name: 'Conventionality', prefix: 'conventionality' },
3131
+ ],
3132
+ rows: [
3133
+ {
3134
+ row_id: '1', grade: '5', source: 'science_unit_3',
3135
+ text: 'The water cycle describes how water evaporates from surfaces, rises into the atmosphere, cools and condenses into clouds, and falls back to the ground as precipitation.',
3136
+ __gla_status: 'On Band', __gla_band: '4-5',
3137
+ __gla_reasoning: 'Uses grade-appropriate science vocabulary with a clear explanatory structure suitable for grades 4\u20135.',
3138
+ __subject_matter_knowledge_score: 'Moderately complex',
3139
+ __subject_matter_knowledge_reasoning: 'Requires familiarity with basic Earth science concepts; grade 5 students may need prior exposure to the water cycle.',
3140
+ __vocabulary_score: 'Moderately complex',
3141
+ __vocabulary_reasoning: 'Contains domain-specific terms (evaporates, condenses, precipitation) that require pre-teaching for grade 5 students.',
3142
+ __sentence_structure_score: 'Slightly complex',
3143
+ __sentence_structure_reasoning: 'Primarily compound sentences with clear connective structure appropriate for grade 5.',
3144
+ __conventionality_score: 'Slightly complex',
3145
+ __conventionality_reasoning: 'Language is largely literal and explicit; no figurative or idiomatic usage that would increase comprehension demand.',
3146
+ },
3147
+ {
3148
+ row_id: '2', grade: '6', source: 'science_unit_1',
3149
+ text: 'Photosynthesis is the process by which green plants use sunlight, water and carbon dioxide to produce food and oxygen.',
3150
+ __gla_status: 'Adjacent', __gla_band: '4-5',
3151
+ __gla_reasoning: 'Content is accessible but slightly below typical grade 6 complexity expectations.',
3152
+ __subject_matter_knowledge_score: 'Slightly complex',
3153
+ __subject_matter_knowledge_reasoning: 'Core concept of photosynthesis is introduced in upper elementary science; low prior knowledge demand for grade 6.',
3154
+ __vocabulary_score: 'Moderately complex',
3155
+ __vocabulary_reasoning: 'Key scientific terms are present but relatively straightforward for grade 6 readers.',
3156
+ __sentence_structure_score: 'Slightly complex',
3157
+ __sentence_structure_reasoning: 'Single main clause with a relative clause; well within grade 6 reading ability.',
3158
+ __conventionality_score: 'Slightly complex',
3159
+ __conventionality_reasoning: 'Entirely literal and direct; meaning is fully transparent with no figurative language.',
3160
+ },
3161
+ {
3162
+ row_id: '3', grade: '8', source: 'biology_unit_2',
3163
+ text: 'The mitochondria, often described as the powerhouse of the cell, are organelles found in the cytoplasm of eukaryotic cells, where they generate most of the adenosine triphosphate used for cellular energy.',
3164
+ __gla_status: 'Off Target', __gla_band: '11-CCR',
3165
+ __gla_reasoning: 'Text uses advanced biochemical terminology (adenosine triphosphate, eukaryotic) more appropriate for upper secondary or college-level readers.',
3166
+ __subject_matter_knowledge_score: 'Very complex',
3167
+ __subject_matter_knowledge_reasoning: 'Assumes familiarity with cell biology, organelle function, and biochemical energy systems well beyond typical grade 8 expectations.',
3168
+ __vocabulary_score: 'Exceedingly complex',
3169
+ __vocabulary_reasoning: 'High density of Tier 3 domain-specific words significantly exceeds typical grade 8 vocabulary expectations.',
3170
+ __sentence_structure_score: 'Very complex',
3171
+ __sentence_structure_reasoning: 'Long, embedded clauses with multiple modifying phrases create significant syntactic complexity for grade 8.',
3172
+ __conventionality_score: 'Moderately complex',
3173
+ __conventionality_reasoning: '"Powerhouse of the cell" is a well-known metaphor but requires understanding of the underlying analogy; otherwise literal.',
3174
+ },
3175
+ {
3176
+ row_id: '4', grade: '3', source: 'science_unit_3',
3177
+ text: 'Rain falls from clouds when tiny water droplets join together and become heavy enough to fall to the ground.',
3178
+ __gla_status: 'On Band', __gla_band: '2-3',
3179
+ __gla_reasoning: 'Simple vocabulary and sentence structure are appropriate for grades 2\u20133.',
3180
+ __subject_matter_knowledge_score: 'Slightly complex',
3181
+ __subject_matter_knowledge_reasoning: 'Everyday weather phenomenon; no specialized prior knowledge required for grade 3 students.',
3182
+ __vocabulary_score: 'Slightly complex',
3183
+ __vocabulary_reasoning: 'Common everyday vocabulary with no domain-specific terms requiring pre-teaching.',
3184
+ __sentence_structure_score: 'Slightly complex',
3185
+ __sentence_structure_reasoning: 'Short simple sentences with basic connective structure.',
3186
+ __conventionality_score: 'Slightly complex',
3187
+ __conventionality_reasoning: 'Entirely conventional and literal; straightforward causal explanation with no figurative language.',
3188
+ },
3189
+ {
3190
+ row_id: '5', grade: '9', source: 'biology_unit_1',
3191
+ text: 'Ecosystems are communities of organisms that interact with each other and their physical environment, shaped by both biotic and abiotic factors that influence population dynamics over time.',
3192
+ __gla_status: 'On Band', __gla_band: '9-10',
3193
+ __gla_reasoning: 'Appropriate complexity and terminology for a grade 9\u201310 biology curriculum.',
3194
+ __subject_matter_knowledge_score: 'Very complex',
3195
+ __subject_matter_knowledge_reasoning: 'Requires understanding of ecological concepts (biotic/abiotic factors, population dynamics) typical of secondary-level biology coursework.',
3196
+ __vocabulary_score: 'Very complex',
3197
+ __vocabulary_reasoning: 'Multiple Tier 3 terms (biotic, abiotic, population dynamics) require strong background knowledge.',
3198
+ __sentence_structure_score: 'Moderately complex',
3199
+ __sentence_structure_reasoning: 'Compound-complex sentence with a relative clause; manageable for grade 9 readers.',
3200
+ __conventionality_score: 'Slightly complex',
3201
+ __conventionality_reasoning: 'Technical but literal throughout; no irony or figurative usage that would obscure meaning.',
3202
+ },
3203
+ {
3204
+ row_id: '6', grade: '4', source: 'social_studies_unit_2',
3205
+ text: 'Ancient Egyptians built pyramids as tombs for their pharaohs and used a picture-based writing system called hieroglyphics.',
3206
+ __gla_status: 'On Band', __gla_band: '4-5',
3207
+ __gla_reasoning: 'Vocabulary and sentence length are well-matched to grade 4\u20135 social studies content.',
3208
+ __subject_matter_knowledge_score: 'Moderately complex',
3209
+ __subject_matter_knowledge_reasoning: 'Ancient Egypt is a common grade 4 social studies topic; some prior exposure to civilizations is expected.',
3210
+ __vocabulary_score: 'Moderately complex',
3211
+ __vocabulary_reasoning: 'Domain-specific proper nouns (pharaohs, hieroglyphics) may need brief glossing.',
3212
+ __sentence_structure_score: 'Slightly complex',
3213
+ __sentence_structure_reasoning: 'Two coordinated independent clauses; clear and accessible structure.',
3214
+ __conventionality_score: 'Slightly complex',
3215
+ __conventionality_reasoning: 'Descriptive and informational; meaning is explicit and no non-literal language is used.',
3216
+ },
3217
+ {
3218
+ row_id: '7', grade: '11', source: 'lit_unit_4',
3219
+ text: 'Shakespeare\\'s use of dramatic irony in Othello functions as a mechanism of tragic inevitability, positioning the audience as unwilling witnesses to the protagonist\\'s epistemological collapse.',
3220
+ __gla_status: 'On Band', __gla_band: '11-CCR',
3221
+ __gla_reasoning: 'Sophisticated literary analysis vocabulary and complex syntax are well-suited to grades 11\u2013CCR.',
3222
+ __subject_matter_knowledge_score: 'Exceedingly complex',
3223
+ __subject_matter_knowledge_reasoning: 'Requires familiarity with Shakespearean drama, literary theory, and epistemological concepts; assumes high prior exposure to canonical literature.',
3224
+ __vocabulary_score: 'Exceedingly complex',
3225
+ __vocabulary_reasoning: 'Tier 3 literary and philosophical vocabulary (epistemological, dramatic irony, tragic inevitability) demands high reading proficiency.',
3226
+ __sentence_structure_score: 'Exceedingly complex',
3227
+ __sentence_structure_reasoning: 'Noun phrase and participial phrase stacking creates a dense, highly embedded syntactic structure.',
3228
+ __conventionality_score: 'Very complex',
3229
+ __conventionality_reasoning: 'Analytical prose employs abstract and figurative constructs; "epistemological collapse" and "mechanism of tragic inevitability" are non-literal formulations requiring interpretive inference.',
3230
+ },
3231
+ {
3232
+ row_id: '8', grade: '7', source: 'history_unit_1',
3233
+ text: 'The Industrial Revolution transformed European societies by shifting labor from farms to factories, driving rapid urban growth and fundamentally changing how goods were produced and traded.',
3234
+ __gla_status: 'Adjacent', __gla_band: '9-10',
3235
+ __gla_reasoning: 'Vocabulary and conceptual density exceed typical grade 7 expectations; better suited for grades 9\u201310.',
3236
+ __subject_matter_knowledge_score: 'Moderately complex',
3237
+ __subject_matter_knowledge_reasoning: 'Industrial Revolution is introduced in middle school; grade 7 students likely have foundational context, though economic concepts add demand.',
3238
+ __vocabulary_score: 'Very complex',
3239
+ __vocabulary_reasoning: 'Abstract economic and historical vocabulary (urban growth, fundamentally) adds significant reading demand.',
3240
+ __sentence_structure_score: 'Moderately complex',
3241
+ __sentence_structure_reasoning: 'Participial phrases and coordinated verb phrases add structural complexity but remain readable.',
3242
+ __conventionality_score: 'Slightly complex',
3243
+ __conventionality_reasoning: 'Primarily literal and informational; "transformed" is used in its conventional sense with no figurative layering.',
3244
+ },
3245
+ ],
3246
+ },
3247
+ };
3248
+
3249
+ // ---------------------------------------------------------------------------
3250
+ // DATA \u2014 this line is replaced by the formatter at report generation time.
3251
+ // When opening the template directly in a browser, MOCK_REPORT_DATA is used.
3252
+ // ---------------------------------------------------------------------------
3253
+ var REPORT_DATA = null; // __REPLACED_BY_FORMATTER__
3254
+ REPORT_DATA = REPORT_DATA || MOCK_REPORT_DATA;
3255
+
3256
+ // ---------------------------------------------------------------------------
3257
+ // Canonical evaluator order
3258
+ // ---------------------------------------------------------------------------
3259
+
3260
+ const EVALUATOR_ORDER = [
3261
+ 'grade-level-appropriateness',
3262
+ 'subject-matter-knowledge',
3263
+ 'vocabulary',
3264
+ 'sentence-structure',
3265
+ 'conventionality',
3266
+ ];
3267
+
3268
+ function evalSortIndex(id) {
3269
+ const i = EVALUATOR_ORDER.indexOf(id);
3270
+ return i === -1 ? 999 : i;
3271
+ }
3272
+
3273
+ // Sort meta.evaluatorIds / evaluatorNames in tandem
3274
+ const _metaPairs = REPORT_DATA.meta.evaluatorIds
3275
+ .map((id, i) => ({ id, name: REPORT_DATA.meta.evaluatorNames[i] }))
3276
+ .sort((a, b) => evalSortIndex(a.id) - evalSortIndex(b.id));
3277
+ REPORT_DATA.meta.evaluatorIds = _metaPairs.map(x => x.id);
3278
+ REPORT_DATA.meta.evaluatorNames = _metaPairs.map(x => x.name);
3279
+
3280
+ // Sort complexityStats
3281
+ REPORT_DATA.complexityStats = [...REPORT_DATA.complexityStats]
3282
+ .sort((a, b) => evalSortIndex(a.evaluatorId) - evalSortIndex(b.evaluatorId));
3283
+
3284
+ // Sort complexityHeatmap evaluators and their value columns
3285
+ const _hmPairs = REPORT_DATA.complexityHeatmap.evaluators
3286
+ .map((name, i) => ({ name, id: REPORT_DATA.complexityHeatmap.evaluatorIds[i], i }))
3287
+ .sort((a, b) => evalSortIndex(a.id) - evalSortIndex(b.id));
3288
+ REPORT_DATA.complexityHeatmap.evaluators = _hmPairs.map(x => x.name);
3289
+ REPORT_DATA.complexityHeatmap.evaluatorIds = _hmPairs.map(x => x.id);
3290
+ REPORT_DATA.complexityHeatmap.values = REPORT_DATA.complexityHeatmap.values
3291
+ .map(row => _hmPairs.map(x => row[x.i]));
3292
+
3293
+ // Sort fullResults.complexityEvaluators
3294
+ REPORT_DATA.fullResults.complexityEvaluators = [...REPORT_DATA.fullResults.complexityEvaluators]
3295
+ .sort((a, b) => evalSortIndex(a.evaluatorId) - evalSortIndex(b.evaluatorId));
3296
+
3297
+ // ---------------------------------------------------------------------------
3298
+ // Utilities
3299
+ // ---------------------------------------------------------------------------
3300
+
3301
+ function esc(str) {
3302
+ return String(str ?? '')
3303
+ .replace(/&/g, '&amp;')
3304
+ .replace(/</g, '&lt;')
3305
+ .replace(/>/g, '&gt;')
3306
+ .replace(/"/g, '&quot;');
3307
+ }
3308
+
3309
+ function statusBadge(status) {
3310
+ const cls = {
3311
+ 'On Band': 'tag-on-band',
3312
+ 'Adjacent': 'tag-within-reach',
3313
+ 'Off Target':'tag-off-target',
3314
+ }[status] || '';
3315
+ const display = status === 'Adjacent' ? 'Within Reach' : status;
3316
+ return \`<span class="tag \${cls}">\${esc(display)}</span>\`;
3317
+ }
3318
+
3319
+ // ---------------------------------------------------------------------------
3320
+ // Tab switching
3321
+ // ---------------------------------------------------------------------------
3322
+
3323
+ function switchTab(tab) {
3324
+ document.querySelectorAll('.tab-btn').forEach(b =>
3325
+ b.classList.toggle('active', b.dataset.tab === tab)
3326
+ );
3327
+ document.getElementById('tab-summary').classList.toggle('active', tab === 'summary');
3328
+ document.getElementById('tab-results').classList.toggle('active', tab === 'results');
3329
+ }
3330
+
3331
+ // ---------------------------------------------------------------------------
3332
+ // Header
3333
+ // ---------------------------------------------------------------------------
3334
+
3335
+ function renderHeader() {
3336
+ const { meta } = REPORT_DATA;
3337
+ document.getElementById('app-header').innerHTML = \`
3338
+ <h1>Evaluation Report</h1>
3339
+ <p class="subtitle">Generated: \${esc(meta.generatedAt)} &bull; Report ID: \${esc(meta.reportId)}</p>
3340
+ \`;
3341
+ document.title = \`Report: \${meta.reportId}\`;
3342
+ }
3343
+
3344
+ // ---------------------------------------------------------------------------
3345
+ // Tab bar
3346
+ // ---------------------------------------------------------------------------
3347
+
3348
+ function renderTabs() {
3349
+ document.getElementById('tab-bar').innerHTML = \`
3350
+ <button class="tab-btn active" data-tab="summary" onclick="switchTab('summary')">Summary</button>
3351
+ <button class="tab-btn" data-tab="results" onclick="switchTab('results')">Full Results</button>
3352
+ \`;
3353
+ }
3354
+
3355
+ // ---------------------------------------------------------------------------
3356
+ // Summary tab
3357
+ // ---------------------------------------------------------------------------
3358
+
3359
+ function renderSummary() {
3360
+ const { meta, gradeLevelStats: gls, complexityStats, insights,
3361
+ gradeBandDistribution, complexityHeatmap } = REPORT_DATA;
3362
+
3363
+ // \u2500\u2500 Snapshot \u2500\u2500
3364
+ const snapshotHtml = \`
3365
+ <div class="card">
3366
+ <div class="card-body">
3367
+ <div class="card-label">Snapshot</div>
3368
+ <div style="display:flex; flex-direction:column; gap:14px;">
3369
+ <div class="flex-col">
3370
+ <span class="card-label-sm">Evaluators</span>
3371
+ <div style="margin-top:6px; display:flex; flex-wrap:wrap; gap:6px;">
3372
+ \${meta.evaluatorNames.map(n => \`<span class="tag tag-informational">\${esc(n)}</span>\`).join('')}
3373
+ </div>
3374
+ </div>
3375
+ <div style="display:grid; grid-template-columns:1fr 1fr 3fr; gap:12px 32px;">
3376
+ <div class="flex-col">
3377
+ <span class="card-label-sm">Rows Processed</span>
3378
+ <span style="font-weight:500; margin-top:4px;">\${meta.processedRows} of \${meta.totalRows}</span>
3379
+ </div>
3380
+ <div class="flex-col">
3381
+ <span class="card-label-sm">Errors / Skipped</span>
3382
+ <span style="font-weight:500; margin-top:4px;">\${meta.erroredRows}</span>
3383
+ </div>
3384
+ <div class="flex-col">
3385
+ <span class="card-label-sm">Source File</span>
3386
+ <span style="font-size:12px; font-family:monospace; word-break:break-all; color:var(--secondary); margin-top:4px;" title="\${esc(meta.csvPath)}">\${esc(meta.csvPath)}</span>
3387
+ </div>
3388
+ </div>
3389
+ </div>
3390
+ </div>
3391
+ </div>
3392
+ \`;
3393
+
3394
+ // \u2500\u2500 GLA stat cards \u2500\u2500
3395
+ const glsHtml = gls.hasData ? \`
3396
+ <div class="grid-3">
3397
+ <div class="card gla-card on-band" style="margin-bottom:0;">
3398
+ <div class="top-stripe"></div>
3399
+ <div class="card-body">
3400
+ <div class="card-label-sm">On Band</div>
3401
+ <div class="big-num">\${gls.onBandPct}%</div>
3402
+ <div class="desc">\${gls.onBand} of \${meta.processedRows} rows where the evaluated grade band matches intended</div>
3403
+ </div>
3404
+ </div>
3405
+ <div class="card gla-card within-reach" style="margin-bottom:0;">
3406
+ <div class="top-stripe"></div>
3407
+ <div class="card-body">
3408
+ <div class="card-label-sm">Within Reach</div>
3409
+ <div class="big-num">\${gls.adjacentPct}%</div>
3410
+ <div class="desc">\${gls.adjacent} rows where the alternative grade band aligns with intended</div>
3411
+ </div>
3412
+ </div>
3413
+ <div class="card gla-card off-target" style="margin-bottom:0;">
3414
+ <div class="top-stripe"></div>
3415
+ <div class="card-body">
3416
+ <div class="card-label-sm">Off Target</div>
3417
+ <div class="big-num">\${gls.offTargetPct}%</div>
3418
+ <div class="desc">\${gls.offTarget} rows where neither evaluated nor alternative matches intended</div>
3419
+ </div>
3420
+ </div>
3421
+ </div>
3422
+ \` : '';
3423
+
3424
+ // \u2500\u2500 Complexity dimension summary card \u2500\u2500
3425
+ const cxHtml = complexityStats.length > 0 ? (() => {
3426
+ const totalRows = meta.processedRows || meta.totalRows;
3427
+ const items = complexityStats.map(cs => {
3428
+ const pct = Math.round((cs.average / 4.0) * 100);
3429
+ const labelText = cs.average > 0
3430
+ ? \`\${esc(cs.label)} (\${cs.average.toFixed(1)} / 4.0)\`
3431
+ : '\u2014';
3432
+ return \`
3433
+ <div style="display:flex; flex-direction:column; gap:8px;">
3434
+ <span style="font-size:13px; font-weight:500; color:var(--primary);">\${esc(cs.name)}</span>
3435
+ <div class="cx-dim-track"><div class="cx-dim-fill" style="width:\${pct}%"></div></div>
3436
+ <span style="font-size:12px; color:var(--secondary);">\${labelText}</span>
3437
+ </div>
3438
+ \`;
3439
+ }).join('');
3440
+ return \`
3441
+ <div class="card">
3442
+ <div class="card-body">
3443
+ <div class="card-label">Text Complexity Dimensions (avg. across \${totalRows} rows)</div>
3444
+ <div style="display:grid; grid-template-columns:repeat(3,1fr); gap:16px 24px;">
3445
+ \${items}
3446
+ </div>
3447
+ </div>
3448
+ </div>
3449
+ \`;
3450
+ })() : '';
3451
+
3452
+ // \u2500\u2500 Insights \u2500\u2500
3453
+ const insightsHtml = \`
3454
+ <div class="card" style="display: none;">
3455
+ <div class="card-body">
3456
+ <div class="card-label" style="display:flex; align-items:center; gap:8px;">
3457
+ Insights
3458
+ <span class="tag tag-informational" style="font-size:10px; letter-spacing:0.06em; padding:2px 8px;">Early Access</span>
3459
+ </div>
3460
+ <div style="display:flex; flex-direction:column; gap:10px;">
3461
+ \${insights.map((text, idx) => \`
3462
+ <div class="insight-row">
3463
+ <span class="number-icon" aria-label="Insight \${idx + 1}">\${idx + 1}</span>
3464
+ <span class="insight-text">\${esc(text)}</span>
3465
+ </div>
3466
+ \`).join('')}
3467
+ </div>
3468
+ <p class="disclaimer">These insights are automatically generated and may not reflect the full context of your data.</p>
3469
+ </div>
3470
+ </div>
3471
+ \`;
3472
+
3473
+ // \u2500\u2500 Grade level distribution (CSS stacked bars) \u2500\u2500
3474
+ const distHtml = (() => {
3475
+ const toPct = (n, total) => total > 0 ? Math.round((n / total) * 100) : 0;
3476
+ const activeBands = gradeBandDistribution.bands.filter((_, i) => gradeBandDistribution.data[i].total > 0);
3477
+ const activeData = gradeBandDistribution.data.filter(d => d.total > 0);
3478
+ if (activeBands.length === 0) return '';
3479
+
3480
+ const rows = activeBands.map((band, i) => {
3481
+ const d = activeData[i];
3482
+ const onPct = toPct(d.onBand, d.total);
3483
+ const adjPct = toPct(d.adjacent, d.total);
3484
+ const offPct = toPct(d.offTarget, d.total);
3485
+ const onSeg = d.onBand > 0 ? \`<div class="bar-seg bar-on" style="width:\${onPct}%" data-tip="On Band: \${d.onBand} (\${onPct}%)"></div>\` : '';
3486
+ const adjSeg = d.adjacent > 0 ? \`<div class="bar-seg bar-wr" style="width:\${adjPct}%" data-tip="Within Reach: \${d.adjacent} (\${adjPct}%)"></div>\` : '';
3487
+ const offSeg = d.offTarget > 0 ? \`<div class="bar-seg bar-off" style="width:\${offPct}%" data-tip="Off Target: \${d.offTarget} (\${offPct}%)"></div>\` : '';
3488
+ return \`
3489
+ <div class="dist-row">
3490
+ <span class="band">\${esc(band)}</span>
3491
+ <div class="bars">\${onSeg}\${adjSeg}\${offSeg}</div>
3492
+ </div>
3493
+ \`;
3494
+ }).join('');
3495
+
3496
+ return \`
3497
+ <div class="card">
3498
+ <div class="card-body">
3499
+ <div class="card-label">Grade Band Alignment by Intended Grade</div>
3500
+ <div class="dist-chart">\${rows}</div>
3501
+ <div class="dist-legend">
3502
+ <span><span class="dot" style="background:var(--on-band)"></span>On Band</span>
3503
+ <span><span class="dot" style="background:var(--within-reach)"></span>Within Reach</span>
3504
+ <span><span class="dot" style="background:var(--off-target)"></span>Off Target</span>
3505
+ </div>
3506
+ </div>
3507
+ </div>
3508
+ \`;
3509
+ })();
3510
+
3511
+ // \u2500\u2500 Heatmap \u2500\u2500
3512
+ const heatmapHtml = complexityStats.length > 0 && complexityHeatmap.evaluators.length > 0 ? \`
3513
+ <div class="card">
3514
+ <div class="card-body">
3515
+ <div class="card-label">Text Complexity Dimensions by Intended Grade</div>
3516
+ <div style="overflow-x:auto;">
3517
+ <table class="heatmap-table">
3518
+ <thead>
3519
+ <tr>
3520
+ <th>Dimension</th>
3521
+ \${complexityHeatmap.bands.map(b => \`<th style="text-align:center; padding:8px 10px; white-space:nowrap; min-width:56px;">\${esc(b)}</th>\`).join('')}
3522
+ </tr>
3523
+ </thead>
3524
+ <tbody>
3525
+ \${complexityHeatmap.evaluators.map((evaluator, ei) => \`
3526
+ <tr>
3527
+ <td>\${esc(evaluator)}</td>
3528
+ \${complexityHeatmap.bands.map((band, bi) => {
3529
+ const val = complexityHeatmap.values[bi][ei];
3530
+ const label = val !== null ? val.toFixed(1) : '\u2014';
3531
+ return \`<td class="cell-num" style="padding:8px 10px;"><span class="heatmap-cell" style="padding:4px 8px;">\${label}</span></td>\`;
3532
+ }).join('')}
3533
+ </tr>
3534
+ \`).join('')}
3535
+ </tbody>
3536
+ </table>
3537
+ </div>
3538
+ </div>
3539
+ </div>
3540
+ \` : '';
3541
+
3542
+ document.getElementById('tab-summary').innerHTML = \`
3543
+ <div class="tab-content">
3544
+ \${snapshotHtml}\${glsHtml}\${cxHtml}\${insightsHtml}\${distHtml}\${heatmapHtml}
3545
+ </div>
3546
+ \`;
3547
+ }
3548
+
3549
+ // ---------------------------------------------------------------------------
3550
+ // Full Results tab
3551
+ // ---------------------------------------------------------------------------
3552
+
3553
+ function renderResults() {
3554
+ const { fullResults } = REPORT_DATA;
3555
+ const { originalColumns, hasGLA, complexityEvaluators, rows } = fullResults;
3556
+
3557
+ const KEPT_COLS = ['row_id', 'text', 'grade'];
3558
+ const visibleColumns = originalColumns.filter(col => KEPT_COLS.includes(col.toLowerCase()));
3559
+ const frozenCount = 1; // only Row # is sticky
3560
+
3561
+ function colLabel(col) {
3562
+ const c = col.toLowerCase();
3563
+ if (c === 'row_id') return 'Row #';
3564
+ if (c === 'text') return 'Text';
3565
+ if (c === 'grade') return 'Grade';
3566
+ return col;
3567
+ }
3568
+ function colWidth(col) {
3569
+ const c = col.toLowerCase();
3570
+ if (c === 'row_id') return 60;
3571
+ if (c === 'text') return 340;
3572
+ if (c === 'grade') return 70;
3573
+ return 160;
3574
+ }
3575
+
3576
+ const thOriginal = visibleColumns.map((col, i) => {
3577
+ const isFrozen = i === 0;
3578
+ return \`<th class="\${isFrozen ? 'frozen frozen-last' : ''}" \${isFrozen ? 'data-frozen="0" style="min-width:' + colWidth(col) + 'px; left:0; z-index:4;"' : 'style="min-width:' + colWidth(col) + 'px;"'}>\${colLabel(col)}</th>\`;
3579
+ }).join('');
3580
+
3581
+ const thGLA = hasGLA ? \`
3582
+ <th class="group-start" style="min-width:120px">Grade Level Status</th>
3583
+ <th style="min-width:110px">GLA Grade Band</th>
3584
+ <th style="min-width:260px">GLA Reasoning</th>
3585
+ \` : '';
3586
+
3587
+ const thCX = complexityEvaluators.map(e => \`
3588
+ <th class="group-start" style="min-width:150px">\${esc(e.name)} Score</th>
3589
+ <th style="min-width:260px">\${esc(e.name)} Reasoning</th>
3590
+ \`).join('');
3591
+
3592
+ const bodyRows = rows.map(row => {
3593
+ const tdOriginal = visibleColumns.map((col, i) => {
3594
+ const isFrozen = i === 0;
3595
+ const isTextCol = col.toLowerCase() === 'text';
3596
+ const content = isTextCol
3597
+ ? \`<div class="cell-text-inner">\${esc(row[col])}</div>\`
3598
+ : esc(row[col]);
3599
+ const classes = [
3600
+ isFrozen ? 'frozen frozen-last' : '',
3601
+ isTextCol ? 'cell-text' : '',
3602
+ ].filter(Boolean).join(' ');
3603
+ const style = \`min-width:\${colWidth(col)}px;\${isFrozen ? ' left:0; z-index:2;' : ''}\`;
3604
+ return \`<td\${classes ? \` class="\${classes}"\` : ''} \${isFrozen ? 'data-frozen="0"' : ''} style="\${style}"\${isTextCol ? \` title="\${esc(row[col])}"\` : ''}>\${content}</td>\`;
3605
+ }).join('');
3606
+
3607
+ const glaStatus = row['__gla_status'] || '';
3608
+ const tdGLA = hasGLA ? \`
3609
+ <td class="group-start">\${statusBadge(glaStatus)}</td>
3610
+ <td>\${esc(row['__gla_band'])}</td>
3611
+ <td class="cell-reasoning" title="\${esc(row['__gla_reasoning'])}"><div class="cell-reasoning-inner">\${esc(row['__gla_reasoning'])}</div></td>
3612
+ \` : '';
3613
+
3614
+ const tdCX = complexityEvaluators.map(e => {
3615
+ const prefix = \`__\${e.prefix}\`;
3616
+ return \`
3617
+ <td class="group-start">\${esc(row[prefix + '_score'])}</td>
3618
+ <td class="cell-reasoning" title="\${esc(row[prefix + '_reasoning'])}"><div class="cell-reasoning-inner">\${esc(row[prefix + '_reasoning'])}</div></td>
3619
+ \`;
3620
+ }).join('');
3621
+
3622
+ return \`<tr>\${tdOriginal}\${tdGLA}\${tdCX}</tr>\`;
3623
+ }).join('');
3624
+
3625
+ document.getElementById('tab-results').innerHTML = \`
3626
+ <div class="tab-content" style="max-width:100%;">
3627
+ <div class="results-scroll">
3628
+ <table class="data-table" id="results-table">
3629
+ <thead><tr>\${thOriginal}\${thGLA}\${thCX}</tr></thead>
3630
+ <tbody>\${bodyRows}</tbody>
3631
+ </table>
3632
+ </div>
3633
+ </div>
3634
+ \`;
3635
+
3636
+ requestAnimationFrame(() => applyFrozenOffsets('results-table', frozenCount));
3637
+ }
3638
+
3639
+ /**
3640
+ * Reads the rendered widths of frozen header cells and applies correct
3641
+ * \`left\` offsets to all frozen <th> and <td> cells in the table.
3642
+ */
3643
+ function applyFrozenOffsets(tableId, frozenCount) {
3644
+ const table = document.getElementById(tableId);
3645
+ if (!table) return;
3646
+
3647
+ const headerCells = table.querySelectorAll('thead th.frozen');
3648
+ const offsets = [];
3649
+ let cumLeft = 0;
3650
+ headerCells.forEach(th => {
3651
+ offsets.push(cumLeft);
3652
+ cumLeft += th.offsetWidth;
3653
+ });
3654
+
3655
+ table.querySelectorAll('tr').forEach(row => {
3656
+ const cells = row.querySelectorAll('td.frozen, th.frozen');
3657
+ cells.forEach((cell, i) => {
3658
+ if (i < frozenCount) cell.style.left = (offsets[i] ?? 0) + 'px';
3659
+ });
3660
+ });
3661
+ }
3662
+
3663
+ // ---------------------------------------------------------------------------
3664
+ // Bootstrap
3665
+ // ---------------------------------------------------------------------------
3666
+
3667
+ renderHeader();
3668
+ renderTabs();
3669
+ renderSummary();
3670
+ renderResults();
3671
+ </script>
3672
+ </body>
3673
+ </html>
3674
+ `;
3675
+
3676
+ // src/batch/formatters.ts
3677
/** Evaluator id that identifies grade-level-appropriateness (GLA) results. */
var GLA_EVALUATOR_ID = "grade-level-appropriateness";

/**
 * Grade bands in ascending order. A band's position in this array is the
 * "band index" used by the grade/band lookup helpers in this module.
 */
var GRADE_BANDS = [
  "K-1",
  "2-3",
  "4-5",
  "6-8",
  "9-10",
  "11-CCR"
];

/**
 * Numeric value for each lower-cased complexity label.
 *
 * 'more context needed' has no numeric equivalent — rows with this score appear as N/A
 * in individual results and are excluded from aggregate stats, same as failed evaluations.
 */
var COMPLEXITY_SCORE_MAP = {
  "slightly complex": 1,
  "moderately complex": 2,
  "very complex": 3,
  "exceedingly complex": 4
};
3687
/**
 * Convert a hyphen-separated evaluator id into a display name.
 *
 * Each hyphen-delimited segment gets its first character upper-cased and the
 * segments are joined with single spaces.
 *
 * @param id Evaluator id, e.g. "sentence-structure".
 * @returns Display name, e.g. "Sentence Structure".
 */
function evaluatorDisplayName(id) {
  const words = [];
  for (const segment of id.split("-")) {
    words.push(segment.charAt(0).toUpperCase() + segment.slice(1));
  }
  return words.join(" ");
}
3690
/**
 * Map a grade value to the index of its grade band.
 *
 * The input is stringified, trimmed, upper-cased and stripped of leading
 * zeros before matching, so "03", " 3 " and 3 are all treated as grade 3.
 *
 * @param grade Grade value (e.g. "K", "kindergarten", 7, "09", "CCR").
 * @returns Band index 0-5 (K-1, 2-3, 4-5, 6-8, 9-10, 11-CCR), or -1 when the
 *          grade is not recognised.
 */
function gradeToBandIndex(grade) {
  const normalized = String(grade).trim().toUpperCase().replace(/^0+/, "");
  switch (normalized) {
    case "K":
    case "KINDERGARTEN":
    case "1":
      return 0;
    case "2":
    case "3":
      return 1;
    case "4":
    case "5":
      return 2;
    case "6":
    case "7":
    case "8":
      return 3;
    case "9":
    case "10":
      return 4;
    case "11":
    case "12":
    case "CCR":
      return 5;
    default:
      return -1;
  }
}
3701
/**
 * Look up the position of a grade-band label within GRADE_BANDS.
 *
 * @param band Grade-band label, e.g. "6-8".
 * @returns Index into GRADE_BANDS, or -1 when the label is not one of the bands.
 */
function glaBandToIndex(band) {
  return GRADE_BANDS.findIndex((candidate) => candidate === band);
}
3704
/**
 * Classify how far the evaluated grade band is from the intended grade.
 *
 * @param inputGrade Intended grade of the text (anything gradeToBandIndex accepts).
 * @param glaBand    Grade-band label produced by the evaluator.
 * @returns "on-band" when both map to the same band, "adjacent" when the bands
 *          are one step apart, and "off-target" otherwise — including when
 *          either value cannot be mapped to a band.
 */
function getGLAStatus(inputGrade, glaBand) {
  const intendedBand = gradeToBandIndex(inputGrade);
  const evaluatedBand = glaBandToIndex(glaBand);
  const bothKnown = intendedBand !== -1 && evaluatedBand !== -1;
  if (bothKnown) {
    const distance = Math.abs(intendedBand - evaluatedBand);
    if (distance === 0) return "on-band";
    if (distance === 1) return "adjacent";
  }
  return "off-target";
}
3713
/**
 * Convert a complexity label to its numeric value.
 *
 * The label is lower-cased and trimmed before lookup in COMPLEXITY_SCORE_MAP.
 *
 * @param score Complexity label, e.g. "Very complex".
 * @returns The mapped number, or undefined when the label has no numeric
 *          equivalent or is not a string.
 */
function complexityToNumeric(score) {
  if (typeof score !== "string") return void 0;
  const key = score.toLowerCase().trim();
  // Own-property check: a plain-object lookup would otherwise resolve inherited
  // keys such as "constructor" or "__proto__" to non-numeric values, which
  // callers would then treat as a valid score.
  return Object.prototype.hasOwnProperty.call(COMPLEXITY_SCORE_MAP, key) ? COMPLEXITY_SCORE_MAP[key] : void 0;
}
3716
/**
 * Translate an average complexity score into its label.
 *
 * Cut-offs: below 1.5 is "Slightly Complex", below 2.5 is "Moderately Complex",
 * below 3.5 is "Very Complex", and everything else is "Exceedingly Complex".
 *
 * @param avg Average numeric complexity score.
 * @returns The label for the tier that `avg` falls into.
 */
function complexityScoreLabel(avg) {
  return avg < 1.5
    ? "Slightly Complex"
    : avg < 2.5
      ? "Moderately Complex"
      : avg < 3.5
        ? "Very Complex"
        : "Exceedingly Complex";
}
3722
/**
 * Produce the list of insight sentences shown in the report.
 *
 * The text is fixed: the same three sentences are returned on every call,
 * in a freshly allocated array.
 *
 * @returns Array of three insight strings.
 */
function generateInsights() {
  const offTargetAdvice = "Review texts marked as Off Target \u2014 they may need content revision or grade-level adjustment before distribution.";
  const adjacentAdvice = "Texts evaluated as Adjacent may benefit from light scaffolding strategies such as vocabulary pre-teaching.";
  const complexityAdvice = "Higher grade bands tend to show greater text complexity. Consider whether complexity aligns with instructional goals.";
  return [offTargetAdvice, adjacentAdvice, complexityAdvice];
}
3729
/**
 * Bucket evaluation results by the row they belong to.
 *
 * @param results Iterable of result objects, each carrying a `rowIndex`.
 * @returns Map from rowIndex to the results for that row. Keys appear in
 *          first-seen order and each bucket keeps the input order.
 */
function groupResultsByRow(results) {
  const byRow = /* @__PURE__ */ new Map();
  for (const entry of results) {
    const bucket = byRow.get(entry.rowIndex);
    if (bucket) {
      bucket.push(entry);
    } else {
      byRow.set(entry.rowIndex, [entry]);
    }
  }
  return byRow;
}
3739
/**
 * Derive the column-name prefix for an evaluator.
 *
 * Keeps only the part of the id after the last "." (the whole id when there is
 * no dot) and replaces every hyphen with an underscore.
 *
 * @param evaluatorId Evaluator id, optionally dot-namespaced.
 * @returns Prefix such as "sentence_structure".
 */
function formatEvaluatorPrefix(evaluatorId) {
  const lastDot = evaluatorId.lastIndexOf(".");
  // lastDot is -1 when there is no dot, so slice(0) keeps the full id.
  const slug = evaluatorId.slice(lastDot + 1);
  return slug.split("-").join("_");
}
3743
/**
 * Quote a single CSV field when it needs quoting.
 *
 * A field is wrapped in double quotes (with embedded quotes doubled) when it
 * contains a comma, a double quote, a line feed or a carriage return. A bare
 * carriage return must be quoted too, because CSV readers that treat CR as a
 * line break would otherwise split the record.
 *
 * @param field Field text.
 * @returns The field, quoted and escaped if necessary, otherwise unchanged.
 */
function escapeCSV(field) {
  if (/[",\r\n]/.test(field)) {
    return `"${field.replace(/"/g, '""')}"`;
  }
  return field;
}
3749
/**
 * Render batch results as CSV text, one line per input row.
 *
 * Columns are the keys of the first result's `originalRow`, followed by
 * `<prefix>_score`, `<prefix>_reasoning` and `<prefix>_status` for every
 * evaluator id present in the results (ids sorted alphabetically). Rows are
 * emitted in ascending `rowIndex` order.
 *
 * For a failed evaluation the score cell is empty and the reasoning cell holds
 * the error message. An evaluator with no result for a row yields two empty
 * cells and the status "not_run".
 *
 * @param output Batch output; `output.results` is the flat list of results.
 * @returns CSV text without a trailing newline, or "" when there are no results.
 */
function formatAsCSV(output) {
  if (output.results.length === 0) {
    return "";
  }
  const groupedByRow = groupResultsByRow(output.results);
  const evaluatorIds = Array.from(new Set(output.results.map((r) => r.evaluatorId))).sort();
  const originalColumns = Object.keys(output.results[0].originalRow);
  const evaluatorColumns = [];
  for (const evalId of evaluatorIds) {
    const prefix = formatEvaluatorPrefix(evalId);
    evaluatorColumns.push(`${prefix}_score`, `${prefix}_reasoning`, `${prefix}_status`);
  }
  // Header names come from the user's input file and may themselves contain
  // commas, quotes or newlines, so they are escaped like any other cell.
  const headers = [...originalColumns, ...evaluatorColumns].map((name) => escapeCSV(name));
  const rows = [];
  const sortedRowIndices = Array.from(groupedByRow.keys()).sort((a, b) => a - b);
  for (const rowIndex of sortedRowIndices) {
    const resultsForRow = groupedByRow.get(rowIndex);
    const firstResultForRow = resultsForRow[0];
    // `??` rather than `||`: only null/undefined become empty, so legitimate
    // falsy values such as 0 or false are written out.
    const originalValues = originalColumns.map(
      (col) => escapeCSV(String(firstResultForRow.originalRow[col] ?? ""))
    );
    const evaluatorValues = [];
    for (const evalId of evaluatorIds) {
      const result = resultsForRow.find((r) => r.evaluatorId === evalId);
      if (result) {
        const succeeded = result.status === "success";
        evaluatorValues.push(succeeded ? escapeCSV(result.score || "") : "");
        evaluatorValues.push(succeeded ? escapeCSV(result.reasoning || "") : escapeCSV(result.error || ""));
        evaluatorValues.push(result.status);
      } else {
        evaluatorValues.push("", "", "not_run");
      }
    }
    rows.push([...originalValues, ...evaluatorValues]);
  }
  return [headers, ...rows].map((row) => row.join(",")).join("\n");
}
3788
/**
 * Build the self-contained HTML report for a batch run.
 *
 * Aggregates the flat result list into the report data object (run metadata,
 * grade-level stats, per-evaluator complexity stats, per-band distribution,
 * complexity heatmap and the full per-row table), serialises it to JSON and
 * injects it into `report_template_default` in place of the
 * `var REPORT_DATA = null; // __REPLACED_BY_FORMATTER__` marker line.
 *
 * @param output Batch output; `output.results` is the flat list of results.
 * @param meta   Run metadata; reads `reportId`, `generatedAt` (formatted via
 *               `toLocaleString`), `csvPath`, `groupId` and `totalInputRows`.
 * @returns The complete HTML document as a string.
 * @throws Error when the template does not contain the injection marker.
 */
function formatAsHTML(output, meta) {
  const { results } = output;
  const byRow = groupResultsByRow(results);
  const allRowIndices = Array.from(byRow.keys()).sort((a, b) => a - b);
  const allEvaluatorIds = Array.from(new Set(results.map((r) => r.evaluatorId))).sort();
  const hasGLA = allEvaluatorIds.includes(GLA_EVALUATOR_ID);
  // Every evaluator other than GLA is treated as a complexity dimension.
  const complexityIds = allEvaluatorIds.filter((id) => id !== GLA_EVALUATOR_ID);

  // A row counts as errored as soon as any one of its evaluators failed.
  let processedRows = 0;
  let erroredRows = 0;
  for (const rowResults of byRow.values()) {
    if (rowResults.some((r) => r.status === "error")) erroredRows++;
    else processedRows++;
  }

  // Grade-level status per row, plus overall counts.
  const glaCounts = { onBand: 0, adjacent: 0, offTarget: 0 };
  const rowGLAStatus = /* @__PURE__ */ new Map();
  if (hasGLA) {
    for (const [rowIndex, rowResults] of byRow) {
      const glaResult = rowResults.find((r) => r.evaluatorId === GLA_EVALUATOR_ID);
      if (glaResult && glaResult.status === "success" && glaResult.score) {
        const status = getGLAStatus(glaResult.grade, glaResult.score);
        rowGLAStatus.set(rowIndex, { status, band: glaResult.score, reasoning: glaResult.reasoning || "" });
        if (status === "on-band") glaCounts.onBand++;
        else if (status === "adjacent") glaCounts.adjacent++;
        else glaCounts.offTarget++;
      }
    }
  }
  const glaTotal = glaCounts.onBand + glaCounts.adjacent + glaCounts.offTarget;
  const pct = (n) => glaTotal > 0 ? Math.round(n / glaTotal * 100) : 0;

  // Per-evaluator average and 1-4 distribution; scores without a numeric
  // equivalent are left out of both.
  const complexityStats = complexityIds.map((evalId) => {
    const scores = [];
    const distribution = [0, 0, 0, 0];
    for (const rowResults of byRow.values()) {
      const r = rowResults.find((x) => x.evaluatorId === evalId);
      if (r && r.status === "success" && r.score) {
        const num = complexityToNumeric(r.score);
        if (num !== void 0) {
          scores.push(num);
          distribution[num - 1]++;
        }
      }
    }
    const avg = scores.length > 0 ? scores.reduce((a, b) => a + b, 0) / scores.length : 0;
    return {
      evaluatorId: evalId,
      name: evaluatorDisplayName(evalId),
      average: Math.round(avg * 10) / 10,
      label: avg > 0 ? complexityScoreLabel(avg) : "N/A",
      distribution
    };
  });

  // GLA status counts bucketed by the band of the row's intended grade.
  const bandDist = GRADE_BANDS.map(() => ({ onBand: 0, adjacent: 0, offTarget: 0, total: 0 }));
  for (const [rowIndex, rowResults] of byRow) {
    const firstResult = rowResults[0];
    if (!firstResult) continue;
    const bandIdx = gradeToBandIndex(firstResult.grade);
    if (bandIdx === -1) continue;
    const glaStatus = rowGLAStatus.get(rowIndex);
    if (glaStatus) {
      bandDist[bandIdx].total++;
      if (glaStatus.status === "on-band") bandDist[bandIdx].onBand++;
      else if (glaStatus.status === "adjacent") bandDist[bandIdx].adjacent++;
      else bandDist[bandIdx].offTarget++;
    }
  }

  // Heatmap: mean numeric complexity per (intended band, evaluator) cell.
  const hmSums = GRADE_BANDS.map(() => complexityIds.map(() => 0));
  const hmCounts = GRADE_BANDS.map(() => complexityIds.map(() => 0));
  for (const rowResults of byRow.values()) {
    const firstResult = rowResults[0];
    if (!firstResult) continue;
    const bandIdx = gradeToBandIndex(firstResult.grade);
    if (bandIdx === -1) continue;
    complexityIds.forEach((evalId, evalIdx) => {
      const r = rowResults.find((x) => x.evaluatorId === evalId);
      if (r && r.status === "success" && r.score) {
        const num = complexityToNumeric(r.score);
        if (num !== void 0) {
          hmSums[bandIdx][evalIdx] += num;
          hmCounts[bandIdx][evalIdx]++;
        }
      }
    });
  }
  const heatmapValues = GRADE_BANDS.map(
    (_, bi) => complexityIds.map((_2, ei) => {
      const count = hmCounts[bi][ei];
      // null marks a cell with no scored rows.
      return count > 0 ? Math.round(hmSums[bi][ei] / count * 10) / 10 : null;
    })
  );

  // Full-results table: original columns plus "__"-prefixed evaluator columns.
  const firstRowResults = allRowIndices.length > 0 ? byRow.get(allRowIndices[0]) ?? [] : [];
  const originalColumns = firstRowResults.length > 0 ? Object.keys(firstRowResults[0].originalRow) : [];
  const glaLabels = { "on-band": "On Band", "adjacent": "Adjacent", "off-target": "Off Target" };
  const fullResultsRows = allRowIndices.map((rowIndex) => {
    const rowResults = byRow.get(rowIndex);
    const firstResult = rowResults[0];
    const row = {};
    for (const col of originalColumns) {
      row[col] = String(firstResult.originalRow[col] ?? "");
    }
    const glaStatus = rowGLAStatus.get(rowIndex);
    row["__gla_status"] = glaStatus ? glaLabels[glaStatus.status] : hasGLA ? "Error" : "";
    row["__gla_band"] = glaStatus?.band ?? "";
    row["__gla_reasoning"] = glaStatus?.reasoning ?? "";
    for (const evalId of complexityIds) {
      const r = rowResults.find((x) => x.evaluatorId === evalId);
      const prefix = `__${evalId.replace(/-/g, "_")}`;
      row[`${prefix}_score`] = r?.status === "success" ? r.score ?? "" : r?.status === "error" ? "Error" : "";
      row[`${prefix}_reasoning`] = r?.status === "success" ? r.reasoning ?? "" : r?.error ?? "";
    }
    return row;
  });

  const reportData = {
    meta: {
      reportId: meta.reportId,
      generatedAt: meta.generatedAt.toLocaleString("en-US", {
        month: "short",
        day: "numeric",
        year: "numeric",
        hour: "numeric",
        minute: "2-digit",
        hour12: true
      }),
      csvPath: meta.csvPath,
      groupId: meta.groupId,
      evaluatorIds: allEvaluatorIds,
      evaluatorNames: allEvaluatorIds.map(evaluatorDisplayName),
      totalRows: meta.totalInputRows,
      processedRows,
      erroredRows
    },
    gradeLevelStats: {
      onBand: glaCounts.onBand,
      adjacent: glaCounts.adjacent,
      offTarget: glaCounts.offTarget,
      onBandPct: pct(glaCounts.onBand),
      adjacentPct: pct(glaCounts.adjacent),
      offTargetPct: pct(glaCounts.offTarget),
      hasData: glaTotal > 0
    },
    complexityStats,
    gradeBandDistribution: {
      bands: [...GRADE_BANDS],
      data: bandDist
    },
    complexityHeatmap: {
      bands: [...GRADE_BANDS],
      evaluators: complexityIds.map(evaluatorDisplayName),
      evaluatorIds: complexityIds,
      values: heatmapValues
    },
    insights: generateInsights(),
    fullResults: {
      originalColumns,
      hasGLA,
      complexityEvaluators: complexityIds.map((id) => ({
        evaluatorId: id,
        name: evaluatorDisplayName(id),
        prefix: id.replace(/-/g, "_")
      })),
      rows: fullResultsRows
    }
  };

  // Make the JSON safe to embed inside an inline <script>: neutralise HTML
  // metacharacters and the two line separators JSON.stringify leaves raw.
  const safeJson = JSON.stringify(reportData)
    .replace(/</g, "\\u003c")
    .replace(/>/g, "\\u003e")
    .replace(/&/g, "\\u0026")
    .replace(/\u2028/g, "\\u2028")
    .replace(/\u2029/g, "\\u2029");
  const INJECTION_MARKER = "var REPORT_DATA = null; // __REPLACED_BY_FORMATTER__";
  if (!report_template_default.includes(INJECTION_MARKER)) {
    throw new Error("Report template injection marker not found \u2014 template may be corrupted");
  }
  // Use a replacer function: with a string replacement, "$$", "$&", "$`" and
  // "$'" sequences occurring in the user's data would be interpreted as
  // substitution patterns and corrupt the injected JSON.
  return report_template_default.replace(INJECTION_MARKER, () => `var REPORT_DATA = ${safeJson};`);
}
3957
+
3958
+ // src/batch/progress.ts
3959
+ var ProgressTracker = class {
3960
+ totalTasks;
3961
+ completed = 0;
3962
+ successful = 0;
3963
+ failed = 0;
3964
+ startTime;
3965
+ perEvaluator = /* @__PURE__ */ new Map();
3966
+ constructor(totalTasks) {
3967
+ this.totalTasks = totalTasks;
3968
+ this.startTime = Date.now();
3969
+ }
3970
+ /**
3971
+ * Update progress with a new result
3972
+ */
3973
+ update(result) {
3974
+ this.completed++;
3975
+ if (result.status === "success") {
3976
+ this.successful++;
3977
+ } else {
3978
+ this.failed++;
3979
+ }
3980
+ if (!this.perEvaluator.has(result.evaluatorId)) {
3981
+ this.perEvaluator.set(result.evaluatorId, { completed: 0, successful: 0, failed: 0 });
3982
+ }
3983
+ const stats = this.perEvaluator.get(result.evaluatorId);
3984
+ stats.completed++;
3985
+ if (result.status === "success") {
3986
+ stats.successful++;
3987
+ } else {
3988
+ stats.failed++;
3989
+ }
3990
+ }
3991
+ /**
3992
+ * Get current progress percentage
3993
+ */
3994
+ getPercentage() {
3995
+ return Math.round(this.completed / this.totalTasks * 100);
3996
+ }
3997
+ /**
3998
+ * Get elapsed time in seconds
3999
+ */
4000
+ getElapsedSeconds() {
4001
+ return Math.round((Date.now() - this.startTime) / 1e3);
4002
+ }
4003
+ /**
4004
+ * Estimate remaining time in seconds
4005
+ */
4006
+ getEstimatedRemainingSeconds() {
4007
+ if (this.completed === 0) return 0;
4008
+ const elapsed = Date.now() - this.startTime;
4009
+ const avgTimePerTask = elapsed / this.completed;
4010
+ const remaining = this.totalTasks - this.completed;
4011
+ return Math.round(avgTimePerTask * remaining / 1e3);
4012
+ }
4013
+ /**
4014
+ * Format elapsed time as human-readable string
4015
+ */
4016
+ formatElapsed() {
4017
+ const seconds = this.getElapsedSeconds();
4018
+ if (seconds < 60) return `${seconds}s`;
4019
+ const minutes = Math.floor(seconds / 60);
4020
+ const remainingSeconds = seconds % 60;
4021
+ return `${minutes}m ${remainingSeconds}s`;
4022
+ }
4023
+ /**
4024
+ * Format estimated remaining time as human-readable string
4025
+ */
4026
+ formatEstimatedRemaining() {
4027
+ const seconds = this.getEstimatedRemainingSeconds();
4028
+ if (seconds < 60) return `${seconds}s`;
4029
+ const minutes = Math.floor(seconds / 60);
4030
+ const remainingSeconds = seconds % 60;
4031
+ return `${minutes}m ${remainingSeconds}s`;
4032
+ }
4033
+ /**
4034
+ * Generate progress bar
4035
+ */
4036
+ getProgressBar(width = 20) {
4037
+ const percentage = this.getPercentage();
4038
+ const filled = Math.round(percentage / 100 * width);
4039
+ const empty = width - filled;
4040
+ return "\u2588".repeat(filled) + "\u2591".repeat(empty);
4041
+ }
4042
+ /**
4043
+ * Display progress in terminal
4044
+ */
4045
+ display() {
4046
+ if (this.completed > 1) {
4047
+ const linesToClear = 3 + this.perEvaluator.size;
4048
+ process.stdout.write(`\x1B[${linesToClear}A`);
4049
+ process.stdout.write("\x1B[J");
4050
+ }
4051
+ console.log(
4052
+ `${this.getProgressBar()} ${this.getPercentage()}% (${this.completed}/${this.totalTasks})`
4053
+ );
4054
+ for (const [evalId, stats] of this.perEvaluator.entries()) {
4055
+ const status = stats.completed === stats.successful ? "\u2713" : stats.failed > 0 ? "\u2717" : "\u23F3";
4056
+ console.log(
4057
+ ` ${status} ${evalId}: ${stats.successful}/${stats.completed} successful`
4058
+ );
4059
+ }
4060
+ console.log(
4061
+ `
4062
+ \u23F1 Elapsed: ${this.formatElapsed()} | Estimated remaining: ${this.formatEstimatedRemaining()}`
4063
+ );
4064
+ }
4065
+ /**
4066
+ * Display final summary
4067
+ */
4068
+ displaySummary() {
4069
+ const linesToClear = 3 + this.perEvaluator.size + 1;
4070
+ process.stdout.write(`\x1B[${linesToClear}A`);
4071
+ process.stdout.write("\x1B[J");
4072
+ console.log("\n\u2705 Batch evaluation completed!\n");
4073
+ console.log(`Total tasks: ${this.totalTasks}`);
4074
+ console.log(`Successful: ${this.successful} \u2713`);
4075
+ console.log(`Failed: ${this.failed} \u2717`);
4076
+ console.log(`Duration: ${this.formatElapsed()}`);
4077
+ if (this.perEvaluator.size > 1) {
4078
+ console.log("\nResults per evaluator:");
4079
+ for (const [evalId, stats] of this.perEvaluator.entries()) {
4080
+ console.log(
4081
+ ` ${evalId}: ${stats.successful} successful, ${stats.failed} failed`
4082
+ );
4083
+ }
4084
+ }
4085
+ console.log();
4086
+ }
4087
+ };
4088
+
4089
+ // src/batch/cli.ts
4090
/**
 * Parse the command-line flags understood by the batch CLI from process.argv.
 *
 * Recognised flags:
 *   --concurrency <n>   integer > 0, stored as `concurrency`
 *   --max-retries <n>   integer >= 0, stored as `maxRetries`
 *   --no-telemetry      stored as `noTelemetry: true`
 *
 * Unknown arguments and out-of-range or non-numeric values are ignored, so the
 * caller's defaults apply.
 *
 * @returns {{concurrency?: number, maxRetries?: number, noTelemetry?: boolean}}
 *   Only the options that were supplied with a valid value.
 */
function parseArgs() {
  const args = process.argv.slice(2);
  const result = {};
  for (let i = 0; i < args.length; i++) {
    const arg = args[i];
    if (arg === "--no-telemetry") {
      result.noTelemetry = true;
      continue;
    }
    if (arg !== "--concurrency" && arg !== "--max-retries") {
      continue;
    }
    const next = args[i + 1];
    // A numeric flag with no value must not swallow the flag that follows it:
    // "--concurrency --no-telemetry" previously consumed "--no-telemetry" as
    // the (invalid) value and silently left telemetry enabled.
    if (!next || next.startsWith("--")) {
      continue;
    }
    i++;
    const v = parseInt(next, 10);
    if (isNaN(v)) {
      continue;
    }
    if (arg === "--concurrency") {
      if (v > 0) result.concurrency = v;
    } else if (v >= 0) {
      result.maxRetries = v;
    }
  }
  return result;
}
4106
/**
 * Interactive entry point of the batch CSV evaluator CLI.
 *
 * Flow:
 *   1. Prompt for a CSV path; the file is parsed with parseCSV() during validation.
 *   2. Take the first group from getAvailableGroups() and enforce its row limit.
 *   3. Prompt for the API keys the group requires (pre-filled from
 *      GOOGLE_API_KEY / OPENAI_API_KEY when set).
 *   4. Prompt for an output directory and create it.
 *   5. After confirmation, run BatchEvaluator.evaluate() with live progress.
 *   6. Write results.csv / results.html and try to open the HTML report.
 *
 * While the evaluation runs, SIGINT/SIGTERM save whatever evaluator.cancel()
 * returns as results-partial.csv / results-partial.html and exit; a second
 * signal exits immediately.
 *
 * Exits with code 0 on user cancellation or interrupt, and 1 on errors or when
 * the CSV exceeds the group's row limit.
 */
async function main() {
  const cliArgs = parseArgs();
  // Defaults used when the corresponding flag was not supplied.
  const concurrency = cliArgs.concurrency ?? 3;
  const maxRetries = cliArgs.maxRetries ?? 2;

  // Ask for one provider's API key; exits the process when no key is returned.
  const promptForApiKey = async (provider, envValue) => {
    const { key } = await prompts({
      type: "password",
      name: "key",
      message: `${provider} API Key:`,
      initial: envValue || "",
      validate: (value) => value ? true : `${provider} API key is required`
    });
    if (!key) {
      console.log("Cancelled.");
      process.exit(0);
    }
    return key;
  };

  // Build the shell command that opens a file with the platform's default handler.
  const buildOpenCommand = (filePath) => {
    if (process.platform === "win32") {
      return `start "" "${filePath}"`;
    }
    // Single-quote the path so the shell does not expand $, backticks or
    // double quotes that may appear in a user-chosen directory name.
    const quotedPath = `'${filePath.replace(/'/g, "'\\''")}'`;
    // `open` is the macOS opener; other POSIX desktops provide `xdg-open`.
    const opener = process.platform === "darwin" ? "open" : "xdg-open";
    return `${opener} ${quotedPath}`;
  };

  console.log("\n\u{1F4CA} Batch CSV Evaluator\n");
  console.log("This tool will evaluate multiple texts using one or more evaluators.\n");
  try {
    // Filled in by the CSV prompt's validator so the file is parsed only once per attempt.
    let inputs = [];
    const { csvPath } = await prompts({
      type: "text",
      name: "csvPath",
      message: "Where is your CSV file?",
      initial: "./input.csv",
      validate: (value) => {
        try {
          inputs = parseCSV(value);
          return true;
        } catch (error) {
          return error instanceof Error ? error.message : "Invalid CSV file";
        }
      }
    });
    if (!csvPath) {
      console.log("No file path provided. Run the command again to start over.");
      process.exit(0);
    }
    console.log(`\n\u2713 Found ${inputs.length} rows in CSV\n`);

    const group = getAvailableGroups()[0];
    console.log(`\u2713 Evaluator group: ${group.name}`);
    console.log(` ${group.description}`);
    console.log(` Row limit: ${group.maxInputRows}\n`);
    if (inputs.length > group.maxInputRows) {
      console.error(`\u274C Too many rows: ${inputs.length} (max ${group.maxInputRows} for this group)\n`);
      console.log("Suggestions:");
      console.log(` \u2022 Trim the CSV to ${group.maxInputRows} rows`);
      console.log(" \u2022 Split into multiple smaller batches\n");
      process.exit(1);
    }

    let googleApiKey;
    let openaiApiKey;
    if (group.requiresGoogleKey) {
      googleApiKey = await promptForApiKey("Google", process.env.GOOGLE_API_KEY);
    }
    if (group.requiresOpenAIKey) {
      openaiApiKey = await promptForApiKey("OpenAI", process.env.OPENAI_API_KEY);
    }

    // Local-time stamp shared by the default output directory and the report id.
    const now = new Date();
    const pad = (n) => String(n).padStart(2, "0");
    const timestamp = `${now.getFullYear()}-${pad(now.getMonth() + 1)}-${pad(now.getDate())}_${pad(now.getHours())}-${pad(now.getMinutes())}-${pad(now.getSeconds())}`;
    const defaultOutputDir = path.join(process.cwd(), `batch-results-${timestamp}`);
    const { outputDir } = await prompts({
      type: "text",
      name: "outputDir",
      message: "Output directory:",
      initial: defaultOutputDir,
      validate: (value) => {
        const parentDir = path.dirname(value);
        if (!fs2.existsSync(parentDir)) {
          return `Parent directory does not exist: ${parentDir}`;
        }
        try {
          // Probe writability with a uniquely named file so an existing
          // ".write-test" in the user's directory is never overwritten or deleted.
          const testFile = path.join(parentDir, `.write-test-${process.pid}-${Date.now()}`);
          fs2.writeFileSync(testFile, "");
          fs2.unlinkSync(testFile);
          return true;
        } catch (error) {
          if (error instanceof Error) {
            if (error.message.includes("EACCES")) return `No write permission for directory: ${parentDir}`;
            if (error.message.includes("EROFS")) return `Directory is read-only: ${parentDir}`;
            return `Cannot write to directory: ${error.message}`;
          }
          return "Cannot write to directory";
        }
      }
    });
    if (!outputDir) {
      console.log("No output directory provided. Run the command again to start over.");
      process.exit(0);
    }
    fs2.mkdirSync(outputDir, { recursive: true });

    const csvBasename = path.basename(csvPath, path.extname(csvPath));
    const reportMeta = {
      csvPath: path.resolve(csvPath),
      groupId: group.id,
      reportId: `${csvBasename.replace(/[^a-zA-Z0-9]/g, "_")}_${timestamp}`,
      generatedAt: now,
      totalInputRows: inputs.length
    };
    const totalTasks = inputs.length * group.evaluatorIds.length;
    console.log(`\n\u{1F4DD} Summary:`);
    console.log(` Input rows: ${inputs.length}`);
    console.log(` Evaluators: ${group.evaluatorIds.length}`);
    console.log(` Total tasks: ${totalTasks}`);
    console.log(` Concurrency: ${concurrency}`);
    console.log(` Max retries: ${maxRetries}`);
    console.log(` Output: ${outputDir}\n`);
    const { confirm } = await prompts({
      type: "confirm",
      name: "confirm",
      message: "Start batch evaluation?",
      initial: true
    });
    if (!confirm) {
      console.log("Cancelled.");
      process.exit(0);
    }
    console.log("\n" + "=".repeat(60));

    const tracker = new ProgressTracker(totalTasks);
    const evaluationStartTime = Date.now();
    const evaluator = new BatchEvaluator({
      googleApiKey,
      openaiApiKey,
      concurrency,
      maxRetries,
      telemetry: !cliArgs.noTelemetry
    });

    let isShuttingDown = false;
    // Signal handler: first signal saves partial results and exits, a repeated signal exits at once.
    const handleShutdown = () => {
      if (isShuttingDown) {
        console.log("\n\n\u26A0\uFE0F Force quit detected. Exiting immediately...");
        process.exit(1);
      }
      isShuttingDown = true;
      console.log("\n\n\u26A0\uFE0F Shutdown requested. Saving partial results...");
      console.log(" (Press Ctrl+C again to force quit)\n");
      const partialResults = evaluator.cancel();
      if (partialResults.length > 0) {
        const durationMs = Date.now() - evaluationStartTime;
        const partialOutput = {
          results: partialResults,
          summary: {
            totalTasks: partialResults.length,
            successful: partialResults.filter((r) => r.status === "success").length,
            failed: partialResults.filter((r) => r.status === "error").length,
            durationMs,
            // NOTE(review): left empty for partial output; confirm the formatters tolerate this.
            resultsPerEvaluator: {}
          }
        };
        try {
          fs2.writeFileSync(path.join(outputDir, "results-partial.csv"), formatAsCSV(partialOutput));
          fs2.writeFileSync(path.join(outputDir, "results-partial.html"), formatAsHTML(partialOutput, reportMeta));
          console.log(`\u2713 Saved ${partialResults.length} results to:`);
          console.log(`   ${outputDir}/`);
          console.log(`   \u251C\u2500\u2500 results-partial.csv`);
          console.log(`   \u2514\u2500\u2500 results-partial.html`);
          console.log();
        } catch (error) {
          console.error("\u274C Error saving partial results:", error instanceof Error ? error.message : String(error));
        }
      } else {
        console.log("No results to save yet.\n");
      }
      process.exit(0);
    };
    process.on("SIGINT", handleShutdown);
    process.on("SIGTERM", handleShutdown);

    let output;
    try {
      output = await evaluator.evaluate(inputs, group.id, (result) => {
        tracker.update(result);
        tracker.display();
      });
    } finally {
      // The handlers are only meaningful while the evaluation is in flight.
      process.off("SIGINT", handleShutdown);
      process.off("SIGTERM", handleShutdown);
    }
    tracker.displaySummary();

    try {
      fs2.writeFileSync(path.join(outputDir, "results.csv"), formatAsCSV(output));
      fs2.writeFileSync(path.join(outputDir, "results.html"), formatAsHTML(output, reportMeta));
      console.log("\u{1F4C4} Output files generated:");
      console.log(`   ${outputDir}/`);
      console.log(`   \u251C\u2500\u2500 results.csv`);
      console.log(`   \u2514\u2500\u2500 results.html`);
      console.log();
      const htmlPath = path.join(outputDir, "results.html");
      try {
        // Best effort: opening the report is a convenience, so failures
        // (no desktop session, opener not installed) are deliberately ignored.
        exec(buildOpenCommand(htmlPath), () => {
        });
      } catch {
      }
    } catch (error) {
      console.error("\n\u274C Error writing output files:");
      if (error instanceof Error) console.error(`   ${error.message}`);
      console.error("\n\u26A0\uFE0F Evaluation completed but outputs could not be saved.");
      process.exit(1);
    }
  } catch (error) {
    console.error("\n\u274C Error:", error instanceof Error ? error.message : String(error));
    process.exit(1);
  }
}
4324
// Start the CLI. The returned promise is intentionally not awaited: main()
// reports its own failures and terminates the process itself.
void main();
4325
+ //# sourceMappingURL=cli.js.map
4326
+ //# sourceMappingURL=cli.js.map