llm-checker 3.2.8 → 3.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,798 @@
1
+ const fs = require('fs');
2
+ const os = require('os');
3
+ const path = require('path');
4
+ const { spawnSync } = require('child_process');
5
+ const YAML = require('yaml');
6
+ const {
7
+ SUPPORTED_CALIBRATION_OBJECTIVES,
8
+ calibrationObjectiveSchema,
9
+ calibrationExecutionModeSchema,
10
+ promptSuiteEntrySchema,
11
+ calibrationResultSchema,
12
+ calibrationPolicySchema,
13
+ DEFAULT_CALIBRATION_TASK
14
+ } = require('./schemas');
15
+ const { SUPPORTED_RUNTIMES, normalizeRuntime } = require('../runtime/runtime-support');
16
+
17
// Runtimes that the "full" calibration mode is able to execute prompts against.
// Frozen so that no caller can accidentally widen the allow-list at runtime.
const SUPPORTED_FULL_MODE_RUNTIMES = Object.freeze(['ollama']);
18
+
19
/**
 * Flatten a validation error into a single human-readable line.
 *
 * Each issue is rendered as `<dotted.path>: <message>` (or `root: <message>`
 * when the issue has no path) and the pieces are joined with `; `.
 *
 * @param {{ issues?: Array<{ path?: Array<string|number>, message: string }>, message?: string }} error
 *   Error object carrying an `issues` array (as produced by schema validation).
 * @returns {string} The joined issue list, or the error's own message, or
 *   `'Validation failed'` when neither is available.
 */
function formatZodIssues(error) {
    const issues = Array.isArray(error?.issues) ? error.issues : [];
    if (issues.length === 0) {
        return error?.message || 'Validation failed';
    }

    const rendered = [];
    for (const issue of issues) {
        const hasPath = issue.path && issue.path.length > 0;
        const location = hasPath ? issue.path.join('.') : 'root';
        rendered.push(`${location}: ${issue.message}`);
    }
    return rendered.join('; ');
}
31
+
32
/**
 * Normalize a task label: trimmed and lower-cased.
 *
 * @param {*} task Raw task value; falsy values are treated as empty.
 * @returns {string} The normalized name, or DEFAULT_CALIBRATION_TASK when the
 *   normalized name is empty.
 */
function toNonEmptyTaskName(task) {
    const normalized = String(task || '').trim().toLowerCase();
    if (normalized.length > 0) {
        return normalized;
    }
    return DEFAULT_CALIBRATION_TASK;
}
36
+
37
/**
 * Tell whether a file path has a YAML extension (`.yaml` or `.yml`),
 * compared case-insensitively.
 *
 * @param {string} [filePath=''] Path to inspect; falsy values count as empty.
 * @returns {boolean} True when the extension is a YAML extension.
 */
function isYamlPath(filePath = '') {
    const yamlExtensions = ['.yaml', '.yml'];
    const suffix = path.extname(String(filePath || '')).toLowerCase();
    return yamlExtensions.includes(suffix);
}
41
+
42
/**
 * Coerce a value with `Number()` and fall back when the result is not finite.
 *
 * @param {*} value Value to coerce.
 * @param {number} [fallback=0] Returned when coercion yields NaN or ±Infinity.
 * @returns {number} The finite coerced number, or `fallback`.
 */
function toNumber(value, fallback = 0) {
    const candidate = Number(value);
    if (Number.isFinite(candidate)) {
        return candidate;
    }
    return fallback;
}
46
+
47
/**
 * Parse a base-10 integer and reject negatives.
 *
 * Despite the name, zero is accepted: only values below zero (or values that
 * do not parse) produce the fallback.
 *
 * @param {*} value Value to parse; it is stringified first.
 * @param {*} fallback Returned when parsing fails or the result is negative.
 * @returns {*} The parsed integer (>= 0) or `fallback`.
 */
function toPositiveInt(value, fallback) {
    const candidate = Number.parseInt(String(value), 10);
    const usable = Number.isFinite(candidate) && candidate >= 0;
    return usable ? candidate : fallback;
}
52
+
53
/**
 * Nearest-rank percentile of a list of samples.
 *
 * Samples that are missing (`null`, `undefined`, empty string) or that do not
 * coerce to a finite number are discarded. Previously such samples were
 * coerced to 0 before the finiteness filter ran, which made the filter dead
 * code and let invalid samples drag the percentile towards zero.
 *
 * @param {Array<*>} values Samples; non-arrays and empty arrays yield 0.
 * @param {number} p Percentile in the 0-100 range; the resulting rank is
 *   clamped to the valid index range.
 * @returns {number} The selected sample, or 0 when no usable sample exists.
 */
function percentile(values, p) {
    if (!Array.isArray(values) || values.length === 0) return 0;

    const sorted = values
        .filter((value) => value !== null && value !== undefined && value !== '')
        .map((value) => Number(value))
        .filter((value) => Number.isFinite(value))
        .sort((a, b) => a - b);
    if (sorted.length === 0) return 0;

    // Nearest-rank method: rank = ceil(p/100 * N), converted to a 0-based index.
    const index = Math.ceil((p / 100) * sorted.length) - 1;
    return sorted[Math.min(Math.max(index, 0), sorted.length - 1)];
}
63
+
64
/**
 * Median of a list of samples, defined as the 50th percentile.
 *
 * @param {Array<*>} values Samples forwarded to `percentile`.
 * @returns {number} The 50th-percentile sample.
 */
function median(values) {
    const MEDIAN_PERCENTILE = 50;
    return percentile(values, MEDIAN_PERCENTILE);
}
67
+
68
/**
 * Map a quantization marker found in a model identifier to a numeric factor.
 *
 * Markers are matched as case-insensitive substrings and checked in the order
 * listed below; the first one found wins. Identifiers with no known marker
 * yield 1.0.
 *
 * @param {*} modelIdentifier Model name/tag; falsy values count as empty.
 * @returns {number} Factor associated with the first matching marker, or 1.0.
 */
function normalizeModelQuantization(modelIdentifier) {
    const identifier = String(modelIdentifier || '').toLowerCase();

    // Order matters: earlier markers take precedence over later ones.
    const markerTable = [
        ['q2', 0.25],
        ['q3', 0.375],
        ['q4', 0.5],
        ['q5', 0.625],
        ['q6', 0.75],
        ['q8', 1.0],
        ['fp16', 2.0],
        ['f16', 2.0],
        ['bf16', 2.0]
    ];

    const hit = markerTable.find(([marker]) => identifier.includes(marker));
    return hit ? hit[1] : 1.0;
}
79
+
80
/**
 * Extract the parameter count, in billions, from a model identifier.
 *
 * Looks for the first number directly followed (optionally after whitespace)
 * by the letter `b`, case-insensitively, e.g. `7b`, `0.5B`, `13 b`. The `b`
 * must not be followed by another letter, so bit-width suffixes such as `4bit`
 * or `8bits` are no longer mistaken for a parameter count.
 *
 * @param {*} modelIdentifier Model name/tag; falsy values count as empty.
 * @returns {number|null} The parameter count in billions, or null when the
 *   identifier carries no such marker.
 */
function extractParamsB(modelIdentifier) {
    const match = String(modelIdentifier || '')
        .toLowerCase()
        .match(/(\d+(?:\.\d+)?)\s*b(?![a-z])/);
    if (!match) return null;

    const value = Number.parseFloat(match[1]);
    return Number.isFinite(value) ? value : null;
}
88
+
89
/**
 * Rough peak-memory estimate for a model, derived only from its identifier.
 *
 * Computes `params (billions) * 1e9 * bytes-per-parameter`, adds a 15%
 * overhead, and converts the byte count to mebibytes.
 *
 * @param {*} modelIdentifier Model name/tag passed to `extractParamsB` and
 *   `normalizeModelQuantization`.
 * @returns {number|undefined} Rounded estimate in MiB, or undefined when no
 *   (non-zero) parameter count can be extracted from the identifier.
 */
function estimatePeakMemoryMb(modelIdentifier) {
    const BYTES_PER_MIB = 1024 * 1024;
    const OVERHEAD_FACTOR = 1.15;

    const billions = extractParamsB(modelIdentifier);
    if (!billions) {
        return undefined;
    }

    const weightBytes = billions * 1_000_000_000 * normalizeModelQuantization(modelIdentifier);
    return Math.round((weightBytes * OVERHEAD_FACTOR) / BYTES_PER_MIB);
}
98
+
99
/**
 * Approximate a token count by counting whitespace-separated words.
 *
 * @param {*} text Text to measure; falsy values count as empty.
 * @returns {number} Number of whitespace-delimited chunks, 0 for blank text.
 */
function countTokensApprox(text) {
    const trimmed = String(text || '').trim();
    return trimmed === '' ? 0 : trimmed.split(/\s+/).length;
}
104
+
105
/**
 * Derive a stable, upper-case error code from an error object.
 *
 * An explicit, non-blank `error.code` wins and is upper-cased. Otherwise the
 * lower-cased message is scanned for known phrases, checked in order.
 *
 * @param {{ code?: *, message?: * }|null|undefined} error Error to classify.
 * @returns {string} The explicit code, a code inferred from the message, or
 *   `'CALIBRATION_RUNTIME_ERROR'` when nothing matches.
 */
function normalizeErrorCode(error) {
    const declared = String(error?.code || '').trim();
    if (declared !== '') {
        return declared.toUpperCase();
    }

    const text = String(error?.message || '').toLowerCase();
    const phraseRules = [
        ['timed out', 'RUNTIME_TIMEOUT'],
        ['unsupported runtime', 'UNSUPPORTED_RUNTIME'],
        ['regex', 'QUALITY_REGEX_ERROR']
    ];
    for (const [phrase, code] of phraseRules) {
        if (text.includes(phrase)) {
            return code;
        }
    }
    return 'CALIBRATION_RUNTIME_ERROR';
}
115
+
116
/**
 * Drives model calibration: parses a JSONL prompt suite, validates run
 * options, executes prompts against a runtime, scores the responses, and
 * builds/validates/writes the calibration result and routing policy artifacts.
 */
class CalibrationManager {
    /**
     * @param {object} [options]
     * @param {Function} [options.promptExecutor] Synchronous function invoked
     *   with `{ runtime, modelIdentifier, prompt, timeoutMs }` that returns
     *   `{ output, latencyMs, ttftMs }`. Defaults to `executePromptWithRuntime`.
     */
    constructor(options = {}) {
        this.promptExecutor =
            typeof options.promptExecutor === 'function'
                ? options.promptExecutor
                : this.executePromptWithRuntime.bind(this);
    }

    /**
     * Resolve a file path against a working directory.
     *
     * @param {string} filePath Absolute or relative path.
     * @param {string} [cwd=process.cwd()] Base directory for relative paths.
     * @returns {string} `filePath` unchanged when absolute, otherwise resolved
     *   against `cwd`.
     * @throws {Error} When `filePath` is empty or not a string.
     */
    resolvePath(filePath, cwd = process.cwd()) {
        if (!filePath || typeof filePath !== 'string') {
            throw new Error('A file path is required.');
        }

        return path.isAbsolute(filePath) ? filePath : path.resolve(cwd, filePath);
    }

    /**
     * Read and validate a JSONL prompt suite (one JSON object per non-blank line).
     *
     * @param {string} suiteFilePath Path to the suite file.
     * @param {object} [options]
     * @param {string} [options.cwd] Base directory for relative paths.
     * @returns {{ path: string, entries: object[], metadata: { path: string,
     *   total_prompts: number, task_breakdown: Object<string, number> } }}
     * @throws {Error} When the file is missing, a line is not valid JSON, an
     *   entry fails schema validation, or the suite has no entries.
     */
    parsePromptSuite(suiteFilePath, options = {}) {
        const cwd = options.cwd || process.cwd();
        const resolvedPath = this.resolvePath(suiteFilePath, cwd);

        if (!fs.existsSync(resolvedPath)) {
            throw new Error(`Prompt suite file not found: ${resolvedPath}`);
        }

        const source = fs.readFileSync(resolvedPath, 'utf8');
        const lines = source.split(/\r?\n/);
        const entries = [];
        const taskBreakdown = {};

        lines.forEach((line, index) => {
            const lineNumber = index + 1;
            const trimmed = line.trim();
            if (!trimmed) {
                return;
            }

            let parsed;
            try {
                parsed = JSON.parse(trimmed);
            } catch (error) {
                throw new Error(
                    `Invalid JSON in prompt suite at line ${lineNumber}: ${error.message}`,
                    { cause: error }
                );
            }

            let validated;
            try {
                validated = promptSuiteEntrySchema.parse(parsed);
            } catch (error) {
                throw new Error(
                    `Invalid prompt suite entry at line ${lineNumber}: ${formatZodIssues(error)}`,
                    { cause: error }
                );
            }

            const task = toNonEmptyTaskName(validated.task);
            // Entries without an id get a positional one based on accepted entries so far.
            const id = validated.id || `prompt-${entries.length + 1}`;

            entries.push({
                ...validated,
                id,
                task,
                checks: Array.isArray(validated.checks) ? validated.checks : []
            });

            taskBreakdown[task] = (taskBreakdown[task] || 0) + 1;
        });

        if (entries.length === 0) {
            throw new Error('Prompt suite must contain at least one JSONL entry.');
        }

        return {
            path: resolvedPath,
            entries,
            metadata: {
                path: resolvedPath,
                total_prompts: entries.length,
                task_breakdown: taskBreakdown
            }
        };
    }

    /**
     * Expand model input into a de-duplicated list of identifiers.
     *
     * @param {string|string[]} modelInput One value or an array of values, each
     *   of which may itself be a comma-separated list.
     * @returns {string[]} Trimmed, non-empty identifiers in first-seen order.
     * @throws {Error} When no identifier remains.
     */
    parseModelIdentifiers(modelInput) {
        const values = Array.isArray(modelInput) ? modelInput : [modelInput];
        const expanded = [];

        values.forEach((entry) => {
            String(entry || '')
                .split(',')
                .map((value) => value.trim())
                .filter(Boolean)
                .forEach((value) => expanded.push(value));
        });

        const deduped = [...new Set(expanded)];
        if (deduped.length === 0) {
            throw new Error('At least one model identifier is required via --models.');
        }

        return deduped;
    }

    /**
     * Validate a runtime name against SUPPORTED_RUNTIMES.
     *
     * @param {string} [runtime] Runtime name; falsy values default to 'ollama'.
     * @returns {*} The value returned by `normalizeRuntime` for the runtime.
     * @throws {Error} When the runtime is not supported.
     */
    validateRuntime(runtime) {
        const raw = String(runtime || 'ollama').trim().toLowerCase();
        if (!SUPPORTED_RUNTIMES.includes(raw)) {
            throw new Error(
                `Unsupported runtime "${runtime}". Supported runtimes: ${SUPPORTED_RUNTIMES.join(', ')}`
            );
        }
        return normalizeRuntime(raw);
    }

    /**
     * Validate a calibration objective.
     *
     * @param {string} [objective='balanced'] Objective name (case-insensitive).
     * @returns {*} The parsed objective from `calibrationObjectiveSchema`.
     * @throws {Error} When the objective is not accepted by the schema.
     */
    validateObjective(objective = 'balanced') {
        try {
            return calibrationObjectiveSchema.parse(
                String(objective || 'balanced').trim().toLowerCase()
            );
        } catch (error) {
            throw new Error(
                `Unsupported objective "${objective}". Supported objectives: ${SUPPORTED_CALIBRATION_OBJECTIVES.join(', ')}`,
                { cause: error }
            );
        }
    }

    /**
     * Decide the execution mode from `--mode` / `--dry-run` style options.
     *
     * `dryRun` forces 'dry-run'; otherwise the provided mode is used, with
     * 'contract-only' as the default.
     *
     * @param {object} [options]
     * @param {string} [options.mode] Requested mode.
     * @param {boolean} [options.dryRun] Dry-run flag.
     * @returns {*} The parsed mode from `calibrationExecutionModeSchema`.
     * @throws {Error} When `dryRun` conflicts with `mode`, or the mode is invalid.
     */
    resolveExecutionMode(options = {}) {
        const providedMode = options.mode ? String(options.mode).trim().toLowerCase() : null;
        const dryRun = Boolean(options.dryRun);

        if (dryRun && providedMode && providedMode !== 'dry-run') {
            throw new Error('Do not combine --dry-run with --mode other than "dry-run".');
        }

        const mode = dryRun ? 'dry-run' : providedMode || 'contract-only';

        try {
            return calibrationExecutionModeSchema.parse(mode);
        } catch (error) {
            throw new Error('Invalid execution mode. Use one of: dry-run, contract-only, full.', {
                cause: error
            });
        }
    }

    /**
     * Describe the local machine from `os` data.
     *
     * @returns {{ fingerprint: string, description: string }} A
     *   `<platform>-<arch>-<ram>gb` fingerprint and a `<cpu> | <ram>GB RAM` label.
     */
    getLocalHardwareSummary() {
        const cpuModel = os.cpus()?.[0]?.model || os.arch();
        const totalRamGb = Math.round(os.totalmem() / (1024 ** 3));

        return {
            fingerprint: `${os.platform()}-${os.arch()}-${totalRamGb}gb`,
            description: `${cpuModel} | ${totalRamGb}GB RAM`
        };
    }

    /**
     * Build a calibration result in which every model is still 'pending'.
     *
     * @param {object} params
     * @param {string[]} params.models Model identifiers.
     * @param {object} params.suiteMetadata Suite metadata stored under `suite`.
     * @param {string} params.runtime Runtime name.
     * @param {string} params.objective Calibration objective.
     * @param {string} params.executionMode Execution mode.
     * @param {object} [params.hardware] Hardware summary; defaults to the local one.
     * @param {string} [params.calibrationVersion] Version label; defaults to a
     *   `contract-<timestamp>` label.
     * @returns {*} The result as returned by `validateCalibrationResult`.
     * @throws {Error} When the assembled result fails schema validation.
     */
    buildDraftCalibrationResult({
        models,
        suiteMetadata,
        runtime,
        objective,
        executionMode,
        hardware,
        calibrationVersion
    }) {
        const modelResults = models.map((modelIdentifier) => ({
            model_identifier: modelIdentifier,
            status: 'pending'
        }));

        const summary = {
            total_models: modelResults.length,
            successful_models: 0,
            failed_models: 0,
            skipped_models: 0,
            pending_models: modelResults.length
        };

        const result = {
            schema_version: '1.0',
            generated_at: new Date().toISOString(),
            calibration_version:
                calibrationVersion || `contract-${new Date().toISOString().replace(/[:.]/g, '-')}`,
            execution_mode: executionMode,
            runtime,
            objective,
            hardware: hardware || this.getLocalHardwareSummary(),
            suite: suiteMetadata,
            models: modelResults,
            summary
        };

        return this.validateCalibrationResult(result);
    }

    /**
     * Guard that a runtime can be used in full calibration mode.
     *
     * @param {string} runtime Runtime name.
     * @throws {Error} When the runtime is not in SUPPORTED_FULL_MODE_RUNTIMES.
     */
    ensureFullModeRuntime(runtime) {
        if (!SUPPORTED_FULL_MODE_RUNTIMES.includes(runtime)) {
            throw new Error(
                `Full calibration mode currently supports: ${SUPPORTED_FULL_MODE_RUNTIMES.join(', ')}.`
            );
        }
    }

    /**
     * Run one prompt synchronously through `ollama run <model> <prompt>`.
     *
     * @param {object} params
     * @param {string} params.runtime Runtime name (must support full mode).
     * @param {string} params.modelIdentifier Model passed to `ollama run`.
     * @param {string} params.prompt Prompt text passed as a single argument.
     * @param {number} [params.timeoutMs=120000] Child-process timeout.
     * @returns {{ output: string, latencyMs: number, ttftMs: number }} Trimmed
     *   stdout and wall-clock latency. `ttftMs` is reported equal to
     *   `latencyMs` because the output is only available once the process ends.
     * @throws {Error} With a `code` property when the process cannot be
     *   spawned, times out, is killed by a signal, or exits non-zero.
     */
    executePromptWithRuntime({ runtime, modelIdentifier, prompt, timeoutMs = 120000 }) {
        this.ensureFullModeRuntime(runtime);

        const started = process.hrtime.bigint();
        const result = spawnSync('ollama', ['run', modelIdentifier, prompt], {
            encoding: 'utf8',
            timeout: timeoutMs,
            maxBuffer: 20 * 1024 * 1024,
            env: {
                ...process.env,
                NO_COLOR: '1'
            }
        });
        const latencyMs = Number((process.hrtime.bigint() - started) / 1_000_000n);

        if (result.error) {
            const error = new Error(result.error.message || 'Failed to execute runtime prompt.', {
                cause: result.error
            });
            error.code = result.error.code || 'RUNTIME_EXECUTION_ERROR';
            throw error;
        }

        if (result.status !== 0) {
            const message = String(result.stderr || result.stdout || '')
                .trim()
                .slice(0, 500);
            // A child killed by a signal has a null status; name the signal
            // instead of reporting "status code null".
            const fallbackMessage = result.signal
                ? `Runtime command was terminated by signal ${result.signal}`
                : `Runtime command exited with status code ${result.status}`;
            const error = new Error(message || fallbackMessage);
            error.code = 'RUNTIME_EXECUTION_ERROR';
            throw error;
        }

        const output = String(result.stdout || '').trim();

        return {
            output,
            latencyMs,
            ttftMs: latencyMs
        };
    }

    /**
     * Score a response against a list of weighted checks.
     *
     * Supported check types: 'exact' (trimmed equality), 'contains'
     * (substring) and 'regex' (pattern test). A check whose weight is missing,
     * non-numeric or not positive counts with weight 1. A check that throws
     * (e.g. an invalid regex) or has an unsupported type fails and carries an
     * `error` message in its result.
     *
     * @param {*} responseText Model response; falsy values count as empty.
     * @param {Array<{ type: string, expected?: *, weight?: * }>} [checks=[]]
     * @returns {{ passedWeight: number, totalWeight: number, passRate: number,
     *   checkResults: object[] }} `passRate` is 1 when there is nothing to check.
     */
    evaluatePromptChecks(responseText, checks = []) {
        if (!Array.isArray(checks) || checks.length === 0) {
            return {
                passedWeight: 0,
                totalWeight: 0,
                passRate: 1,
                checkResults: []
            };
        }

        let passedWeight = 0;
        let totalWeight = 0;
        const checkResults = [];
        const response = String(responseText || '');

        checks.forEach((check) => {
            const parsedWeight = Number(check.weight);
            const weight = Number.isFinite(parsedWeight) && parsedWeight > 0 ? parsedWeight : 1;
            totalWeight += weight;

            let passed = false;
            let error = undefined;
            const expected = String(check.expected || '');

            try {
                if (check.type === 'exact') {
                    passed = response.trim() === expected.trim();
                } else if (check.type === 'contains') {
                    passed = response.includes(expected);
                } else if (check.type === 'regex') {
                    const expression = new RegExp(expected);
                    passed = expression.test(response);
                } else {
                    // Unknown types used to fail silently; surface why.
                    error = `Unsupported check type "${check.type}"`;
                }
            } catch (reason) {
                passed = false;
                error = String(reason.message || reason);
            }

            if (passed) {
                passedWeight += weight;
            }

            checkResults.push({
                type: check.type,
                expected,
                weight,
                passed,
                error
            });
        });

        return {
            passedWeight,
            totalWeight,
            passRate: totalWeight > 0 ? passedWeight / totalWeight : 1,
            checkResults
        };
    }

    /**
     * Execute a prompt with discarded warm-up runs followed by measured runs.
     *
     * @param {object} params
     * @param {string} params.runtime Runtime name.
     * @param {string} params.modelIdentifier Model identifier.
     * @param {string} params.prompt Prompt text.
     * @param {number} params.warmupRuns Number of unmeasured runs.
     * @param {number} params.measuredIterations Number of measured runs.
     * @param {number} params.timeoutMs Per-run timeout.
     * @returns {{ response: string, latencies: number[], ttfts: number[],
     *   totalTokens: number, averageOutputTokens: number }} `response` is the
     *   output of the last measured run; token counts are word-based estimates.
     * @throws {Error} When no measured iteration ran, or the executor throws.
     */
    runPromptWithWarmup({
        runtime,
        modelIdentifier,
        prompt,
        warmupRuns,
        measuredIterations,
        timeoutMs
    }) {
        for (let index = 0; index < warmupRuns; index += 1) {
            this.promptExecutor({
                runtime,
                modelIdentifier,
                prompt,
                timeoutMs
            });
        }

        const measured = [];
        for (let iteration = 0; iteration < measuredIterations; iteration += 1) {
            const run = this.promptExecutor({
                runtime,
                modelIdentifier,
                prompt,
                timeoutMs
            });

            measured.push({
                output: String(run.output || ''),
                latencyMs: toNumber(run.latencyMs, 0),
                ttftMs:
                    run.ttftMs === undefined || run.ttftMs === null
                        ? undefined
                        : toNumber(run.ttftMs, 0)
            });
        }

        if (measured.length === 0) {
            throw new Error('Measured iterations must be >= 1.');
        }

        const latencies = measured.map((entry) => entry.latencyMs);
        const ttfts = measured
            .map((entry) => entry.ttftMs)
            .filter((value) => value !== undefined && Number.isFinite(value));
        const totalTokens = measured.reduce(
            (accumulator, entry) => accumulator + countTokensApprox(entry.output),
            0
        );
        const averageOutputTokens = Math.round(totalTokens / measured.length);
        const representativeResponse = measured[measured.length - 1].output;

        return {
            response: representativeResponse,
            latencies,
            ttfts,
            totalTokens,
            averageOutputTokens
        };
    }

    /**
     * Run every suite entry against one model and aggregate metrics and quality.
     *
     * Quality: each task's score is the weighted check pass percentage (100
     * for a task with no checks); the overall score is the unweighted mean of
     * the task scores. Speed: tokens per second is total estimated tokens over
     * total measured latency.
     *
     * @param {object} params
     * @param {string} params.modelIdentifier Model identifier.
     * @param {object[]} params.suiteEntries Parsed suite entries.
     * @param {string} params.runtime Runtime name.
     * @param {number} params.warmupRuns Warm-up runs per prompt.
     * @param {number} params.measuredIterations Measured runs per prompt.
     * @param {number} params.timeoutMs Per-run timeout.
     * @returns {object} A model result with status 'success', `metrics`,
     *   `quality` and `traces`.
     * @throws {Error} When any prompt execution fails.
     */
    evaluateModel({
        modelIdentifier,
        suiteEntries,
        runtime,
        warmupRuns,
        measuredIterations,
        timeoutMs
    }) {
        const allLatencies = [];
        const allTtfts = [];
        let totalTokens = 0;
        let totalCheckWeight = 0;
        let passedCheckWeight = 0;
        const taskWeightMap = {};
        const taskPassedMap = {};
        const promptRuns = [];

        for (const entry of suiteEntries) {
            const execution = this.runPromptWithWarmup({
                runtime,
                modelIdentifier,
                prompt: entry.prompt,
                warmupRuns,
                measuredIterations,
                timeoutMs
            });

            const checkEvaluation = this.evaluatePromptChecks(execution.response, entry.checks);
            const task = toNonEmptyTaskName(entry.task);

            taskWeightMap[task] = (taskWeightMap[task] || 0) + checkEvaluation.totalWeight;
            taskPassedMap[task] = (taskPassedMap[task] || 0) + checkEvaluation.passedWeight;

            totalCheckWeight += checkEvaluation.totalWeight;
            passedCheckWeight += checkEvaluation.passedWeight;
            totalTokens += execution.totalTokens;

            allLatencies.push(...execution.latencies);
            allTtfts.push(...execution.ttfts);

            promptRuns.push({
                prompt_id: entry.id,
                task,
                latency_ms: median(execution.latencies),
                ttft_ms: execution.ttfts.length > 0 ? median(execution.ttfts) : undefined,
                output_tokens: execution.averageOutputTokens,
                response_excerpt: execution.response.slice(0, 400),
                check_results: checkEvaluation.checkResults,
                check_pass_rate: checkEvaluation.passRate
            });
        }

        const taskScores = {};
        Object.keys(taskWeightMap).forEach((task) => {
            const taskWeight = taskWeightMap[task];
            const taskPassed = taskPassedMap[task] || 0;
            taskScores[task] = taskWeight > 0 ? (taskPassed / taskWeight) * 100 : 100;
        });

        const checkPassRate = totalCheckWeight > 0 ? passedCheckWeight / totalCheckWeight : 1;
        const taskScoreValues = Object.values(taskScores);
        const overallScore =
            taskScoreValues.length > 0
                ? taskScoreValues.reduce((sum, value) => sum + value, 0) / taskScoreValues.length
                : checkPassRate * 100;

        const totalLatencyMs = allLatencies.reduce((sum, value) => sum + value, 0);
        const totalLatencySec = totalLatencyMs > 0 ? totalLatencyMs / 1000 : 0;
        const tokensPerSecond = totalLatencySec > 0 ? totalTokens / totalLatencySec : 0;

        return {
            model_identifier: modelIdentifier,
            status: 'success',
            metrics: {
                ttft_ms:
                    allTtfts.length > 0 ? percentile(allTtfts, 50) : percentile(allLatencies, 50),
                tokens_per_second: tokensPerSecond,
                latency_ms_p50: percentile(allLatencies, 50),
                latency_ms_p95: percentile(allLatencies, 95),
                peak_memory_mb: estimatePeakMemoryMb(modelIdentifier)
            },
            quality: {
                overall_score: overallScore,
                task_scores: taskScores,
                check_pass_rate: checkPassRate
            },
            traces: {
                warmup_runs: warmupRuns,
                measured_iterations: measuredIterations,
                prompt_runs: promptRuns
            }
        };
    }

    /**
     * Run the full calibration for every model and assemble the result.
     *
     * A model whose evaluation throws is recorded with status 'failed' (plus
     * the error message and a normalized error code) instead of aborting.
     *
     * @param {object} params
     * @param {string[]} params.models Model identifiers.
     * @param {{ entries: object[], metadata: object }} params.suite Parsed suite.
     * @param {string} params.runtime Runtime name (must support full mode).
     * @param {string} params.objective Calibration objective.
     * @param {object} [params.hardware] Hardware summary; defaults to the local one.
     * @param {string} [params.calibrationVersion] Version label; defaults to a
     *   `full-<timestamp>` label.
     * @param {object} [params.benchmarkConfig] `warmupRuns` (default 1),
     *   `measuredIterations` (default 2, minimum 1), `timeoutMs` (default
     *   120000, minimum 1000).
     * @returns {*} The result as returned by `validateCalibrationResult`.
     * @throws {Error} When the runtime is unsupported or validation fails.
     */
    runFullCalibration({
        models,
        suite,
        runtime,
        objective,
        hardware,
        calibrationVersion,
        benchmarkConfig = {}
    }) {
        this.ensureFullModeRuntime(runtime);

        const warmupRuns = toPositiveInt(benchmarkConfig.warmupRuns, 1);
        const measuredIterations = Math.max(
            toPositiveInt(benchmarkConfig.measuredIterations, 2),
            1
        );
        const timeoutMs = Math.max(toPositiveInt(benchmarkConfig.timeoutMs, 120000), 1000);

        const modelResults = models.map((modelIdentifier) => {
            try {
                return this.evaluateModel({
                    modelIdentifier,
                    suiteEntries: suite.entries,
                    runtime,
                    warmupRuns,
                    measuredIterations,
                    timeoutMs
                });
            } catch (error) {
                return {
                    model_identifier: modelIdentifier,
                    status: 'failed',
                    error: String(error.message || 'Calibration execution failed.'),
                    traces: {
                        warmup_runs: warmupRuns,
                        measured_iterations: measuredIterations,
                        error_code: normalizeErrorCode(error)
                    }
                };
            }
        });

        const summary = {
            total_models: modelResults.length,
            successful_models: modelResults.filter((entry) => entry.status === 'success').length,
            failed_models: modelResults.filter((entry) => entry.status === 'failed').length,
            skipped_models: modelResults.filter((entry) => entry.status === 'skipped').length,
            pending_models: modelResults.filter((entry) => entry.status === 'pending').length
        };

        const result = {
            schema_version: '1.0',
            generated_at: new Date().toISOString(),
            calibration_version:
                calibrationVersion || `full-${new Date().toISOString().replace(/[:.]/g, '-')}`,
            execution_mode: 'full',
            runtime,
            objective,
            hardware: hardware || this.getLocalHardwareSummary(),
            suite: suite.metadata,
            models: modelResults,
            summary
        };

        return this.validateCalibrationResult(result);
    }

    /**
     * Rank successful models for one task.
     *
     * Quality is the task score (falling back to the overall score). Raw speed
     * is `tokens_per_second - latency_ms_p50 / 1000`, min-max scaled to 0-100
     * across the candidates (50 for all when they are equal). The combined
     * score weights speed/quality 75/25 for 'speed', 20/80 for 'quality', and
     * 50/50 otherwise.
     *
     * @param {object} params
     * @param {string} params.task Task name.
     * @param {object[]} params.successfulModels Model results with status 'success'.
     * @param {string} params.objective Calibration objective.
     * @returns {object[]} Candidates sorted by combined score, then quality
     *   score (both descending), then model identifier (ascending).
     */
    computeTaskCandidates({ task, successfulModels, objective }) {
        const candidates = successfulModels.map((model) => {
            const qualityScore = toNumber(
                model.quality?.task_scores?.[task],
                toNumber(model.quality?.overall_score, 0)
            );
            const speedRaw =
                toNumber(model.metrics?.tokens_per_second, 0) -
                toNumber(model.metrics?.latency_ms_p50, 0) / 1000;
            return {
                model_identifier: model.model_identifier,
                qualityScore,
                speedRaw
            };
        });

        const speedValues = candidates.map((entry) => entry.speedRaw);
        const minSpeed = Math.min(...speedValues);
        const maxSpeed = Math.max(...speedValues);
        const speedRange = maxSpeed - minSpeed;

        const weighted = candidates.map((entry) => {
            const speedScore =
                speedRange > 0 ? ((entry.speedRaw - minSpeed) / speedRange) * 100 : 50;
            let combinedScore = 0;
            if (objective === 'speed') {
                combinedScore = speedScore * 0.75 + entry.qualityScore * 0.25;
            } else if (objective === 'quality') {
                combinedScore = entry.qualityScore * 0.8 + speedScore * 0.2;
            } else {
                combinedScore = entry.qualityScore * 0.5 + speedScore * 0.5;
            }

            return {
                ...entry,
                speedScore,
                combinedScore
            };
        });

        return weighted.sort((left, right) => {
            if (right.combinedScore !== left.combinedScore) {
                return right.combinedScore - left.combinedScore;
            }
            if (right.qualityScore !== left.qualityScore) {
                return right.qualityScore - left.qualityScore;
            }
            return left.model_identifier.localeCompare(right.model_identifier);
        });
    }

    /**
     * Build per-task routing (primary model plus ordered fallbacks) from a
     * calibration result.
     *
     * Models scoring below the minimum quality of 50 for a task are excluded,
     * unless that would leave no candidate, in which case all ranked models
     * are kept.
     *
     * @param {object} calibrationResult Calibration result with `models`,
     *   `suite.task_breakdown` and `objective`.
     * @returns {Object<string, { primary: string, fallbacks: string[],
     *   min_quality: number, rationale: string }>} Routing keyed by task.
     * @throws {Error} When no model has status 'success'.
     */
    synthesizePolicyRoutes(calibrationResult) {
        const successfulModels = calibrationResult.models.filter(
            (entry) => entry.status === 'success'
        );

        if (successfulModels.length === 0) {
            throw new Error(
                'Cannot synthesize policy: no successful model calibration results found.'
            );
        }

        const tasks = Object.keys(calibrationResult.suite?.task_breakdown || {});
        const taskList = tasks.length > 0 ? tasks : [DEFAULT_CALIBRATION_TASK];
        const routing = {};

        taskList.forEach((task) => {
            const ranked = this.computeTaskCandidates({
                task,
                successfulModels,
                objective: calibrationResult.objective
            });

            const minimumQuality = 50;
            const eligible = ranked.filter((entry) => entry.qualityScore >= minimumQuality);
            const selected = eligible.length > 0 ? eligible : ranked;

            const primary = selected[0];
            const fallbacks = selected.slice(1).map((entry) => entry.model_identifier);

            routing[task] = {
                primary: primary.model_identifier,
                fallbacks,
                min_quality: minimumQuality,
                rationale: `objective=${calibrationResult.objective}; combined=${primary.combinedScore.toFixed(
                    2
                )}; quality=${primary.qualityScore.toFixed(2)}; speed=${primary.speedScore.toFixed(2)}`
            };
        });

        return routing;
    }

    /**
     * Build a routing policy from a calibration result.
     *
     * A 'full' result with at least one successful model gets measured
     * routing; otherwise every task is routed to the first listed model, with
     * the remaining models as fallbacks in input order.
     *
     * @param {object} params
     * @param {object} params.calibrationResult Calibration result.
     * @param {string} [params.calibrationResultPath] Path recorded as the source.
     * @returns {*} The policy as returned by `validateCalibrationPolicy`.
     * @throws {Error} When the result lists no models or validation fails.
     */
    buildDraftCalibrationPolicy({ calibrationResult, calibrationResultPath }) {
        const modelIdentifiers = calibrationResult.models.map((entry) => entry.model_identifier);
        if (modelIdentifiers.length === 0) {
            throw new Error('Calibration policy generation requires at least one model result.');
        }

        let routing;
        if (
            calibrationResult.execution_mode === 'full' &&
            calibrationResult.models.some((entry) => entry.status === 'success')
        ) {
            routing = this.synthesizePolicyRoutes(calibrationResult);
        } else {
            const tasks = Object.keys(calibrationResult.suite?.task_breakdown || {});
            const taskRoutes = tasks.length > 0 ? tasks : [DEFAULT_CALIBRATION_TASK];
            routing = {};
            taskRoutes.forEach((taskName) => {
                routing[taskName] = {
                    primary: modelIdentifiers[0],
                    fallbacks: modelIdentifiers.slice(1),
                    rationale: 'Draft routing generated from calibration contract output.'
                };
            });
        }

        const policy = {
            schema_version: '1.0',
            generated_at: new Date().toISOString(),
            objective: calibrationResult.objective,
            source: {
                calibration_version: calibrationResult.calibration_version,
                calibration_result_path: calibrationResultPath || undefined
            },
            routing,
            metadata: {
                runtime: calibrationResult.runtime,
                hardware_fingerprint: calibrationResult.hardware?.fingerprint || undefined
            }
        };

        return this.validateCalibrationPolicy(policy);
    }

    /**
     * Validate a payload against `calibrationResultSchema`.
     *
     * @param {*} payload Candidate calibration result.
     * @returns {*} The parsed payload.
     * @throws {Error} With a flattened issue list when validation fails.
     */
    validateCalibrationResult(payload) {
        try {
            return calibrationResultSchema.parse(payload);
        } catch (error) {
            throw new Error(`Invalid calibration result payload: ${formatZodIssues(error)}`, {
                cause: error
            });
        }
    }

    /**
     * Validate a payload against `calibrationPolicySchema`.
     *
     * @param {*} payload Candidate calibration policy.
     * @returns {*} The parsed payload.
     * @throws {Error} With a flattened issue list when validation fails.
     */
    validateCalibrationPolicy(payload) {
        try {
            return calibrationPolicySchema.parse(payload);
        } catch (error) {
            throw new Error(`Invalid calibration policy payload: ${formatZodIssues(error)}`, {
                cause: error
            });
        }
    }

    /**
     * Serialize a payload to disk, creating parent directories as needed.
     *
     * Paths ending in `.yaml`/`.yml` are written as YAML; everything else is
     * written as 2-space-indented JSON with a trailing newline.
     *
     * @param {string} filePath Destination path (absolute or relative).
     * @param {*} payload Data to serialize.
     * @param {object} [options]
     * @param {string} [options.cwd] Base directory for relative paths.
     * @returns {string} The resolved path that was written.
     * @throws {Error} When the destination is an existing directory.
     */
    writeArtifact(filePath, payload, options = {}) {
        const cwd = options.cwd || process.cwd();
        const resolvedPath = this.resolvePath(filePath, cwd);

        if (fs.existsSync(resolvedPath) && fs.statSync(resolvedPath).isDirectory()) {
            throw new Error(`Output path must be a file, received directory: ${resolvedPath}`);
        }

        const serialized = isYamlPath(resolvedPath)
            ? `${YAML.stringify(payload)}`
            : `${JSON.stringify(payload, null, 2)}\n`;

        fs.mkdirSync(path.dirname(resolvedPath), { recursive: true });
        fs.writeFileSync(resolvedPath, serialized, 'utf8');
        return resolvedPath;
    }
}
795
+
796
// Public surface of this module: only the manager class is exported.
module.exports = { CalibrationManager };