llm-checker 3.2.8 → 3.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +119 -17
- package/bin/enhanced_cli.js +516 -3
- package/package.json +1 -1
- package/src/calibration/calibration-manager.js +798 -0
- package/src/calibration/policy-routing.js +376 -0
- package/src/calibration/schemas.js +212 -0
- package/src/hardware/backends/cuda-detector.js +355 -5
- package/src/ollama/capacity-planner.js +399 -0
|
@@ -0,0 +1,798 @@
|
|
|
1
|
+
const fs = require('fs');
|
|
2
|
+
const os = require('os');
|
|
3
|
+
const path = require('path');
|
|
4
|
+
const { spawnSync } = require('child_process');
|
|
5
|
+
const YAML = require('yaml');
|
|
6
|
+
const {
|
|
7
|
+
SUPPORTED_CALIBRATION_OBJECTIVES,
|
|
8
|
+
calibrationObjectiveSchema,
|
|
9
|
+
calibrationExecutionModeSchema,
|
|
10
|
+
promptSuiteEntrySchema,
|
|
11
|
+
calibrationResultSchema,
|
|
12
|
+
calibrationPolicySchema,
|
|
13
|
+
DEFAULT_CALIBRATION_TASK
|
|
14
|
+
} = require('./schemas');
|
|
15
|
+
const { SUPPORTED_RUNTIMES, normalizeRuntime } = require('../runtime/runtime-support');
|
|
16
|
+
|
|
17
|
+
// Runtimes accepted by CalibrationManager#ensureFullModeRuntime for full-mode calibration runs.
const SUPPORTED_FULL_MODE_RUNTIMES = ['ollama'];
|
|
18
|
+
|
|
19
|
+
/**
 * Render a validation error as a single human-readable line.
 *
 * @param {{ issues?: Array<{ path?: Array<string|number>, message: string }>, message?: string }} error
 *   Error object; when it carries a non-empty `issues` array each issue is rendered.
 * @returns {string} `"<path>: <message>"` pairs joined by `"; "`, or the error's own
 *   message (falling back to `"Validation failed"`) when there are no issues.
 */
function formatZodIssues(error) {
  const issues = error && Array.isArray(error.issues) ? error.issues : [];
  if (issues.length === 0) {
    return error?.message || 'Validation failed';
  }

  const rendered = [];
  for (const issue of issues) {
    // An empty or missing path means the problem is on the top-level value.
    const where = issue.path?.length > 0 ? issue.path.join('.') : 'root';
    rendered.push(`${where}: ${issue.message}`);
  }
  return rendered.join('; ');
}
|
|
31
|
+
|
|
32
|
+
/**
 * Normalize a task label to trimmed lower case.
 *
 * @param {*} task Raw task value; falsy values are treated as empty.
 * @returns {string} The normalized label, or DEFAULT_CALIBRATION_TASK when the
 *   label is empty after trimming.
 */
function toNonEmptyTaskName(task) {
  const normalized = String(task || '')
    .trim()
    .toLowerCase();
  if (normalized) {
    return normalized;
  }
  return DEFAULT_CALIBRATION_TASK;
}
|
|
36
|
+
|
|
37
|
+
/**
 * Tell whether a path has a YAML file extension (`.yaml` or `.yml`, any case).
 *
 * @param {string} [filePath=''] Path to inspect; falsy values are treated as empty.
 * @returns {boolean} True when the extension is `.yaml` or `.yml`.
 */
function isYamlPath(filePath = '') {
  const suffix = path.extname(String(filePath || '')).toLowerCase();
  return ['.yaml', '.yml'].includes(suffix);
}
|
|
41
|
+
|
|
42
|
+
/**
 * Coerce a value with `Number()` and substitute a fallback for non-finite results.
 *
 * Note that `Number(null)` and `Number('')` are both 0, so those inputs yield 0
 * rather than the fallback.
 *
 * @param {*} value Value to coerce.
 * @param {number} [fallback=0] Returned when the coerced value is NaN or ±Infinity.
 * @returns {number} The finite coerced number, or `fallback`.
 */
function toNumber(value, fallback = 0) {
  const coerced = Number(value);
  if (Number.isFinite(coerced)) {
    return coerced;
  }
  return fallback;
}
|
|
46
|
+
|
|
47
|
+
/**
 * Parse a base-10 integer and reject negatives.
 *
 * Despite the name, zero is accepted: only values below 0 (or unparsable input)
 * produce the fallback. Parsing follows `Number.parseInt`, so a numeric prefix
 * such as `"7.9"` yields 7.
 *
 * @param {*} value Value to parse (stringified first).
 * @param {*} fallback Returned when parsing fails or the result is negative.
 * @returns {*} The parsed non-negative integer, or `fallback`.
 */
function toPositiveInt(value, fallback) {
  const candidate = Number.parseInt(String(value), 10);
  const usable = Number.isFinite(candidate) && candidate >= 0;
  return usable ? candidate : fallback;
}
|
|
52
|
+
|
|
53
|
+
/**
 * Nearest-rank percentile of a list of numeric samples.
 *
 * Samples are coerced with `Number()`; any sample that is not finite after
 * coercion (undefined, NaN, ±Infinity, non-numeric strings) is discarded
 * instead of being counted as 0, so bad readings cannot drag the result down.
 *
 * @param {Array<*>} values Samples to rank.
 * @param {number} p Percentile in the range 0–100; out-of-range values are
 *   clamped and a non-finite `p` is treated as 0 (the minimum sample).
 * @returns {number} The selected sample, or 0 when `values` is not an array or
 *   contains no finite samples.
 */
function percentile(values, p) {
  if (!Array.isArray(values) || values.length === 0) return 0;

  const sorted = values
    .map((value) => Number(value))
    .filter((value) => Number.isFinite(value))
    .sort((a, b) => a - b);
  if (sorted.length === 0) return 0;

  const requested = Number(p);
  const bounded = Number.isFinite(requested) ? Math.min(Math.max(requested, 0), 100) : 0;

  // Nearest-rank: the smallest sample with at least `bounded` percent of samples at or below it.
  const index = Math.ceil((bounded / 100) * sorted.length) - 1;
  return sorted[Math.min(Math.max(index, 0), sorted.length - 1)];
}
|
|
63
|
+
|
|
64
|
+
/**
 * Median of a list of samples, defined as the 50th percentile returned by
 * `percentile`.
 *
 * @param {Array<*>} values Samples to rank.
 * @returns {number} Whatever `percentile(values, 50)` returns.
 */
function median(values) {
  const midpoint = 50;
  return percentile(values, midpoint);
}
|
|
67
|
+
|
|
68
|
+
/**
 * Map a quantization tag found in a model identifier to the bytes-per-parameter
 * factor used by the memory estimate.
 *
 * Matching is a case-insensitive substring search; the first tag in the table
 * below that occurs anywhere in the identifier wins.
 *
 * @param {*} modelIdentifier Model name/tag; falsy values are treated as empty.
 * @returns {number} Bytes-per-parameter factor; 1.0 when no known tag is present.
 */
function normalizeModelQuantization(modelIdentifier) {
  const haystack = String(modelIdentifier || '').toLowerCase();

  // Order matters: earlier rows take precedence when several tags are present.
  const factorsByTag = [
    ['q2', 0.25],
    ['q3', 0.375],
    ['q4', 0.5],
    ['q5', 0.625],
    ['q6', 0.75],
    ['q8', 1.0],
    ['fp16', 2.0],
    ['f16', 2.0],
    ['bf16', 2.0]
  ];

  for (const [tag, factor] of factorsByTag) {
    if (haystack.includes(tag)) {
      return factor;
    }
  }
  return 1.0;
}
|
|
79
|
+
|
|
80
|
+
/**
 * Extract a parameter count in billions from a model identifier.
 *
 * Looks for the first number (optionally decimal) that is followed, after
 * optional whitespace, by the letter "b" (case-insensitive), e.g. `7b` or `6.7B`.
 *
 * @param {*} modelIdentifier Model name/tag; falsy values are treated as empty.
 * @returns {number|null} The parsed number, or null when no such token exists.
 */
function extractParamsB(modelIdentifier) {
  const sizePattern = /(\d+(?:\.\d+)?)\s*b/;
  const found = sizePattern.exec(String(modelIdentifier || '').toLowerCase());
  if (found === null) {
    return null;
  }

  const billions = Number.parseFloat(found[1]);
  if (!Number.isFinite(billions)) {
    return null;
  }
  return billions;
}
|
|
88
|
+
|
|
89
|
+
/**
 * Rough memory estimate for a model, derived purely from its identifier.
 *
 * Computes parameters (billions, via `extractParamsB`) × 1e9 × bytes-per-parameter
 * (via `normalizeModelQuantization`), adds 15% overhead, and converts to MiB.
 *
 * @param {*} modelIdentifier Model name/tag.
 * @returns {number|undefined} Rounded estimate in MiB, or undefined when no
 *   parameter count can be read from the identifier.
 */
function estimatePeakMemoryMb(modelIdentifier) {
  const billions = extractParamsB(modelIdentifier);
  if (!billions) {
    return undefined;
  }

  const BYTES_PER_MIB = 1024 * 1024;
  const weightBytes = billions * 1_000_000_000 * normalizeModelQuantization(modelIdentifier);
  return Math.round((weightBytes * 1.15) / BYTES_PER_MIB);
}
|
|
98
|
+
|
|
99
|
+
/**
 * Approximate a token count by counting whitespace-separated words.
 *
 * @param {*} text Text to measure; falsy values are treated as empty.
 * @returns {number} Number of whitespace-delimited chunks, 0 for blank input.
 */
function countTokensApprox(text) {
  const trimmed = String(text || '').trim();
  return trimmed ? trimmed.split(/\s+/).length : 0;
}
|
|
104
|
+
|
|
105
|
+
/**
 * Derive a stable, upper-case error code for a failed calibration run.
 *
 * An explicit `error.code` always wins. Otherwise the message is scanned
 * (case-insensitively) for known phrases, checked in the order listed below.
 *
 * @param {{ code?: *, message?: * }|null|undefined} error Error-like object.
 * @returns {string} The resolved code; `CALIBRATION_RUNTIME_ERROR` when nothing matches.
 */
function normalizeErrorCode(error) {
  const declared = String(error?.code || '').trim();
  if (declared) {
    return declared.toUpperCase();
  }

  const text = String(error?.message || '').toLowerCase();
  const phraseCodes = [
    ['timed out', 'RUNTIME_TIMEOUT'],
    ['unsupported runtime', 'UNSUPPORTED_RUNTIME'],
    ['regex', 'QUALITY_REGEX_ERROR']
  ];
  const hit = phraseCodes.find(([phrase]) => text.includes(phrase));
  return hit ? hit[1] : 'CALIBRATION_RUNTIME_ERROR';
}
|
|
115
|
+
|
|
116
|
+
/**
 * Orchestrates model calibration: parses prompt suites, runs prompts against a
 * runtime, scores responses, synthesizes routing policies and writes artifacts.
 *
 * Prompt execution is synchronous: `promptExecutor` is called and its return
 * value is read immediately, so a custom executor must return its result
 * directly rather than a Promise.
 */
class CalibrationManager {
  /**
   * @param {object} [options]
   * @param {Function} [options.promptExecutor] Replacement for
   *   `executePromptWithRuntime`; receives `{ runtime, modelIdentifier, prompt, timeoutMs }`
   *   and returns `{ output, latencyMs, ttftMs? }`.
   */
  constructor(options = {}) {
    this.promptExecutor =
      typeof options.promptExecutor === 'function'
        ? options.promptExecutor
        : this.executePromptWithRuntime.bind(this);
  }

  /**
   * Resolve a path against a working directory.
   *
   * @param {string} filePath Absolute or relative path.
   * @param {string} [cwd=process.cwd()] Base directory for relative paths.
   * @returns {string} Absolute path.
   * @throws {Error} When `filePath` is empty or not a string.
   */
  resolvePath(filePath, cwd = process.cwd()) {
    if (!filePath || typeof filePath !== 'string') {
      throw new Error('A file path is required.');
    }

    return path.isAbsolute(filePath) ? filePath : path.resolve(cwd, filePath);
  }

  /**
   * Read and validate a JSONL prompt suite (one JSON object per non-blank line).
   *
   * @param {string} suiteFilePath Path to the suite file.
   * @param {{ cwd?: string }} [options] Base directory for relative paths.
   * @returns {{ path: string, entries: object[], metadata: object }} Parsed
   *   entries (with `id`, `task` and `checks` filled in) plus summary metadata.
   * @throws {Error} When the file is missing, a line is not valid JSON, an entry
   *   fails schema validation, or the file has no entries.
   */
  parsePromptSuite(suiteFilePath, options = {}) {
    const cwd = options.cwd || process.cwd();
    const resolvedPath = this.resolvePath(suiteFilePath, cwd);

    if (!fs.existsSync(resolvedPath)) {
      throw new Error(`Prompt suite file not found: ${resolvedPath}`);
    }

    const source = fs.readFileSync(resolvedPath, 'utf8');
    const lines = source.split(/\r?\n/);
    const entries = [];
    const taskBreakdown = {};

    lines.forEach((line, index) => {
      const lineNumber = index + 1;
      const trimmed = line.trim();
      if (!trimmed) {
        return;
      }

      let parsed;
      try {
        parsed = JSON.parse(trimmed);
      } catch (error) {
        throw new Error(`Invalid JSON in prompt suite at line ${lineNumber}: ${error.message}`);
      }

      let validated;
      try {
        validated = promptSuiteEntrySchema.parse(parsed);
      } catch (error) {
        throw new Error(
          `Invalid prompt suite entry at line ${lineNumber}: ${formatZodIssues(error)}`
        );
      }

      const task = toNonEmptyTaskName(validated.task);
      // Entries without an id get a positional one based on their order in the suite.
      const id = validated.id || `prompt-${entries.length + 1}`;

      entries.push({
        ...validated,
        id,
        task,
        checks: Array.isArray(validated.checks) ? validated.checks : []
      });

      taskBreakdown[task] = (taskBreakdown[task] || 0) + 1;
    });

    if (entries.length === 0) {
      throw new Error('Prompt suite must contain at least one JSONL entry.');
    }

    return {
      path: resolvedPath,
      entries,
      metadata: {
        path: resolvedPath,
        total_prompts: entries.length,
        task_breakdown: taskBreakdown
      }
    };
  }

  /**
   * Expand a model list given as a string, a comma-separated string, or an
   * array of either, into unique trimmed identifiers (first occurrence kept).
   *
   * @param {string|string[]} modelInput Raw `--models` value(s).
   * @returns {string[]} De-duplicated identifiers in input order.
   * @throws {Error} When no identifier remains.
   */
  parseModelIdentifiers(modelInput) {
    const values = Array.isArray(modelInput) ? modelInput : [modelInput];
    const expanded = [];

    values.forEach((entry) => {
      String(entry || '')
        .split(',')
        .map((value) => value.trim())
        .filter(Boolean)
        .forEach((value) => expanded.push(value));
    });

    const deduped = [...new Set(expanded)];
    if (deduped.length === 0) {
      throw new Error('At least one model identifier is required via --models.');
    }

    return deduped;
  }

  /**
   * Validate a runtime name (default `ollama`) against SUPPORTED_RUNTIMES.
   *
   * @param {string} [runtime] Runtime name, any case.
   * @returns {*} The result of `normalizeRuntime` for the lower-cased name.
   * @throws {Error} When the runtime is not listed in SUPPORTED_RUNTIMES.
   */
  validateRuntime(runtime) {
    const raw = String(runtime || 'ollama').trim().toLowerCase();
    if (!SUPPORTED_RUNTIMES.includes(raw)) {
      throw new Error(
        `Unsupported runtime "${runtime}". Supported runtimes: ${SUPPORTED_RUNTIMES.join(', ')}`
      );
    }
    return normalizeRuntime(raw);
  }

  /**
   * Validate a calibration objective (default `balanced`).
   *
   * @param {string} [objective='balanced'] Objective name, any case.
   * @returns {*} The value parsed by `calibrationObjectiveSchema`.
   * @throws {Error} When the schema rejects the objective.
   */
  validateObjective(objective = 'balanced') {
    try {
      return calibrationObjectiveSchema.parse(String(objective || 'balanced').trim().toLowerCase());
    } catch (error) {
      throw new Error(
        `Unsupported objective "${objective}". Supported objectives: ${SUPPORTED_CALIBRATION_OBJECTIVES.join(', ')}`
      );
    }
  }

  /**
   * Decide the execution mode from `--mode` and `--dry-run`.
   *
   * `dryRun` forces `dry-run`; otherwise the provided mode is used, defaulting
   * to `contract-only`.
   *
   * @param {{ mode?: string, dryRun?: boolean }} [options]
   * @returns {*} The value parsed by `calibrationExecutionModeSchema`.
   * @throws {Error} When `dryRun` is combined with a different mode, or the
   *   schema rejects the mode.
   */
  resolveExecutionMode(options = {}) {
    const providedMode = options.mode ? String(options.mode).trim().toLowerCase() : null;
    const dryRun = Boolean(options.dryRun);

    if (dryRun && providedMode && providedMode !== 'dry-run') {
      throw new Error('Do not combine --dry-run with --mode other than "dry-run".');
    }

    const mode = dryRun ? 'dry-run' : providedMode || 'contract-only';

    try {
      return calibrationExecutionModeSchema.parse(mode);
    } catch (error) {
      throw new Error('Invalid execution mode. Use one of: dry-run, contract-only, full.');
    }
  }

  /**
   * Summarize the local machine from `os` data.
   *
   * @returns {{ fingerprint: string, description: string }} Platform/arch/RAM
   *   fingerprint and a CPU-model + RAM description.
   */
  getLocalHardwareSummary() {
    const cpuModel = os.cpus()?.[0]?.model || os.arch();
    const totalRamGb = Math.round(os.totalmem() / (1024 ** 3));

    return {
      fingerprint: `${os.platform()}-${os.arch()}-${totalRamGb}gb`,
      description: `${cpuModel} | ${totalRamGb}GB RAM`
    };
  }

  /**
   * Build a calibration result in which every model is still `pending`
   * (used when no prompts are executed).
   *
   * @param {object} params
   * @param {string[]} params.models Model identifiers.
   * @param {object} params.suiteMetadata Suite metadata from `parsePromptSuite`.
   * @param {string} params.runtime Runtime name.
   * @param {string} params.objective Calibration objective.
   * @param {string} params.executionMode Execution mode recorded in the result.
   * @param {object} [params.hardware] Hardware summary; defaults to the local machine.
   * @param {string} [params.calibrationVersion] Version label; defaults to
   *   `contract-<timestamp>`.
   * @returns {object} The schema-validated calibration result.
   * @throws {Error} When the payload fails schema validation.
   */
  buildDraftCalibrationResult({
    models,
    suiteMetadata,
    runtime,
    objective,
    executionMode,
    hardware,
    calibrationVersion
  }) {
    const modelResults = models.map((modelIdentifier) => ({
      model_identifier: modelIdentifier,
      status: 'pending'
    }));

    const summary = {
      total_models: modelResults.length,
      successful_models: 0,
      failed_models: 0,
      skipped_models: 0,
      pending_models: modelResults.length
    };

    // One timestamp for both fields so generated_at and the default version always agree.
    const generatedAt = new Date().toISOString();

    const result = {
      schema_version: '1.0',
      generated_at: generatedAt,
      calibration_version:
        calibrationVersion || `contract-${generatedAt.replace(/[:.]/g, '-')}`,
      execution_mode: executionMode,
      runtime,
      objective,
      hardware: hardware || this.getLocalHardwareSummary(),
      suite: suiteMetadata,
      models: modelResults,
      summary
    };

    return this.validateCalibrationResult(result);
  }

  /**
   * Guard that a runtime is allowed for full calibration.
   *
   * @param {string} runtime Runtime name.
   * @throws {Error} When the runtime is not in SUPPORTED_FULL_MODE_RUNTIMES.
   */
  ensureFullModeRuntime(runtime) {
    if (!SUPPORTED_FULL_MODE_RUNTIMES.includes(runtime)) {
      throw new Error(
        `Full calibration mode currently supports: ${SUPPORTED_FULL_MODE_RUNTIMES.join(', ')}.`
      );
    }
  }

  /**
   * Default prompt executor: runs `ollama run <model> <prompt>` synchronously
   * and measures wall-clock latency.
   *
   * @param {object} params
   * @param {string} params.runtime Runtime name (must be full-mode capable).
   * @param {string} params.modelIdentifier Model to run.
   * @param {string} params.prompt Prompt text passed as a CLI argument.
   * @param {number} [params.timeoutMs=120000] Process timeout in milliseconds.
   * @returns {{ output: string, latencyMs: number, ttftMs: number }} Trimmed
   *   stdout and latency; `ttftMs` is set to the total latency because no
   *   separate first-token measurement is taken here.
   * @throws {Error} With a `code` property when the process cannot be run,
   *   times out, or exits unsuccessfully.
   */
  executePromptWithRuntime({ runtime, modelIdentifier, prompt, timeoutMs = 120000 }) {
    this.ensureFullModeRuntime(runtime);

    const started = process.hrtime.bigint();
    const result = spawnSync('ollama', ['run', modelIdentifier, prompt], {
      encoding: 'utf8',
      timeout: timeoutMs,
      maxBuffer: 20 * 1024 * 1024,
      env: {
        ...process.env,
        NO_COLOR: '1'
      }
    });
    const latencyMs = Number((process.hrtime.bigint() - started) / 1_000_000n);

    if (result.error) {
      const error = new Error(result.error.message || 'Failed to execute runtime prompt.');
      error.code = result.error.code || 'RUNTIME_EXECUTION_ERROR';
      throw error;
    }

    if (result.status !== 0) {
      const message = String(result.stderr || result.stdout || '')
        .trim()
        .slice(0, 500);
      // A null status means the child was ended by a signal; name it instead of
      // reporting "status code null".
      const fallbackMessage =
        result.status === null && result.signal
          ? `Runtime command terminated by signal ${result.signal}`
          : `Runtime command exited with status code ${result.status}`;
      const error = new Error(message || fallbackMessage);
      error.code = 'RUNTIME_EXECUTION_ERROR';
      throw error;
    }

    const output = String(result.stdout || '').trim();

    return {
      output,
      latencyMs,
      ttftMs: latencyMs
    };
  }

  /**
   * Score a response against a prompt's checks.
   *
   * Supported check types are `exact` (trimmed equality), `contains`
   * (substring) and `regex` (pattern test). A check whose weight is not a
   * positive number counts with weight 1. A check that throws, or has an
   * unsupported type, fails and carries an `error` description.
   *
   * @param {*} responseText Model output; falsy values are treated as empty.
   * @param {Array<{ type: string, expected?: *, weight?: * }>} [checks=[]]
   * @returns {{ passedWeight: number, totalWeight: number, passRate: number, checkResults: object[] }}
   *   `passRate` is 1 when there are no checks.
   */
  evaluatePromptChecks(responseText, checks = []) {
    if (!Array.isArray(checks) || checks.length === 0) {
      return {
        passedWeight: 0,
        totalWeight: 0,
        passRate: 1,
        checkResults: []
      };
    }

    let passedWeight = 0;
    let totalWeight = 0;
    const checkResults = [];
    const response = String(responseText || '');

    checks.forEach((check) => {
      const declaredWeight = toNumber(check.weight, 1);
      const weight = declaredWeight > 0 ? declaredWeight : 1;
      totalWeight += weight;

      let passed = false;
      let error = undefined;
      const expected = String(check.expected || '');

      try {
        if (check.type === 'exact') {
          passed = response.trim() === expected.trim();
        } else if (check.type === 'contains') {
          passed = response.includes(expected);
        } else if (check.type === 'regex') {
          const expression = new RegExp(expected);
          passed = expression.test(response);
        } else {
          // Still a failure, but say why instead of failing silently.
          error = `Unsupported check type "${check.type}"`;
        }
      } catch (reason) {
        passed = false;
        error = String(reason.message || reason);
      }

      if (passed) {
        passedWeight += weight;
      }

      checkResults.push({
        type: check.type,
        expected: expected,
        weight,
        passed,
        error
      });
    });

    return {
      passedWeight,
      totalWeight,
      passRate: totalWeight > 0 ? passedWeight / totalWeight : 1,
      checkResults
    };
  }

  /**
   * Run a prompt `warmupRuns` times (results discarded) and then
   * `measuredIterations` times, collecting latency and token statistics.
   *
   * @param {object} params
   * @param {string} params.runtime Runtime name.
   * @param {string} params.modelIdentifier Model to run.
   * @param {string} params.prompt Prompt text.
   * @param {number} params.warmupRuns Number of unmeasured runs.
   * @param {number} params.measuredIterations Number of measured runs.
   * @param {number} params.timeoutMs Per-run timeout in milliseconds.
   * @returns {{ response: string, latencies: number[], ttfts: number[], totalTokens: number, averageOutputTokens: number }}
   *   `response` is the output of the last measured run.
   * @throws {Error} When no measured iteration would run; raised before any
   *   warm-up so no inference work is wasted. Executor errors propagate.
   */
  runPromptWithWarmup({
    runtime,
    modelIdentifier,
    prompt,
    warmupRuns,
    measuredIterations,
    timeoutMs
  }) {
    // Same condition under which the measured loop below would not execute.
    if (!(measuredIterations > 0)) {
      throw new Error('Measured iterations must be >= 1.');
    }

    for (let index = 0; index < warmupRuns; index += 1) {
      this.promptExecutor({
        runtime,
        modelIdentifier,
        prompt,
        timeoutMs
      });
    }

    const measured = [];
    for (let iteration = 0; iteration < measuredIterations; iteration += 1) {
      const run = this.promptExecutor({
        runtime,
        modelIdentifier,
        prompt,
        timeoutMs
      });

      measured.push({
        output: String(run.output || ''),
        latencyMs: toNumber(run.latencyMs, 0),
        ttftMs:
          run.ttftMs === undefined || run.ttftMs === null
            ? undefined
            : toNumber(run.ttftMs, 0)
      });
    }

    const latencies = measured.map((entry) => entry.latencyMs);
    const ttfts = measured
      .map((entry) => entry.ttftMs)
      .filter((value) => value !== undefined && Number.isFinite(value));
    const totalTokens = measured.reduce(
      (accumulator, entry) => accumulator + countTokensApprox(entry.output),
      0
    );
    const averageOutputTokens = Math.round(totalTokens / measured.length);
    const representativeResponse = measured[measured.length - 1].output;

    return {
      response: representativeResponse,
      latencies,
      ttfts,
      totalTokens,
      averageOutputTokens
    };
  }

  /**
   * Run every suite entry against one model and aggregate metrics and quality.
   *
   * Task scores are weighted pass rates (0–100); a task without any checks
   * scores 100. The overall score is the unweighted mean of the task scores.
   *
   * @param {object} params
   * @param {string} params.modelIdentifier Model to evaluate.
   * @param {object[]} params.suiteEntries Entries from `parsePromptSuite`.
   * @param {string} params.runtime Runtime name.
   * @param {number} params.warmupRuns Unmeasured runs per prompt.
   * @param {number} params.measuredIterations Measured runs per prompt.
   * @param {number} params.timeoutMs Per-run timeout in milliseconds.
   * @returns {object} A `success` model result with `metrics`, `quality` and `traces`.
   * @throws {Error} Propagates any prompt execution failure.
   */
  evaluateModel({
    modelIdentifier,
    suiteEntries,
    runtime,
    warmupRuns,
    measuredIterations,
    timeoutMs
  }) {
    const allLatencies = [];
    const allTtfts = [];
    let totalTokens = 0;
    let totalCheckWeight = 0;
    let passedCheckWeight = 0;
    const taskWeightMap = {};
    const taskPassedMap = {};
    const promptRuns = [];

    for (const entry of suiteEntries) {
      const execution = this.runPromptWithWarmup({
        runtime,
        modelIdentifier,
        prompt: entry.prompt,
        warmupRuns,
        measuredIterations,
        timeoutMs
      });

      const checkEvaluation = this.evaluatePromptChecks(execution.response, entry.checks);
      const task = toNonEmptyTaskName(entry.task);

      taskWeightMap[task] = (taskWeightMap[task] || 0) + checkEvaluation.totalWeight;
      taskPassedMap[task] = (taskPassedMap[task] || 0) + checkEvaluation.passedWeight;

      totalCheckWeight += checkEvaluation.totalWeight;
      passedCheckWeight += checkEvaluation.passedWeight;
      totalTokens += execution.totalTokens;

      allLatencies.push(...execution.latencies);
      allTtfts.push(...execution.ttfts);

      promptRuns.push({
        prompt_id: entry.id,
        task,
        latency_ms: median(execution.latencies),
        ttft_ms: execution.ttfts.length > 0 ? median(execution.ttfts) : undefined,
        output_tokens: execution.averageOutputTokens,
        response_excerpt: execution.response.slice(0, 400),
        check_results: checkEvaluation.checkResults,
        check_pass_rate: checkEvaluation.passRate
      });
    }

    const taskScores = {};
    Object.keys(taskWeightMap).forEach((task) => {
      const taskWeight = taskWeightMap[task];
      const taskPassed = taskPassedMap[task] || 0;
      taskScores[task] = taskWeight > 0 ? (taskPassed / taskWeight) * 100 : 100;
    });

    const checkPassRate = totalCheckWeight > 0 ? passedCheckWeight / totalCheckWeight : 1;
    const taskScoreValues = Object.values(taskScores);
    const overallScore =
      taskScoreValues.length > 0
        ? taskScoreValues.reduce((sum, value) => sum + value, 0) / taskScoreValues.length
        : checkPassRate * 100;

    const totalLatencyMs = allLatencies.reduce((sum, value) => sum + value, 0);
    const totalLatencySec = totalLatencyMs > 0 ? totalLatencyMs / 1000 : 0;
    const tokensPerSecond = totalLatencySec > 0 ? totalTokens / totalLatencySec : 0;

    return {
      model_identifier: modelIdentifier,
      status: 'success',
      metrics: {
        // Falls back to median latency when no executor reported a TTFT.
        ttft_ms: allTtfts.length > 0 ? percentile(allTtfts, 50) : percentile(allLatencies, 50),
        tokens_per_second: tokensPerSecond,
        latency_ms_p50: percentile(allLatencies, 50),
        latency_ms_p95: percentile(allLatencies, 95),
        peak_memory_mb: estimatePeakMemoryMb(modelIdentifier)
      },
      quality: {
        overall_score: overallScore,
        task_scores: taskScores,
        check_pass_rate: checkPassRate
      },
      traces: {
        warmup_runs: warmupRuns,
        measured_iterations: measuredIterations,
        prompt_runs: promptRuns
      }
    };
  }

  /**
   * Execute the full calibration for every model. A model whose evaluation
   * throws is recorded as `failed` (with message and error code) and the run
   * continues with the next model.
   *
   * @param {object} params
   * @param {string[]} params.models Model identifiers.
   * @param {{ entries: object[], metadata: object }} params.suite Parsed suite.
   * @param {string} params.runtime Runtime name (must be full-mode capable).
   * @param {string} params.objective Calibration objective.
   * @param {object} [params.hardware] Hardware summary; defaults to the local machine.
   * @param {string} [params.calibrationVersion] Version label; defaults to `full-<timestamp>`.
   * @param {{ warmupRuns?: *, measuredIterations?: *, timeoutMs?: * }} [params.benchmarkConfig]
   *   Defaults: 1 warm-up, 2 measured iterations (minimum 1), 120000 ms timeout (minimum 1000).
   * @returns {object} The schema-validated calibration result.
   * @throws {Error} When the runtime is unsupported or the payload fails validation.
   */
  runFullCalibration({
    models,
    suite,
    runtime,
    objective,
    hardware,
    calibrationVersion,
    benchmarkConfig = {}
  }) {
    this.ensureFullModeRuntime(runtime);

    const warmupRuns = toPositiveInt(benchmarkConfig.warmupRuns, 1);
    const measuredIterations = Math.max(toPositiveInt(benchmarkConfig.measuredIterations, 2), 1);
    const timeoutMs = Math.max(toPositiveInt(benchmarkConfig.timeoutMs, 120000), 1000);

    const modelResults = models.map((modelIdentifier) => {
      try {
        return this.evaluateModel({
          modelIdentifier,
          suiteEntries: suite.entries,
          runtime,
          warmupRuns,
          measuredIterations,
          timeoutMs
        });
      } catch (error) {
        return {
          model_identifier: modelIdentifier,
          status: 'failed',
          error: String(error.message || 'Calibration execution failed.'),
          traces: {
            warmup_runs: warmupRuns,
            measured_iterations: measuredIterations,
            error_code: normalizeErrorCode(error)
          }
        };
      }
    });

    const countByStatus = (status) =>
      modelResults.filter((entry) => entry.status === status).length;

    const summary = {
      total_models: modelResults.length,
      successful_models: countByStatus('success'),
      failed_models: countByStatus('failed'),
      skipped_models: countByStatus('skipped'),
      pending_models: countByStatus('pending')
    };

    // One timestamp for both fields so generated_at and the default version always agree.
    const generatedAt = new Date().toISOString();

    const result = {
      schema_version: '1.0',
      generated_at: generatedAt,
      calibration_version:
        calibrationVersion || `full-${generatedAt.replace(/[:.]/g, '-')}`,
      execution_mode: 'full',
      runtime,
      objective,
      hardware: hardware || this.getLocalHardwareSummary(),
      suite: suite.metadata,
      models: modelResults,
      summary
    };

    return this.validateCalibrationResult(result);
  }

  /**
   * Rank successful models for one task.
   *
   * Quality is the model's score for the task (falling back to its overall
   * score). Raw speed is tokens/second minus p50 latency in seconds, min-max
   * scaled to 0–100 across candidates (50 for everyone when all are equal).
   * Weights: `speed` → 75% speed / 25% quality; `quality` → 80% quality /
   * 20% speed; anything else → 50/50.
   *
   * @param {object} params
   * @param {string} params.task Task name.
   * @param {object[]} params.successfulModels Model results with status `success`.
   * @param {string} params.objective Calibration objective.
   * @returns {object[]} Candidates sorted by combined score, then quality, then
   *   identifier.
   */
  computeTaskCandidates({ task, successfulModels, objective }) {
    const candidates = successfulModels.map((model) => {
      const qualityScore = toNumber(
        model.quality?.task_scores?.[task],
        toNumber(model.quality?.overall_score, 0)
      );
      const speedRaw =
        toNumber(model.metrics?.tokens_per_second, 0) -
        toNumber(model.metrics?.latency_ms_p50, 0) / 1000;
      return {
        model_identifier: model.model_identifier,
        qualityScore,
        speedRaw
      };
    });

    const speedValues = candidates.map((entry) => entry.speedRaw);
    const minSpeed = Math.min(...speedValues);
    const maxSpeed = Math.max(...speedValues);
    const speedRange = maxSpeed - minSpeed;

    const weighted = candidates.map((entry) => {
      const speedScore =
        speedRange > 0 ? ((entry.speedRaw - minSpeed) / speedRange) * 100 : 50;
      let combinedScore = 0;
      if (objective === 'speed') {
        combinedScore = speedScore * 0.75 + entry.qualityScore * 0.25;
      } else if (objective === 'quality') {
        combinedScore = entry.qualityScore * 0.8 + speedScore * 0.2;
      } else {
        combinedScore = entry.qualityScore * 0.5 + speedScore * 0.5;
      }

      return {
        ...entry,
        speedScore,
        combinedScore
      };
    });

    return weighted.sort((left, right) => {
      if (right.combinedScore !== left.combinedScore) {
        return right.combinedScore - left.combinedScore;
      }
      if (right.qualityScore !== left.qualityScore) {
        return right.qualityScore - left.qualityScore;
      }
      return left.model_identifier.localeCompare(right.model_identifier);
    });
  }

  /**
   * Pick a primary model and ordered fallbacks for each task in the suite.
   *
   * Candidates with a quality score of at least 50 are preferred; when none
   * qualify, the full ranking is used instead.
   *
   * @param {object} calibrationResult A calibration result with model entries.
   * @returns {Object<string, { primary: string, fallbacks: string[], min_quality: number, rationale: string }>}
   *   Routing keyed by task name.
   * @throws {Error} When no model has status `success`.
   */
  synthesizePolicyRoutes(calibrationResult) {
    const successfulModels = calibrationResult.models.filter(
      (entry) => entry.status === 'success'
    );

    if (successfulModels.length === 0) {
      throw new Error('Cannot synthesize policy: no successful model calibration results found.');
    }

    const tasks = Object.keys(calibrationResult.suite?.task_breakdown || {});
    const taskList = tasks.length > 0 ? tasks : [DEFAULT_CALIBRATION_TASK];
    const routing = {};

    taskList.forEach((task) => {
      const ranked = this.computeTaskCandidates({
        task,
        successfulModels,
        objective: calibrationResult.objective
      });

      const minimumQuality = 50;
      const eligible = ranked.filter((entry) => entry.qualityScore >= minimumQuality);
      const selected = eligible.length > 0 ? eligible : ranked;

      const primary = selected[0];
      const fallbacks = selected.slice(1).map((entry) => entry.model_identifier);

      routing[task] = {
        primary: primary.model_identifier,
        fallbacks,
        min_quality: minimumQuality,
        rationale: `objective=${calibrationResult.objective}; combined=${primary.combinedScore.toFixed(2)}; quality=${primary.qualityScore.toFixed(2)}; speed=${primary.speedScore.toFixed(2)}`
      };
    });

    return routing;
  }

  /**
   * Build a routing policy from a calibration result.
   *
   * A `full` result with at least one successful model gets measured routing
   * from `synthesizePolicyRoutes`; otherwise every task is routed to the first
   * listed model with the remaining models as fallbacks.
   *
   * @param {object} params
   * @param {object} params.calibrationResult Calibration result to derive from.
   * @param {string} [params.calibrationResultPath] Path recorded as the policy source.
   * @returns {object} The schema-validated policy.
   * @throws {Error} When the result has no models or the policy fails validation.
   */
  buildDraftCalibrationPolicy({ calibrationResult, calibrationResultPath }) {
    const modelIdentifiers = calibrationResult.models.map((entry) => entry.model_identifier);
    if (modelIdentifiers.length === 0) {
      throw new Error('Calibration policy generation requires at least one model result.');
    }

    let routing;
    if (
      calibrationResult.execution_mode === 'full' &&
      calibrationResult.models.some((entry) => entry.status === 'success')
    ) {
      routing = this.synthesizePolicyRoutes(calibrationResult);
    } else {
      const tasks = Object.keys(calibrationResult.suite?.task_breakdown || {});
      const taskRoutes = tasks.length > 0 ? tasks : [DEFAULT_CALIBRATION_TASK];
      routing = {};
      taskRoutes.forEach((taskName) => {
        routing[taskName] = {
          primary: modelIdentifiers[0],
          fallbacks: modelIdentifiers.slice(1),
          rationale: 'Draft routing generated from calibration contract output.'
        };
      });
    }

    const policy = {
      schema_version: '1.0',
      generated_at: new Date().toISOString(),
      objective: calibrationResult.objective,
      source: {
        calibration_version: calibrationResult.calibration_version,
        calibration_result_path: calibrationResultPath || undefined
      },
      routing,
      metadata: {
        runtime: calibrationResult.runtime,
        hardware_fingerprint: calibrationResult.hardware?.fingerprint || undefined
      }
    };

    return this.validateCalibrationPolicy(policy);
  }

  /**
   * Validate a calibration result payload against `calibrationResultSchema`.
   *
   * @param {object} payload Candidate result.
   * @returns {object} The parsed payload.
   * @throws {Error} With a formatted issue list when validation fails.
   */
  validateCalibrationResult(payload) {
    try {
      return calibrationResultSchema.parse(payload);
    } catch (error) {
      throw new Error(`Invalid calibration result payload: ${formatZodIssues(error)}`);
    }
  }

  /**
   * Validate a calibration policy payload against `calibrationPolicySchema`.
   *
   * @param {object} payload Candidate policy.
   * @returns {object} The parsed payload.
   * @throws {Error} With a formatted issue list when validation fails.
   */
  validateCalibrationPolicy(payload) {
    try {
      return calibrationPolicySchema.parse(payload);
    } catch (error) {
      throw new Error(`Invalid calibration policy payload: ${formatZodIssues(error)}`);
    }
  }

  /**
   * Serialize a payload to disk, as YAML for `.yaml`/`.yml` paths and as
   * pretty-printed JSON (with trailing newline) otherwise. Parent directories
   * are created as needed.
   *
   * @param {string} filePath Destination path.
   * @param {*} payload Data to serialize.
   * @param {{ cwd?: string }} [options] Base directory for relative paths.
   * @returns {string} The absolute path written.
   * @throws {Error} When the destination is an existing directory.
   */
  writeArtifact(filePath, payload, options = {}) {
    const cwd = options.cwd || process.cwd();
    const resolvedPath = this.resolvePath(filePath, cwd);

    if (fs.existsSync(resolvedPath) && fs.statSync(resolvedPath).isDirectory()) {
      throw new Error(`Output path must be a file, received directory: ${resolvedPath}`);
    }

    const serialized = isYamlPath(resolvedPath)
      ? `${YAML.stringify(payload)}`
      : `${JSON.stringify(payload, null, 2)}\n`;

    fs.mkdirSync(path.dirname(resolvedPath), { recursive: true });
    fs.writeFileSync(resolvedPath, serialized, 'utf8');
    return resolvedPath;
  }
}
|
|
795
|
+
|
|
796
|
+
// Public surface of this module.
module.exports = { CalibrationManager };
|