llm-checker 3.2.8 → 3.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +118 -17
- package/bin/enhanced_cli.js +303 -3
- package/package.json +1 -1
- package/src/calibration/calibration-manager.js +798 -0
- package/src/calibration/policy-routing.js +376 -0
- package/src/calibration/schemas.js +212 -0
- package/src/hardware/backends/cuda-detector.js +355 -5
package/README.md
CHANGED
|
@@ -22,8 +22,11 @@
|
|
|
22
22
|
</p>
|
|
23
23
|
|
|
24
24
|
<p align="center">
|
|
25
|
+
<a href="#start-here-2-minutes">Start Here</a> •
|
|
25
26
|
<a href="#installation">Installation</a> •
|
|
26
27
|
<a href="#quick-start">Quick Start</a> •
|
|
28
|
+
<a href="#calibration-quick-start-10-minutes">Calibration Quick Start</a> •
|
|
29
|
+
<a href="https://github.com/Pavelevich/llm-checker/tree/main/docs">Docs</a> •
|
|
27
30
|
<a href="#claude-code-mcp">Claude MCP</a> •
|
|
28
31
|
<a href="#commands">Commands</a> •
|
|
29
32
|
<a href="#scoring-system">Scoring</a> •
|
|
@@ -54,6 +57,17 @@ Choosing the right LLM for your hardware is complex. With thousands of model var
|
|
|
54
57
|
|
|
55
58
|
---
|
|
56
59
|
|
|
60
|
+
## Documentation
|
|
61
|
+
|
|
62
|
+
- [Docs Hub](https://github.com/Pavelevich/llm-checker/tree/main/docs)
|
|
63
|
+
- [Usage Guide](https://github.com/Pavelevich/llm-checker/blob/main/docs/guides/usage-guide.md)
|
|
64
|
+
- [Advanced Usage](https://github.com/Pavelevich/llm-checker/blob/main/docs/guides/advanced-usage.md)
|
|
65
|
+
- [Technical Reference](https://github.com/Pavelevich/llm-checker/blob/main/docs/reference/technical-docs.md)
|
|
66
|
+
- [Changelog](https://github.com/Pavelevich/llm-checker/blob/main/docs/reference/changelog.md)
|
|
67
|
+
- [Calibration Fixtures](https://github.com/Pavelevich/llm-checker/tree/main/docs/fixtures/calibration)
|
|
68
|
+
|
|
69
|
+
---
|
|
70
|
+
|
|
57
71
|
## Comparison with Other Tooling (e.g. `llmfit`)
|
|
58
72
|
|
|
59
73
|
LLM Checker and `llmfit` solve related but different problems:
|
|
@@ -89,6 +103,32 @@ npm install sql.js
|
|
|
89
103
|
|
|
90
104
|
---
|
|
91
105
|
|
|
106
|
+
## Start Here (2 Minutes)
|
|
107
|
+
|
|
108
|
+
If you are new, use this exact flow:
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
# 1) Install
|
|
112
|
+
npm install -g llm-checker
|
|
113
|
+
|
|
114
|
+
# 2) Detect your hardware
|
|
115
|
+
llm-checker hw-detect
|
|
116
|
+
|
|
117
|
+
# 3) Get recommendations by category
|
|
118
|
+
llm-checker recommend --category coding
|
|
119
|
+
|
|
120
|
+
# 4) Run with auto-selection
|
|
121
|
+
llm-checker ai-run --category coding --prompt "Write a hello world in Python"
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
If you already calibrated routing:
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
llm-checker ai-run --calibrated --category coding --prompt "Refactor this function"
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
---
|
|
131
|
+
|
|
92
132
|
## Distribution
|
|
93
133
|
|
|
94
134
|
LLM Checker is published in all primary channels:
|
|
@@ -97,23 +137,16 @@ LLM Checker is published in all primary channels:
|
|
|
97
137
|
- GitHub Releases: [Release history](https://github.com/Pavelevich/llm-checker/releases)
|
|
98
138
|
- GitHub Packages: [`@pavelevich/llm-checker`](https://github.com/users/Pavelevich/packages/npm/package/llm-checker)
|
|
99
139
|
|
|
100
|
-
### v3.
|
|
101
|
-
|
|
102
|
-
- Fixed multimodal recommendation false positives from noisy metadata.
|
|
103
|
-
- Coding-only models with incidental `input_types: image` flags are no longer treated as vision models.
|
|
104
|
-
- Added regression tests to keep multimodal category picks aligned with true vision-capable models.
|
|
105
|
-
|
|
106
|
-
### v3.2.7 Highlights
|
|
140
|
+
### v3.3.0 Highlights
|
|
107
141
|
|
|
108
|
-
-
|
|
109
|
-
|
|
110
|
-
-
|
|
111
|
-
-
|
|
112
|
-
-
|
|
113
|
-
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
- Expanded deterministic and hardware regression coverage for multi-GPU and unified-memory edge cases.
|
|
142
|
+
- Calibrated routing is now first-class in `recommend` and `ai-run`:
|
|
143
|
+
- `--calibrated [file]` support with default discovery path.
|
|
144
|
+
- clear precedence: `--policy` > `--calibrated` > deterministic fallback.
|
|
145
|
+
- routing provenance output (source, route, selected model).
|
|
146
|
+
- New calibration fixtures and end-to-end tests for:
|
|
147
|
+
- `calibrate --policy-out ...` → `recommend --calibrated ...`
|
|
148
|
+
- Hardened Jetson CUDA detection to avoid false CPU-only fallback.
|
|
149
|
+
- Documentation reorganized under `docs/` with clearer onboarding paths.
|
|
117
150
|
|
|
118
151
|
### Optional: Install from GitHub Packages
|
|
119
152
|
|
|
@@ -147,6 +180,51 @@ llm-checker search qwen --use-case coding
|
|
|
147
180
|
|
|
148
181
|
---
|
|
149
182
|
|
|
183
|
+
## Calibration Quick Start (10 Minutes)
|
|
184
|
+
|
|
185
|
+
This path produces both calibration artifacts (the result contract and the routing policy) and verifies calibrated routing in one pass.
|
|
186
|
+
|
|
187
|
+
### 1) Use the sample prompt suite
|
|
188
|
+
|
|
189
|
+
```bash
|
|
190
|
+
cp ./docs/fixtures/calibration/sample-suite.jsonl ./sample-suite.jsonl
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
### 2) Generate calibration artifacts (dry-run)
|
|
194
|
+
|
|
195
|
+
```bash
|
|
196
|
+
mkdir -p ./artifacts
|
|
197
|
+
llm-checker calibrate \
|
|
198
|
+
--suite ./sample-suite.jsonl \
|
|
199
|
+
--models qwen2.5-coder:7b llama3.2:3b \
|
|
200
|
+
--runtime ollama \
|
|
201
|
+
--objective balanced \
|
|
202
|
+
--dry-run \
|
|
203
|
+
--output ./artifacts/calibration-result.json \
|
|
204
|
+
--policy-out ./artifacts/calibration-policy.yaml
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
Artifacts created:
|
|
208
|
+
|
|
209
|
+
- `./artifacts/calibration-result.json` (calibration contract)
|
|
210
|
+
- `./artifacts/calibration-policy.yaml` (routing policy for runtime commands)
|
|
211
|
+
|
|
212
|
+
### 3) Apply calibrated routing
|
|
213
|
+
|
|
214
|
+
```bash
|
|
215
|
+
llm-checker recommend --calibrated ./artifacts/calibration-policy.yaml --category coding
|
|
216
|
+
llm-checker ai-run --calibrated ./artifacts/calibration-policy.yaml --category coding --prompt "Refactor this function"
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
Notes:
|
|
220
|
+
|
|
221
|
+
- `--policy <file>` has precedence over `--calibrated [file]`.
|
|
222
|
+
- If `--calibrated` has no path, discovery uses `~/.llm-checker/calibration-policy.{yaml,yml,json}`.
|
|
223
|
+
- `--mode full` currently requires `--runtime ollama`.
|
|
224
|
+
- `./docs/fixtures/calibration/sample-generated-policy.yaml` shows the expected policy structure.
|
|
225
|
+
|
|
226
|
+
---
|
|
227
|
+
|
|
150
228
|
## Claude Code MCP
|
|
151
229
|
|
|
152
230
|
LLM Checker includes a built-in [Model Context Protocol](https://modelcontextprotocol.io/) (MCP) server, allowing **Claude Code** and other MCP-compatible AI assistants to analyze your hardware and manage local models directly.
|
|
@@ -229,6 +307,7 @@ Claude will automatically call the right tools and give you actionable results.
|
|
|
229
307
|
| `hw-detect` | Detect GPU/CPU capabilities, memory, backends |
|
|
230
308
|
| `check` | Full system analysis with compatible models and recommendations |
|
|
231
309
|
| `recommend` | Intelligent recommendations by category (coding, reasoning, multimodal, etc.) |
|
|
310
|
+
| `calibrate` | Generate calibration result + routing policy artifacts from a JSONL prompt suite |
|
|
232
311
|
| `installed` | Rank your installed Ollama models by compatibility |
|
|
233
312
|
|
|
234
313
|
### Advanced Commands (require `sql.js`)
|
|
@@ -263,6 +342,28 @@ llm-checker check --policy ./policy.yaml --use-case coding --runtime vllm
|
|
|
263
342
|
llm-checker recommend --policy ./policy.yaml --category coding
|
|
264
343
|
```
|
|
265
344
|
|
|
345
|
+
### Calibrated Routing in `recommend` and `ai-run`
|
|
346
|
+
|
|
347
|
+
`recommend` and `ai-run` now support calibration routing policies generated by `calibrate --policy-out`.
|
|
348
|
+
|
|
349
|
+
- `--calibrated [file]`:
|
|
350
|
+
- If `file` is omitted, discovery defaults to `~/.llm-checker/calibration-policy.{yaml,yml,json}`.
|
|
351
|
+
- `--policy <file>` takes precedence over `--calibrated` for routing resolution.
|
|
352
|
+
- Resolution precedence:
|
|
353
|
+
- `--policy` (explicit)
|
|
354
|
+
- `--calibrated` (explicit file or default discovery)
|
|
355
|
+
- deterministic selector fallback
|
|
356
|
+
- CLI output includes routing provenance (`--policy`, `--calibrated`, or default discovery) and the selected route/model.
|
|
357
|
+
|
|
358
|
+
Examples:
|
|
359
|
+
|
|
360
|
+
```bash
|
|
361
|
+
llm-checker recommend --calibrated --category coding
|
|
362
|
+
llm-checker recommend --calibrated ./calibration-policy.yaml --category reasoning
|
|
363
|
+
llm-checker ai-run --calibrated --category coding --prompt "Refactor this function"
|
|
364
|
+
llm-checker ai-run --policy ./calibration-policy.yaml --prompt "Summarize this report"
|
|
365
|
+
```
|
|
366
|
+
|
|
266
367
|
### Policy Audit Export
|
|
267
368
|
|
|
268
369
|
Use `audit export` when you need machine-readable compliance evidence for CI/CD gates, governance reviews, or security tooling.
|
|
@@ -722,7 +823,7 @@ LLM Checker is licensed under **NPDL-1.0** (No Paid Distribution License).
|
|
|
722
823
|
- Free use, modification, and redistribution are allowed.
|
|
723
824
|
- Selling the software or offering it as a paid hosted/API service is not allowed without a separate commercial license.
|
|
724
825
|
|
|
725
|
-
See [LICENSE](LICENSE) for full terms.
|
|
826
|
+
See [LICENSE](https://github.com/Pavelevich/llm-checker/blob/main/LICENSE) for full terms.
|
|
726
827
|
|
|
727
828
|
---
|
|
728
829
|
|
package/bin/enhanced_cli.js
CHANGED
|
@@ -23,6 +23,16 @@ const {
|
|
|
23
23
|
getRuntimeDisplayName,
|
|
24
24
|
getRuntimeCommandSet
|
|
25
25
|
} = require('../src/runtime/runtime-support');
|
|
26
|
+
const { CalibrationManager } = require('../src/calibration/calibration-manager');
|
|
27
|
+
const { SUPPORTED_CALIBRATION_OBJECTIVES } = require('../src/calibration/schemas');
|
|
28
|
+
const {
|
|
29
|
+
resolveRoutingPolicyPreference,
|
|
30
|
+
normalizeTaskName,
|
|
31
|
+
inferTaskFromPrompt,
|
|
32
|
+
resolveCalibrationRoute,
|
|
33
|
+
getRouteModelCandidates,
|
|
34
|
+
selectModelFromRoute
|
|
35
|
+
} = require('../src/calibration/policy-routing');
|
|
26
36
|
const SpeculativeDecodingEstimator = require('../src/models/speculative-decoding-estimator');
|
|
27
37
|
const PolicyManager = require('../src/policy/policy-manager');
|
|
28
38
|
const PolicyEngine = require('../src/policy/policy-engine');
|
|
@@ -38,6 +48,7 @@ const {
|
|
|
38
48
|
serializeComplianceReport
|
|
39
49
|
} = require('../src/policy/audit-reporter');
|
|
40
50
|
const policyManager = new PolicyManager();
|
|
51
|
+
const calibrationManager = new CalibrationManager();
|
|
41
52
|
|
|
42
53
|
// ASCII Art for each command - Large text banners
|
|
43
54
|
const ASCII_ART = {
|
|
@@ -1073,6 +1084,119 @@ function displayIntelligentRecommendations(intelligentData) {
|
|
|
1073
1084
|
console.log(chalk.red('╰'));
|
|
1074
1085
|
}
|
|
1075
1086
|
|
|
1087
|
+
/**
 * Map a calibration policy source tag to the label printed in CLI output.
 *
 * @param {string} [source] - Source tag attached to the loaded calibration policy.
 * @returns {string} The default discovery path pattern when the tag is
 *   'default-discovery', the tag itself for any other truthy value, and
 *   'unknown' when the tag is empty or missing.
 */
function toCalibrationSourceLabel(source) {
    switch (source) {
        case 'default-discovery':
            return '~/.llm-checker/calibration-policy.{yaml,yml,json}';
        default:
            // Any falsy tag (undefined, null, '') collapses to the generic label.
            return source ? source : 'unknown';
    }
}
|
|
1093
|
+
|
|
1094
|
+
/**
 * Gather every distinct model identifier mentioned in a recommendation payload.
 *
 * Identifiers are read, in this order, from:
 *   1. `summary.best_overall.identifier`
 *   2. each value of `summary.by_category` (its `identifier`)
 *   3. each entry of every `recommendations[*].bestModels` array (its `model_identifier`)
 *
 * @param {object} [intelligentData] - Recommendation payload; may be missing or partial.
 * @returns {string[]} Unique identifiers in first-seen order (empty when none are found).
 */
function collectRecommendationModelIdentifiers(intelligentData) {
    const seen = new Set();
    // Only truthy identifiers are recorded; the Set removes duplicates.
    const remember = (value) => {
        if (value) {
            seen.add(value);
        }
    };

    const { best_overall: bestOverall, by_category: byCategory } = intelligentData?.summary || {};
    remember(bestOverall?.identifier);

    if (byCategory && typeof byCategory === 'object') {
        for (const categoryPick of Object.values(byCategory)) {
            remember(categoryPick?.identifier);
        }
    }

    for (const group of Object.values(intelligentData?.recommendations || {})) {
        // Groups without a bestModels array contribute nothing.
        if (!Array.isArray(group?.bestModels)) {
            continue;
        }
        for (const candidate of group.bestModels) {
            remember(candidate?.model_identifier);
        }
    }

    return [...seen];
}
|
|
1122
|
+
|
|
1123
|
+
/**
 * Resolve which model a calibrated routing policy picks for a task.
 *
 * Looks up the route for `requestedTask` via `resolveCalibrationRoute`, lists the
 * route's candidate models via `getRouteModelCandidates`, and tries to match one
 * of them against `availableModels` via `selectModelFromRoute`. When nothing
 * matches, the first route candidate is reported as the selected model and
 * `matchedAvailableModel` is false.
 *
 * @param {object} calibratedPolicy - Loaded policy wrapper; its `policy` property is the routing policy.
 * @param {string} requestedTask - Task name to route.
 * @param {string[]} [availableModels=[]] - Model identifiers the route may be matched against.
 * @returns {object|null} Decision summary, or null when there is no policy or no route for the task.
 */
function resolveCalibratedRouteDecision(calibratedPolicy, requestedTask, availableModels = []) {
    const policy = calibratedPolicy?.policy;
    if (!policy) {
        return null;
    }

    const resolution = resolveCalibrationRoute(policy, requestedTask);
    const route = resolution?.route;
    if (!route) {
        return null;
    }

    const routeCandidates = getRouteModelCandidates(route);
    const match = selectModelFromRoute(route, availableModels);

    // Shared fallback for both model fields when the route matched nothing available.
    const firstCandidate = routeCandidates[0] || null;
    const fallbacks = Array.isArray(route.fallbacks) ? route.fallbacks : [];

    return {
        requestedTask: resolution.requestedTask,
        resolvedTask: resolution.resolvedTask,
        usedTaskFallback: Boolean(resolution.usedTaskFallback),
        primary: route.primary,
        fallbacks,
        routeCandidates,
        selectedModel: match?.selectedModel || firstCandidate,
        matchedRouteModel: match?.matchedRouteModel || firstCandidate,
        matchedAvailableModel: Boolean(match),
        usedRouteFallbackModel: Boolean(match?.usedFallback)
    };
}
|
|
1147
|
+
|
|
1148
|
+
/**
 * Print the "CALIBRATED ROUTING" box for a command.
 *
 * Nothing is printed when there is neither a calibrated policy nor any warning.
 * Otherwise the box shows the command name, the policy path and source (or a
 * "not active" notice), the route decision details when one is given, and each
 * warning on its own row.
 *
 * @param {string} commandName - CLI command the box is printed for.
 * @param {object|null} calibratedPolicy - Loaded policy wrapper (`policyPath`, `source`), if any.
 * @param {object|null} routeDecision - Decision object describing the resolved route, if any.
 * @param {string[]} [warnings=[]] - Warning messages to append to the box.
 * @returns {void}
 */
function displayCalibratedRoutingDecision(commandName, calibratedPolicy, routeDecision, warnings = []) {
    const warningList = warnings || [];
    if (!calibratedPolicy && warningList.length === 0) {
        return;
    }

    // Every body row starts with the same blue box edge.
    const printRow = (text) => console.log(chalk.blue('│') + text);

    console.log('\n' + chalk.bgBlue.white.bold(' CALIBRATED ROUTING '));
    console.log(chalk.blue('╭' + '─'.repeat(78)));
    printRow(` Command: ${chalk.cyan(commandName)}`);

    if (!calibratedPolicy) {
        printRow(chalk.yellow(' Policy: not active (deterministic fallback)'));
    } else {
        printRow(` Policy: ${chalk.green(calibratedPolicy.policyPath)}`);
        printRow(` Source: ${chalk.magenta(toCalibrationSourceLabel(calibratedPolicy.source))}`);
    }

    if (routeDecision) {
        const { primary, fallbacks } = routeDecision;
        const requestedTask = routeDecision.requestedTask || 'general';
        const resolvedTask = routeDecision.resolvedTask || requestedTask;

        // Show "requested → resolved" only when the route came from a task fallback.
        let taskDisplay = requestedTask;
        if (routeDecision.usedTaskFallback) {
            taskDisplay = `${requestedTask} → ${resolvedTask}`;
        }

        let selectedLabel = routeDecision.selectedModel || primary || 'N/A';
        if (routeDecision.usedRouteFallbackModel) {
            selectedLabel = `${selectedLabel} (fallback)`;
        }

        printRow(` Task: ${chalk.white(taskDisplay)}`);
        printRow(` Route primary: ${chalk.green(primary || 'N/A')}`);
        if (fallbacks && fallbacks.length > 0) {
            printRow(` Route fallbacks: ${chalk.gray(fallbacks.join(', '))}`);
        }
        printRow(` Selected model: ${chalk.green.bold(selectedLabel)}`);

        if (!routeDecision.matchedAvailableModel) {
            printRow(
                chalk.yellow(' Route did not match local/recommended models; using route primary for visibility.')
            );
        }
    }

    if (warningList.length > 0) {
        warningList.forEach((warning) => {
            printRow(chalk.yellow(` Warning: ${warning}`));
        });
    }

    console.log(chalk.blue('╰'));
}
|
|
1199
|
+
|
|
1076
1200
|
function displayModelsStats(originalCount, filteredCount, options) {
|
|
1077
1201
|
console.log('\n' + chalk.bgGreen.white.bold(' DATABASE STATS '));
|
|
1078
1202
|
console.log(chalk.green('╭' + '─'.repeat(60)));
|
|
@@ -2441,6 +2565,122 @@ auditCommand.action(() => {
|
|
|
2441
2565
|
auditCommand.outputHelp();
|
|
2442
2566
|
});
|
|
2443
2567
|
|
|
2568
|
+
// `calibrate` command: reads a JSONL prompt suite and a list of model
// identifiers, writes a calibration result artifact to --output and, when
// --policy-out is given, a calibration policy artifact derived from that result.
program
    .command('calibrate')
    .description('Generate calibration contract artifacts from a JSONL prompt suite')
    .requiredOption('--suite <file>', 'Prompt suite path in JSONL format')
    .requiredOption(
        '--models <identifiers...>',
        'Model identifiers to include (repeat flag and/or comma-separate values)'
    )
    .requiredOption(
        '--output <file>',
        'Calibration result output path (.json, .yaml, or .yml)'
    )
    .option(
        '--runtime <runtime>',
        `Inference runtime (${SUPPORTED_RUNTIMES.join('|')})`,
        'ollama'
    )
    // No commander default for --mode: the effective mode is decided by
    // calibrationManager.resolveExecutionMode from --mode and --dry-run together.
    .option(
        '--mode <mode>',
        'Execution mode (dry-run|contract-only|full). Default: contract-only'
    )
    .option(
        '--objective <objective>',
        `Calibration objective (${SUPPORTED_CALIBRATION_OBJECTIVES.join('|')})`,
        'balanced'
    )
    .option(
        '--policy-out <file>',
        'Optional calibration policy output path (.json, .yaml, or .yml)'
    )
    // Numeric options are declared with string defaults and parsed in the action.
    .option('--warmup <count>', 'Warmup runs per prompt in full mode', '1')
    .option('--iterations <count>', 'Measured iterations per prompt in full mode', '2')
    .option('--timeout-ms <ms>', 'Per-prompt timeout in full mode', '120000')
    .option('--dry-run', 'Produce draft artifacts without benchmark execution')
    .addHelpText(
        'after',
        `
Examples:
$ llm-checker calibrate --suite ./prompts.jsonl --models qwen2.5-coder:7b llama3.2:3b --output ./calibration.json
$ llm-checker calibrate --suite ./prompts.jsonl --models qwen2.5-coder:7b --mode full --iterations 3 --output ./calibration.json --policy-out ./routing.yaml
$ llm-checker calibrate --suite ./prompts.jsonl --models qwen2.5-coder:7b,llama3.2:3b --output ./calibration.yaml --policy-out ./routing.yaml --dry-run
`
    )
    .action((options) => {
        try {
            // Validate and normalize all inputs up front; any failure is reported
            // by the catch block below and exits with status 1.
            const runtime = calibrationManager.validateRuntime(options.runtime);
            const objective = calibrationManager.validateObjective(options.objective);
            const executionMode = calibrationManager.resolveExecutionMode({
                mode: options.mode,
                dryRun: Boolean(options.dryRun)
            });
            const models = calibrationManager.parseModelIdentifiers(options.models);
            const suite = calibrationManager.parsePromptSuite(options.suite);

            let calibrationResult = null;
            if (executionMode === 'full') {
                // NOTE(review): the result is used without await, so runFullCalibration
                // is presumably synchronous — confirm against calibration-manager.
                calibrationResult = calibrationManager.runFullCalibration({
                    models,
                    suite,
                    runtime,
                    objective,
                    benchmarkConfig: {
                        // parseInt yields NaN for non-numeric input and nothing here
                        // checks for it. NOTE(review): confirm the manager validates
                        // these values.
                        warmupRuns: Number.parseInt(options.warmup, 10),
                        measuredIterations: Number.parseInt(options.iterations, 10),
                        timeoutMs: Number.parseInt(options.timeoutMs, 10)
                    }
                });
            } else {
                // Any non-full mode builds a draft result from the suite metadata only.
                calibrationResult = calibrationManager.buildDraftCalibrationResult({
                    models,
                    suiteMetadata: suite.metadata,
                    runtime,
                    objective,
                    executionMode
                });
            }

            const resultPath = calibrationManager.writeArtifact(options.output, calibrationResult);

            // The policy is built from the result and receives the path writeArtifact
            // returned, so the result artifact must be written first.
            let policyPath = null;
            if (options.policyOut) {
                const calibrationPolicy = calibrationManager.buildDraftCalibrationPolicy({
                    calibrationResult,
                    calibrationResultPath: resultPath
                });
                policyPath = calibrationManager.writeArtifact(options.policyOut, calibrationPolicy);
            }

            // Summary box describing what was generated.
            console.log('\n' + chalk.bgBlue.white.bold(' CALIBRATION ARTIFACTS GENERATED '));
            console.log(chalk.blue('╭' + '─'.repeat(72)));
            console.log(chalk.blue('│') + ` Suite: ${chalk.white(suite.path)}`);
            console.log(chalk.blue('│') + ` Runtime: ${chalk.cyan(runtime)} | Objective: ${chalk.cyan(objective)}`);
            console.log(chalk.blue('│') + ` Models: ${chalk.white(String(models.length))}`);
            console.log(chalk.blue('│') + ` Execution mode: ${chalk.yellow(executionMode)}`);
            // Success/failure counts are printed only for full mode.
            if (executionMode === 'full') {
                console.log(
                    chalk.blue('│') +
                        ` Successful: ${chalk.green(
                            String(calibrationResult.summary.successful_models)
                        )} | Failed: ${chalk.red(String(calibrationResult.summary.failed_models))}`
                );
            }
            console.log(chalk.blue('│') + ` Result: ${chalk.green(resultPath)}`);
            if (policyPath) {
                console.log(chalk.blue('│') + ` Policy: ${chalk.green(policyPath)}`);
            }
            console.log(chalk.blue('╰' + '─'.repeat(72)));
        } catch (error) {
            console.error(chalk.red(`Calibration failed: ${error.message}`));
            // The stack trace is printed only when the DEBUG environment variable is set.
            if (process.env.DEBUG) {
                console.error(error.stack);
            }
            process.exit(1);
        }
    });
|
|
2683
|
+
|
|
2444
2684
|
program
|
|
2445
2685
|
.command('check')
|
|
2446
2686
|
.description('Analyze your system and show compatible LLM models')
|
|
@@ -2809,6 +3049,10 @@ program
|
|
|
2809
3049
|
.option('--optimize <profile>', 'Optimization profile (balanced|speed|quality|context|coding)', 'balanced')
|
|
2810
3050
|
.option('--no-verbose', 'Disable step-by-step progress display')
|
|
2811
3051
|
.option('--policy <file>', 'Evaluate recommendations against a policy file')
|
|
3052
|
+
.option(
|
|
3053
|
+
'--calibrated [file]',
|
|
3054
|
+
'Use calibrated routing policy (optional file path; defaults to ~/.llm-checker/calibration-policy.{yaml,yml,json})'
|
|
3055
|
+
)
|
|
2812
3056
|
.addHelpText(
|
|
2813
3057
|
'after',
|
|
2814
3058
|
`
|
|
@@ -2816,6 +3060,11 @@ Enterprise policy examples:
|
|
|
2816
3060
|
$ llm-checker recommend --policy ./policy.yaml
|
|
2817
3061
|
$ llm-checker recommend --policy ./policy.yaml --category coding
|
|
2818
3062
|
$ llm-checker recommend --policy ./policy.yaml --no-verbose
|
|
3063
|
+
|
|
3064
|
+
Calibrated routing examples:
|
|
3065
|
+
$ llm-checker recommend --calibrated --category coding
|
|
3066
|
+
$ llm-checker recommend --calibrated ./calibration-policy.yaml --category reasoning
|
|
3067
|
+
$ llm-checker recommend --policy ./calibration-policy.yaml --category coding
|
|
2819
3068
|
`
|
|
2820
3069
|
)
|
|
2821
3070
|
.action(async (options) => {
|
|
@@ -2823,7 +3072,13 @@ Enterprise policy examples:
|
|
|
2823
3072
|
try {
|
|
2824
3073
|
const verboseEnabled = options.verbose !== false;
|
|
2825
3074
|
const checker = new (getLLMChecker())({ verbose: verboseEnabled });
|
|
2826
|
-
const
|
|
3075
|
+
const routingPreference = resolveRoutingPolicyPreference({
|
|
3076
|
+
policyOption: options.policy,
|
|
3077
|
+
calibratedOption: options.calibrated,
|
|
3078
|
+
loadEnterprisePolicy: loadPolicyConfiguration
|
|
3079
|
+
});
|
|
3080
|
+
const policyConfig = routingPreference.enterprisePolicy;
|
|
3081
|
+
const calibratedPolicy = routingPreference.calibratedPolicy;
|
|
2827
3082
|
|
|
2828
3083
|
if (!verboseEnabled) {
|
|
2829
3084
|
process.stdout.write(chalk.gray('Generating recommendations...'));
|
|
@@ -2860,11 +3115,18 @@ Enterprise policy examples:
|
|
|
2860
3115
|
policyEnforcement = resolvePolicyEnforcement(policyConfig.policy, policyEvaluation);
|
|
2861
3116
|
}
|
|
2862
3117
|
|
|
3118
|
+
const routingTask = normalizeTaskName(options.category || 'general');
|
|
3119
|
+
const recommendationIdentifiers = collectRecommendationModelIdentifiers(intelligentRecommendations);
|
|
3120
|
+
const routeDecision = calibratedPolicy
|
|
3121
|
+
? resolveCalibratedRouteDecision(calibratedPolicy, routingTask, recommendationIdentifiers)
|
|
3122
|
+
: null;
|
|
3123
|
+
|
|
2863
3124
|
// Mostrar información del sistema
|
|
2864
3125
|
displaySystemInfo(hardware, { summary: { hardwareTier: intelligentRecommendations.summary.hardware_tier } });
|
|
2865
3126
|
|
|
2866
3127
|
// Mostrar recomendaciones
|
|
2867
3128
|
displayIntelligentRecommendations(intelligentRecommendations);
|
|
3129
|
+
displayCalibratedRoutingDecision('recommend', calibratedPolicy, routeDecision, routingPreference.warnings);
|
|
2868
3130
|
|
|
2869
3131
|
if (policyConfig && policyEvaluation && policyEnforcement) {
|
|
2870
3132
|
displayPolicySummary('recommend', policyConfig, policyEvaluation, policyEnforcement);
|
|
@@ -3124,7 +3386,13 @@ program
|
|
|
3124
3386
|
.command('ai-run')
|
|
3125
3387
|
.description('AI-powered model selection and execution')
|
|
3126
3388
|
.option('-m, --models <models...>', 'Specific models to choose from')
|
|
3389
|
+
.option('-c, --category <category>', 'Task category hint (coding, reasoning, multimodal, general, etc.)')
|
|
3127
3390
|
.option('--prompt <prompt>', 'Prompt to run with selected model')
|
|
3391
|
+
.option('--policy <file>', 'Explicit calibrated routing policy file (takes precedence over --calibrated)')
|
|
3392
|
+
.option(
|
|
3393
|
+
'--calibrated [file]',
|
|
3394
|
+
'Enable calibrated routing policy (optional file path; defaults to ~/.llm-checker/calibration-policy.{yaml,yml,json})'
|
|
3395
|
+
)
|
|
3128
3396
|
.action(async (options) => {
|
|
3129
3397
|
showAsciiArt('ai-run');
|
|
3130
3398
|
// Check if Ollama is installed first
|
|
@@ -3138,6 +3406,11 @@ program
|
|
|
3138
3406
|
const aiSelector = new AIModelSelector();
|
|
3139
3407
|
const checker = new (getLLMChecker())();
|
|
3140
3408
|
const systemInfo = await checker.getSystemInfo();
|
|
3409
|
+
const routingPreference = resolveRoutingPolicyPreference({
|
|
3410
|
+
policyOption: options.policy,
|
|
3411
|
+
calibratedOption: options.calibrated
|
|
3412
|
+
});
|
|
3413
|
+
const calibratedPolicy = routingPreference.calibratedPolicy;
|
|
3141
3414
|
|
|
3142
3415
|
// Get available models or use provided ones
|
|
3143
3416
|
let candidateModels = options.models;
|
|
@@ -3165,6 +3438,10 @@ program
|
|
|
3165
3438
|
return;
|
|
3166
3439
|
}
|
|
3167
3440
|
}
|
|
3441
|
+
|
|
3442
|
+
candidateModels = Array.isArray(candidateModels)
|
|
3443
|
+
? candidateModels.filter((model) => typeof model === 'string' && model.trim().length > 0)
|
|
3444
|
+
: [];
|
|
3168
3445
|
|
|
3169
3446
|
// AI selection
|
|
3170
3447
|
const systemSpecs = {
|
|
@@ -3175,10 +3452,33 @@ program
|
|
|
3175
3452
|
gpu_model_normalized: systemInfo.gpu?.model ||
|
|
3176
3453
|
(systemInfo.cpu?.manufacturer === 'Apple' ? 'apple_silicon' : 'cpu_only')
|
|
3177
3454
|
};
|
|
3178
|
-
|
|
3179
|
-
const
|
|
3455
|
+
|
|
3456
|
+
const taskHint = normalizeTaskName(options.category || inferTaskFromPrompt(options.prompt));
|
|
3457
|
+
const routeDecision = calibratedPolicy
|
|
3458
|
+
? resolveCalibratedRouteDecision(calibratedPolicy, taskHint, candidateModels)
|
|
3459
|
+
: null;
|
|
3460
|
+
|
|
3461
|
+
let result;
|
|
3462
|
+
if (routeDecision && routeDecision.matchedAvailableModel && routeDecision.selectedModel) {
|
|
3463
|
+
result = {
|
|
3464
|
+
bestModel: routeDecision.selectedModel,
|
|
3465
|
+
confidence: routeDecision.usedRouteFallbackModel ? 0.82 : 0.94,
|
|
3466
|
+
method: 'calibrated-policy-route',
|
|
3467
|
+
reasoning: `Selected from calibrated policy route for ${routeDecision.resolvedTask}`
|
|
3468
|
+
};
|
|
3469
|
+
} else {
|
|
3470
|
+
if (routeDecision && routeDecision.routeCandidates.length > 0) {
|
|
3471
|
+
routingPreference.warnings.push(
|
|
3472
|
+
`Calibrated route candidates (${routeDecision.routeCandidates.join(
|
|
3473
|
+
', '
|
|
3474
|
+
)}) are not installed locally. Falling back to AI selector.`
|
|
3475
|
+
);
|
|
3476
|
+
}
|
|
3477
|
+
result = await aiSelector.selectBestModel(candidateModels, systemSpecs, taskHint);
|
|
3478
|
+
}
|
|
3180
3479
|
|
|
3181
3480
|
spinner.succeed(`Selected ${chalk.green.bold(result.bestModel)} (${result.method}, ${Math.round(result.confidence * 100)}% confidence)`);
|
|
3481
|
+
displayCalibratedRoutingDecision('ai-run', calibratedPolicy, routeDecision, routingPreference.warnings);
|
|
3182
3482
|
|
|
3183
3483
|
// Execute the selected model
|
|
3184
3484
|
console.log(chalk.magenta.bold(`\nLaunching ${result.bestModel}...`));
|
package/package.json
CHANGED