llm-checker 3.2.8 → 3.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,376 @@
1
+ const fs = require('fs');
2
+ const os = require('os');
3
+ const path = require('path');
4
+ const YAML = require('yaml');
5
+ const { calibrationPolicySchema } = require('./schemas');
6
+
7
/**
 * File names probed (in this priority order) when looking for a default
 * calibration policy. The array is exported, so it is frozen to stop a
 * consumer from accidentally changing discovery for the whole process.
 */
const DEFAULT_CALIBRATION_POLICY_FILENAMES = Object.freeze([
    'calibration-policy.yaml',
    'calibration-policy.yml',
    'calibration-policy.json'
]);
12
+
13
/**
 * Maps alternative task spellings to the canonical task name.
 *
 * The table has a null prototype so that a lookup with an arbitrary,
 * user-supplied key (for example "constructor" or "__proto__") can never
 * resolve to a member inherited from Object.prototype. It is frozen because
 * it is shared, read-only configuration.
 */
const TASK_ALIASES = Object.freeze(
    Object.assign(Object.create(null), {
        code: 'coding',
        coder: 'coding',
        programming: 'coding',
        chat: 'talking',
        conversation: 'talking',
        talk: 'talking',
        summarize: 'reading',
        summary: 'reading',
        summarization: 'reading',
        vision: 'multimodal',
        image: 'multimodal'
    })
);
26
+
27
/**
 * Tells whether `value` is a string containing at least one
 * non-whitespace character.
 *
 * @param {*} value - Anything.
 * @returns {boolean} True only for strings that are non-empty after trimming.
 */
function isNonEmptyString(value) {
    if (typeof value !== 'string') {
        return false;
    }
    return value.trim() !== '';
}
30
+
31
/**
 * Renders a validation error as a single human-readable line.
 *
 * When the error carries an `issues` array, each issue becomes
 * "<dotted.path>: <message>" ("root" when the path is missing or empty) and
 * the entries are joined with "; ". Otherwise the error's own message is
 * used, or "validation failed" when there is none.
 *
 * @param {*} error - Error thrown by schema validation (or anything else).
 * @returns {string} Formatted description.
 */
function formatValidationError(error) {
    const issues = error ? error.issues : undefined;
    if (!Array.isArray(issues)) {
        return String(error?.message || 'validation failed');
    }

    const lines = [];
    for (const issue of issues) {
        const hasPath = Array.isArray(issue.path) && issue.path.length > 0;
        const location = hasPath ? issue.path.join('.') : 'root';
        lines.push(`${location}: ${issue.message}`);
    }
    return lines.join('; ');
}
45
+
46
/**
 * Turns a policy path into an absolute path.
 *
 * Absolute inputs are returned unchanged; relative inputs are resolved
 * against `cwd`.
 *
 * @param {string} policyPath - Path supplied by the caller.
 * @param {string} [cwd=process.cwd()] - Base directory for relative paths.
 * @returns {string} Absolute path.
 * @throws {Error} If `policyPath` is not a non-empty string.
 */
function resolvePolicyPath(policyPath, cwd = process.cwd()) {
    if (isNonEmptyString(policyPath)) {
        if (path.isAbsolute(policyPath)) {
            return policyPath;
        }
        return path.resolve(cwd, policyPath);
    }

    throw new Error('Calibration policy path must be a non-empty string.');
}
55
+
56
/**
 * Parses and validates the text of a calibration policy file.
 *
 * Files whose extension is ".json" (case-insensitive) are parsed as JSON;
 * every other extension is parsed as YAML. The parsed value is then validated
 * with `calibrationPolicySchema`.
 *
 * @param {string} payloadText - Raw file contents.
 * @param {string} policyPath - Path used only to pick the parser by extension.
 * @returns {object} The value returned by `calibrationPolicySchema.parse`.
 * @throws {Error} "Failed to parse calibration policy file: ..." on syntax
 *   errors, "Invalid calibration policy payload: ..." on schema violations.
 */
function parseCalibrationPolicyPayload(payloadText, policyPath) {
    const isJson = path.extname(policyPath).toLowerCase() === '.json';

    let rawPolicy;
    try {
        rawPolicy = isJson ? JSON.parse(payloadText) : YAML.parse(payloadText);
    } catch (parseError) {
        throw new Error(`Failed to parse calibration policy file: ${parseError.message}`);
    }

    try {
        return calibrationPolicySchema.parse(rawPolicy);
    } catch (validationError) {
        throw new Error(`Invalid calibration policy payload: ${formatValidationError(validationError)}`);
    }
}
76
+
77
/**
 * Reads, parses and validates a calibration policy file from disk.
 *
 * @param {string} policyPath - Absolute or relative path to the policy file.
 * @param {{cwd?: string}} [options] - `cwd` is the base for relative paths.
 * @returns {{policyPath: string, policy: object}} The resolved path and the
 *   validated policy.
 * @throws {Error} If the path is invalid, missing, not a regular file, or the
 *   contents fail parsing/validation.
 */
function loadCalibrationPolicyFile(policyPath, options = {}) {
    const resolvedPath = resolvePolicyPath(policyPath, options.cwd);

    const exists = fs.existsSync(resolvedPath);
    if (exists === false) {
        throw new Error(`Calibration policy file not found: ${resolvedPath}`);
    }

    const isRegularFile = fs.statSync(resolvedPath).isFile();
    if (isRegularFile === false) {
        throw new Error(`Calibration policy path must be a file: ${resolvedPath}`);
    }

    const contents = fs.readFileSync(resolvedPath, 'utf8');
    return {
        policyPath: resolvedPath,
        policy: parseCalibrationPolicyPayload(contents, resolvedPath)
    };
}
96
+
97
/**
 * Non-throwing wrapper around `loadCalibrationPolicyFile`.
 *
 * Note that path resolution happens before the guarded section, so an invalid
 * `policyPath` (not a non-empty string) still throws.
 *
 * @param {string} policyPath - Absolute or relative path to the policy file.
 * @param {{cwd?: string}} [options] - `cwd` is the base for relative paths.
 * @returns {{ok: true, policyPath: string, policy: object} |
 *           {ok: false, resolvedPath: string, error: Error}}
 *   Success carries the loaded policy; failure carries the resolved path and
 *   the error that was caught.
 */
function tryLoadCalibrationPolicy(policyPath, options = {}) {
    const resolvedPath = resolvePolicyPath(policyPath, options.cwd);

    let loaded;
    try {
        loaded = loadCalibrationPolicyFile(resolvedPath, options);
    } catch (error) {
        return { ok: false, resolvedPath, error };
    }

    return { ok: true, ...loaded };
}
114
+
115
/**
 * Lists the default policy file locations under "<homeDir>/.llm-checker",
 * one per entry of DEFAULT_CALIBRATION_POLICY_FILENAMES and in that order.
 *
 * @param {string} [homeDir=os.homedir()] - Home directory to look under.
 * @returns {string[]} Candidate file paths.
 */
function getDefaultCalibrationPolicyCandidates(homeDir = os.homedir()) {
    const baseDir = path.join(homeDir, '.llm-checker');

    const candidates = [];
    for (const fileName of DEFAULT_CALIBRATION_POLICY_FILENAMES) {
        candidates.push(path.join(baseDir, fileName));
    }
    return candidates;
}
119
+
120
/**
 * Finds the first default policy candidate that exists and is a regular file.
 *
 * @param {string} [homeDir=os.homedir()] - Home directory to look under.
 * @returns {string|null} Path of the first matching candidate, or null when
 *   none of them qualifies.
 */
function discoverDefaultCalibrationPolicyPath(homeDir = os.homedir()) {
    const firstExisting = getDefaultCalibrationPolicyCandidates(homeDir).find(
        (candidate) => fs.existsSync(candidate) && fs.statSync(candidate).isFile()
    );

    return firstExisting === undefined ? null : firstExisting;
}
129
+
130
/**
 * Canonicalises a task name: trims, lower-cases, and maps known aliases
 * (see TASK_ALIASES) to their canonical task. Missing or blank input yields
 * "general".
 *
 * @param {*} task - Requested task name; falsy values mean "general".
 * @returns {string} Canonical task name.
 */
function normalizeTaskName(task) {
    const normalized = String(task || 'general').trim().toLowerCase();
    if (!normalized) return 'general';

    // Only own entries count as aliases. A bare `TASK_ALIASES[normalized]`
    // lookup on a plain object would resolve inherited members, e.g.
    // "constructor" -> the Object function, and return a non-string.
    return Object.prototype.hasOwnProperty.call(TASK_ALIASES, normalized)
        ? TASK_ALIASES[normalized]
        : normalized;
}
135
+
136
/**
 * Guesses a task category from free-form prompt text using keyword matching.
 *
 * The prompt is lower-cased and tested against the keyword rules below in
 * order; the first rule that matches decides the task. Blank or missing
 * prompts, and prompts matching no rule, yield "general".
 *
 * @param {*} prompt - Prompt text (coerced to string; falsy means empty).
 * @returns {string} One of "coding", "reasoning", "multimodal", "reading",
 *   "creative", "talking" or "general".
 */
function inferTaskFromPrompt(prompt) {
    const text = String(prompt || '').toLowerCase();
    if (text.trim() === '') {
        return 'general';
    }

    // Order matters: earlier rules win when several keyword sets match.
    const keywordRules = [
        [/\b(code|coding|refactor|function|bug|debug|typescript|javascript|python|java|rust|go)\b/, 'coding'],
        [/\b(reason|reasoning|analy[sz]e|logic|prove|derive|step by step)\b/, 'reasoning'],
        [/\b(vision|image|photo|diagram|screenshot)\b/, 'multimodal'],
        [/\b(summarize|summary|read|reading|article|document)\b/, 'reading'],
        [/\b(creative|story|poem|brainstorm|marketing copy)\b/, 'creative'],
        [/\b(chat|talk|conversation|assistant)\b/, 'talking']
    ];

    const matchedRule = keywordRules.find(([pattern]) => pattern.test(text));
    return matchedRule ? matchedRule[1] : 'general';
}
161
+
162
/**
 * Builds the ordered list of routing keys to try for a requested task.
 *
 * The canonical task comes first. For "talking" the key "chat" is tried next,
 * so policies keyed by "chat" still match. "general" is always the last
 * resort unless it is already the canonical task.
 *
 * `normalizeTaskName` maps "chat" to "talking", so the canonical task can
 * never be "chat" here and no reverse ("chat" -> "talking") branch is needed.
 * The list is unique by construction.
 *
 * @param {*} requestedTask - Task name as supplied by the caller.
 * @returns {string[]} Candidate routing keys in priority order.
 */
function buildTaskCandidates(requestedTask) {
    const normalized = normalizeTaskName(requestedTask);
    const candidates = [normalized];

    if (normalized === 'talking') {
        candidates.push('chat');
    }

    if (normalized !== 'general') {
        candidates.push('general');
    }

    return candidates;
}
178
+
179
/**
 * Picks the routing entry of a calibration policy for a requested task.
 *
 * Candidate keys from `buildTaskCandidates` are tried in order against the
 * policy's own `routing` keys. If none matches, the first key of `routing`
 * is used as a last resort.
 *
 * @param {object} policy - Calibration policy; only `policy.routing` is read.
 * @param {*} requestedTask - Task name as supplied by the caller.
 * @returns {{requestedTask: string, resolvedTask: string,
 *            usedTaskFallback: boolean, route: *} | null}
 *   Null when the policy has no routing object or the routing object is
 *   empty. `usedTaskFallback` is true whenever the resolved key differs from
 *   the normalised requested task.
 */
function resolveCalibrationRoute(policy, requestedTask) {
    const routing = policy ? policy.routing : null;
    if (typeof routing !== 'object' || routing === null) {
        return null;
    }

    const routeKeys = Object.keys(routing);
    if (routeKeys.length === 0) {
        return null;
    }

    const normalizedTask = normalizeTaskName(requestedTask);
    const matchedTask = buildTaskCandidates(normalizedTask).find((taskName) =>
        Object.prototype.hasOwnProperty.call(routing, taskName)
    );

    if (matchedTask === undefined) {
        // Nothing matched: fall back to whichever route is listed first.
        const [fallbackTask] = routeKeys;
        return {
            requestedTask: normalizedTask,
            resolvedTask: fallbackTask,
            usedTaskFallback: true,
            route: routing[fallbackTask]
        };
    }

    return {
        requestedTask: normalizedTask,
        resolvedTask: matchedTask,
        usedTaskFallback: matchedTask !== normalizedTask,
        route: routing[matchedTask]
    };
}
208
+
209
/**
 * Flattens a route into an ordered, de-duplicated list of model names:
 * the primary first, then the fallbacks in their listed order.
 *
 * Entries that are not non-empty strings are skipped and names are trimmed
 * before de-duplication.
 *
 * @param {{primary?: string, fallbacks?: string[]}} route - Routing entry.
 * @returns {string[]} Model names; empty when the route is missing or has no
 *   usable primary.
 */
function getRouteModelCandidates(route) {
    if (!route || !isNonEmptyString(route.primary)) {
        return [];
    }

    const fallbacks = Array.isArray(route.fallbacks) ? route.fallbacks : [];

    // A Set keeps first-seen insertion order, which preserves priority.
    const ordered = new Set();
    for (const entry of [route.primary, ...fallbacks]) {
        if (isNonEmptyString(entry)) {
            ordered.add(entry.trim());
        }
    }

    return [...ordered];
}
224
+
225
/**
 * Normalises a model identifier for comparison: coerces to string, trims and
 * lower-cases. Falsy input becomes the empty string.
 *
 * @param {*} value - Model identifier.
 * @returns {string} Normalised identifier.
 */
function normalizeModelIdentifier(value) {
    const text = value ? String(value) : '';
    return text.trim().toLowerCase();
}
228
+
229
/**
 * Splits a model identifier into its normalised full form and its base name
 * (the part before the first ":").
 *
 * When the part before the first ":" is empty, the full identifier is used as
 * the base.
 *
 * @param {*} value - Model identifier such as "name:tag".
 * @returns {{full: string, base: string}} Both empty for blank input.
 */
function splitModelIdentifier(value) {
    const full = normalizeModelIdentifier(value);
    if (full === '') {
        return { full: '', base: '' };
    }

    const colonAt = full.indexOf(':');
    const head = colonAt === -1 ? full : full.slice(0, colonAt);
    return { full, base: head || full };
}
236
+
237
/**
 * Loosely compares two model identifiers.
 *
 * Two identifiers match when, after normalisation, they are identical, share
 * the same base name, or one starts with the other's base followed by ":".
 * Because base equality is sufficient, identifiers that differ only in their
 * tag (e.g. "name:a" and "name:b") are treated as matching.
 *
 * @param {*} left - First identifier.
 * @param {*} right - Second identifier.
 * @returns {boolean} False when either identifier is blank.
 */
function modelIdentifiersMatch(left, right) {
    const a = splitModelIdentifier(left);
    const b = splitModelIdentifier(right);

    if (a.full === '' || b.full === '') {
        return false;
    }

    return (
        a.full === b.full ||
        a.base === b.base ||
        a.full.startsWith(`${b.base}:`) ||
        b.full.startsWith(`${a.base}:`)
    );
}
249
+
250
/**
 * Chooses a model for a route, preferring the primary and then each fallback
 * in order.
 *
 * With no list of available models (not an array, or empty) the route's first
 * candidate is returned as-is. Otherwise the first route candidate that
 * matches an available model (per `modelIdentifiersMatch`) wins, and the
 * available model's own identifier is returned as `selectedModel`.
 *
 * @param {object} route - Routing entry with `primary` / `fallbacks`.
 * @param {Array} [availableModels=[]] - Identifiers of models that can be used.
 * @returns {{selectedModel: *, matchedRouteModel: string,
 *            usedFallback: boolean, routeCandidates: string[]} | null}
 *   Null when the route has no candidates or none of them is available.
 */
function selectModelFromRoute(route, availableModels = []) {
    const routeCandidates = getRouteModelCandidates(route);
    if (routeCandidates.length === 0) {
        return null;
    }

    const hasAvailability = Array.isArray(availableModels) && availableModels.length > 0;
    if (!hasAvailability) {
        const [preferred] = routeCandidates;
        return {
            selectedModel: preferred,
            matchedRouteModel: preferred,
            usedFallback: false,
            routeCandidates
        };
    }

    // routeCandidates is de-duplicated, so "not the first entry" is the same
    // as "not equal to the primary".
    for (let index = 0; index < routeCandidates.length; index += 1) {
        const routeModel = routeCandidates[index];
        const selectedModel = availableModels.find((candidate) =>
            modelIdentifiersMatch(routeModel, candidate)
        );

        if (selectedModel) {
            return {
                selectedModel,
                matchedRouteModel: routeModel,
                usedFallback: index !== 0,
                routeCandidates
            };
        }
    }

    return null;
}
280
+
281
/**
 * Decides which routing policy to use from the `--policy` / `--calibrated`
 * style options.
 *
 * Precedence:
 *   1. A non-empty `policyOption` wins. It is first tried as a calibration
 *      policy; if that fails it is handed to `loadEnterprisePolicy`. A
 *      warning is added when calibration was also requested.
 *   2. Otherwise, if calibration was requested with an explicit path, that
 *      file is loaded.
 *   3. Otherwise, if calibration was requested without a path, the default
 *      locations under `homeDir` are searched.
 * Load failures in steps 2 and 3 are reported through `warnings`, not thrown.
 *
 * @param {object} [params]
 * @param {*} [params.policyOption] - Path from the policy option.
 * @param {*} [params.calibratedOption] - Calibrated option; any value other
 *   than `undefined` or `false` counts as "requested", a non-empty string is
 *   treated as an explicit path.
 * @param {Function} [params.loadEnterprisePolicy] - Loader used when
 *   `policyOption` is not a valid calibration policy.
 * @param {string} [params.cwd=process.cwd()] - Base for relative paths.
 * @param {string} [params.homeDir=os.homedir()] - Base for default discovery.
 * @returns {{enterprisePolicy: *, calibratedPolicy: ({policyPath: string,
 *            policy: object, source: string} | null), warnings: string[]}}
 * @throws {Error} The calibration load error when `policyOption` cannot be
 *   loaded and `loadEnterprisePolicy` is not a function; also whatever
 *   `loadEnterprisePolicy` itself throws.
 */
function resolveRoutingPolicyPreference({
    policyOption,
    calibratedOption,
    loadEnterprisePolicy,
    cwd = process.cwd(),
    homeDir = os.homedir()
} = {}) {
    const result = {
        enterprisePolicy: null,
        calibratedPolicy: null,
        warnings: []
    };

    // Shapes a successful load attempt into the calibratedPolicy record.
    const asCalibratedPolicy = (attempt, source) => ({
        policyPath: attempt.policyPath,
        policy: attempt.policy,
        source
    });

    const calibratedRequested = !(calibratedOption === undefined || calibratedOption === false);

    if (isNonEmptyString(policyOption)) {
        const policyAttempt = tryLoadCalibrationPolicy(policyOption, { cwd });

        if (policyAttempt.ok) {
            result.calibratedPolicy = asCalibratedPolicy(policyAttempt, '--policy');
        } else if (typeof loadEnterprisePolicy === 'function') {
            result.enterprisePolicy = loadEnterprisePolicy(policyOption);
        } else {
            throw policyAttempt.error;
        }

        if (calibratedRequested) {
            result.warnings.push('Ignoring --calibrated because --policy takes precedence.');
        }
        return result;
    }

    if (!calibratedRequested) {
        return result;
    }

    if (isNonEmptyString(calibratedOption)) {
        const explicitAttempt = tryLoadCalibrationPolicy(calibratedOption, { cwd });

        if (explicitAttempt.ok) {
            result.calibratedPolicy = asCalibratedPolicy(explicitAttempt, '--calibrated');
        } else {
            result.warnings.push(
                `Unable to load calibrated policy from ${explicitAttempt.resolvedPath}: ${explicitAttempt.error.message}. Falling back to deterministic selector.`
            );
        }
        return result;
    }

    const discoveredPath = discoverDefaultCalibrationPolicyPath(homeDir);
    if (!discoveredPath) {
        result.warnings.push(
            'No default calibrated policy found at ~/.llm-checker/calibration-policy.{yaml,yml,json}. Falling back to deterministic selector.'
        );
        return result;
    }

    const discoveredAttempt = tryLoadCalibrationPolicy(discoveredPath, { cwd });
    if (discoveredAttempt.ok) {
        result.calibratedPolicy = asCalibratedPolicy(discoveredAttempt, 'default-discovery');
    } else {
        result.warnings.push(
            `Unable to load discovered calibrated policy ${discoveredAttempt.resolvedPath}: ${discoveredAttempt.error.message}. Falling back to deterministic selector.`
        );
    }
    return result;
}
362
+
363
// Public API of the calibration policy module. Helpers not listed here
// (e.g. isNonEmptyString, resolvePolicyPath, buildTaskCandidates) stay private.
module.exports = {
    DEFAULT_CALIBRATION_POLICY_FILENAMES,
    getDefaultCalibrationPolicyCandidates,
    discoverDefaultCalibrationPolicyPath,
    loadCalibrationPolicyFile,
    tryLoadCalibrationPolicy,
    normalizeTaskName,
    inferTaskFromPrompt,
    resolveCalibrationRoute,
    getRouteModelCandidates,
    modelIdentifiersMatch,
    selectModelFromRoute,
    resolveRoutingPolicyPreference
};
@@ -0,0 +1,212 @@
1
+ const { z } = require('zod');
2
+
3
// Values accepted by calibrationObjectiveSchema.
const SUPPORTED_CALIBRATION_OBJECTIVES = ['speed', 'quality', 'balanced'];
// Values accepted by calibrationExecutionModeSchema.
const SUPPORTED_CALIBRATION_EXECUTION_MODES = ['dry-run', 'contract-only', 'full'];
// Exported default task name; none of the schemas visible in this module reads it.
const DEFAULT_CALIBRATION_TASK = 'general';
6
+
7
// Reusable primitive validators shared by the object schemas in this module.

// String that is trimmed and must then contain at least one character.
const nonEmptyStringSchema = z.string().trim().min(1);
// Finite number >= 0.
const nonNegativeNumberSchema = z.number().finite().min(0);
// Finite number in the inclusive range 0..100.
const percentageScoreSchema = z.number().finite().min(0).max(100);
// Integer >= 0.
const nonNegativeIntegerSchema = z.number().int().min(0);
11
+
12
// ISO 8601 calendar date, optionally followed by "T" + time (seconds and
// fractional seconds optional) and an optional "Z" or "+hh:mm"/"-hh:mm" offset.
const ISO_TIMESTAMP_PATTERN =
    /^\d{4}-\d{2}-\d{2}(?:T\d{2}:\d{2}(?::\d{2}(?:\.\d+)?)?(?:Z|[+-]\d{2}:\d{2})?)?$/;

// Non-empty string that is a parseable ISO 8601 timestamp.
// Date.parse alone is not enough: for strings that are not in the ISO format
// it falls back to implementation-defined parsing, so values such as "1" or
// "March 7, 2020" would pass despite the "ISO timestamp" contract. The shape
// check enforces the format; Date.parse still rejects out-of-range fields
// (e.g. month 13).
const isoDateTimeSchema = nonEmptyStringSchema.refine(
    (value) => ISO_TIMESTAMP_PATTERN.test(value) && !Number.isNaN(Date.parse(value)),
    'Must be a valid ISO timestamp'
);
16
+
17
// Enum validators built from the supported-value lists above.
const calibrationObjectiveSchema = z.enum(SUPPORTED_CALIBRATION_OBJECTIVES);
const calibrationExecutionModeSchema = z.enum(SUPPORTED_CALIBRATION_EXECUTION_MODES);
// Runtime identifiers accepted in results and policy metadata.
const runtimeSchema = z.enum(['ollama', 'vllm', 'mlx']);
20
+
21
// One check attached to a prompt-suite entry: a check `type`, the required
// non-empty `expected` string, and an optional positive finite `weight`.
// Unknown keys are rejected (.strict()).
const promptSuiteCheckSchema = z
    .object({
        type: z.enum(['exact', 'contains', 'regex']),
        expected: nonEmptyStringSchema,
        weight: z.number().finite().positive().optional()
    })
    .strict();
28
+
29
// One prompt-suite entry. Only `prompt` is required; `id`, `task` and the
// list of `checks` are optional. Unknown keys are rejected (.strict()).
const promptSuiteEntrySchema = z
    .object({
        id: nonEmptyStringSchema.optional(),
        task: nonEmptyStringSchema.optional(),
        prompt: nonEmptyStringSchema,
        checks: z.array(promptSuiteCheckSchema).optional()
    })
    .strict();
37
+
38
// Performance metrics for one model. Every field is optional and, when
// present, must be a finite number >= 0. Units are implied only by the field
// name suffixes (_ms, _mb, per_second). Unknown keys are rejected.
const calibrationMetricsSchema = z
    .object({
        ttft_ms: nonNegativeNumberSchema.optional(),
        tokens_per_second: nonNegativeNumberSchema.optional(),
        latency_ms_p50: nonNegativeNumberSchema.optional(),
        latency_ms_p95: nonNegativeNumberSchema.optional(),
        peak_memory_mb: nonNegativeNumberSchema.optional()
    })
    .strict();
47
+
48
// Quality figures for one model. Scores are on a 0..100 scale while
// `check_pass_rate` is a 0..1 fraction; all fields are optional.
// `task_scores` maps non-empty string keys to 0..100 scores.
// Unknown keys are rejected (.strict()).
const calibrationQualitySchema = z
    .object({
        overall_score: percentageScoreSchema.optional(),
        task_scores: z.record(nonEmptyStringSchema, percentageScoreSchema).optional(),
        check_pass_rate: z.number().finite().min(0).max(1).optional()
    })
    .strict();
55
+
56
// Recorded outcome of a single check. It has the same `type` / `expected`
// fields as promptSuiteCheckSchema, but `weight` is required here, together
// with the boolean `passed`; `error` is an optional non-empty string.
// Unknown keys are rejected (.strict()).
const calibrationCheckResultTraceSchema = z
    .object({
        type: z.enum(['exact', 'contains', 'regex']),
        expected: nonEmptyStringSchema,
        weight: z.number().finite().positive(),
        passed: z.boolean(),
        error: nonEmptyStringSchema.optional()
    })
    .strict();
65
+
66
// Trace of one prompt execution: which prompt and task it belonged to,
// timing, output token count, an optional response excerpt (may be empty,
// since it is a plain z.string()), the per-check results and the 0..1
// `check_pass_rate`. Unknown keys are rejected (.strict()).
const calibrationPromptRunTraceSchema = z
    .object({
        prompt_id: nonEmptyStringSchema,
        task: nonEmptyStringSchema,
        latency_ms: nonNegativeNumberSchema,
        ttft_ms: nonNegativeNumberSchema.optional(),
        output_tokens: nonNegativeIntegerSchema,
        response_excerpt: z.string().optional(),
        check_results: z.array(calibrationCheckResultTraceSchema),
        check_pass_rate: z.number().finite().min(0).max(1)
    })
    .strict();
78
+
79
// Optional execution trace attached to a model result. All fields are
// optional; `measured_iterations`, when given, must be an integer >= 1.
// Unknown keys are rejected (.strict()).
const calibrationTraceSchema = z
    .object({
        warmup_runs: nonNegativeIntegerSchema.optional(),
        measured_iterations: z.number().int().min(1).optional(),
        prompt_runs: z.array(calibrationPromptRunTraceSchema).optional(),
        error_code: nonEmptyStringSchema.optional()
    })
    .strict();
87
+
88
// Result for one calibrated model. Beyond the per-field validation, two
// status-dependent rules apply: a "success" result must carry `metrics`, and
// a "failed" result must carry `error`. Unknown keys are rejected (.strict()).
const calibrationModelResultSchema = z
    .object({
        model_identifier: nonEmptyStringSchema,
        status: z.enum(['success', 'failed', 'skipped', 'pending']),
        metrics: calibrationMetricsSchema.optional(),
        quality: calibrationQualitySchema.optional(),
        traces: calibrationTraceSchema.optional(),
        error: nonEmptyStringSchema.optional()
    })
    .strict()
    .superRefine((result, context) => {
        // Each rule: when `status` matches, `field` must be present.
        const statusRequirements = [
            {
                status: 'success',
                field: 'metrics',
                message: 'metrics are required when status is success'
            },
            {
                status: 'failed',
                field: 'error',
                message: 'error is required when status is failed'
            }
        ];

        for (const { status, field, message } of statusRequirements) {
            if (result.status === status && !result[field]) {
                context.addIssue({
                    code: z.ZodIssueCode.custom,
                    path: [field],
                    message
                });
            }
        }
    });
115
+
116
// Metadata about the prompt suite used for a run: its path, the total prompt
// count, and a per-key breakdown mapping non-empty string keys to non-negative
// integer counts. The schema does not check that the breakdown sums to
// `total_prompts`. Unknown keys are rejected (.strict()).
const calibrationSuiteMetadataSchema = z
    .object({
        path: nonEmptyStringSchema,
        total_prompts: nonNegativeIntegerSchema,
        task_breakdown: z.record(nonEmptyStringSchema, nonNegativeIntegerSchema)
    })
    .strict();
123
+
124
// Per-status model counts for a calibration run. All counts are non-negative
// integers, and `total_models` must equal the sum of the four status counts.
// Unknown keys are rejected (.strict()).
const calibrationSummarySchema = z
    .object({
        total_models: nonNegativeIntegerSchema,
        successful_models: nonNegativeIntegerSchema,
        failed_models: nonNegativeIntegerSchema,
        skipped_models: nonNegativeIntegerSchema,
        pending_models: nonNegativeIntegerSchema
    })
    .strict()
    .superRefine((summary, context) => {
        const statusCounts = [
            summary.successful_models,
            summary.failed_models,
            summary.skipped_models,
            summary.pending_models
        ];
        const countedTotal = statusCounts.reduce((sum, count) => sum + count, 0);

        if (countedTotal === summary.total_models) {
            return;
        }

        context.addIssue({
            code: z.ZodIssueCode.custom,
            path: ['total_models'],
            message:
                'total_models must equal successful_models + failed_models + skipped_models + pending_models'
        });
    });
149
+
150
// Top-level calibration result document (schema version "1.0").
// `hardware` defaults to an empty object when omitted. The schema validates
// `models` and `summary` independently; it does not cross-check the summary
// counts against the `models` array. Unknown keys are rejected (.strict()).
const calibrationResultSchema = z
    .object({
        schema_version: z.literal('1.0'),
        generated_at: isoDateTimeSchema,
        calibration_version: nonEmptyStringSchema,
        execution_mode: calibrationExecutionModeSchema,
        runtime: runtimeSchema,
        objective: calibrationObjectiveSchema,
        hardware: z
            .object({
                fingerprint: nonEmptyStringSchema.optional(),
                description: nonEmptyStringSchema.optional()
            })
            .strict()
            .default({}),
        suite: calibrationSuiteMetadataSchema,
        models: z.array(calibrationModelResultSchema),
        summary: calibrationSummarySchema
    })
    .strict();
170
+
171
// One routing entry of a calibration policy: a required `primary` model, a
// required (possibly empty) array of `fallbacks`, and optional `min_quality`
// (0..100) and `rationale`. Unknown keys are rejected (.strict()).
const calibrationRouteSchema = z
    .object({
        primary: nonEmptyStringSchema,
        fallbacks: z.array(nonEmptyStringSchema),
        min_quality: percentageScoreSchema.optional(),
        rationale: nonEmptyStringSchema.optional()
    })
    .strict();
179
+
180
// Top-level calibration policy document (schema version "1.0").
// `routing` maps non-empty string keys to route entries; the schema itself
// does not require any particular key and accepts an empty mapping.
// `metadata` is optional. Unknown keys are rejected at every level (.strict()).
const calibrationPolicySchema = z
    .object({
        schema_version: z.literal('1.0'),
        generated_at: isoDateTimeSchema,
        objective: calibrationObjectiveSchema,
        source: z
            .object({
                calibration_version: nonEmptyStringSchema,
                calibration_result_path: nonEmptyStringSchema.optional()
            })
            .strict(),
        routing: z.record(nonEmptyStringSchema, calibrationRouteSchema),
        metadata: z
            .object({
                runtime: runtimeSchema.optional(),
                hardware_fingerprint: nonEmptyStringSchema.optional()
            })
            .strict()
            .optional()
    })
    .strict();
201
+
202
// Public API of the schema module. The primitive validators and the
// intermediate schemas (metrics, quality, traces, summary, route, ...) are
// intentionally not exported; they are reachable only through the result and
// policy schemas.
module.exports = {
    SUPPORTED_CALIBRATION_OBJECTIVES,
    SUPPORTED_CALIBRATION_EXECUTION_MODES,
    DEFAULT_CALIBRATION_TASK,
    calibrationObjectiveSchema,
    calibrationExecutionModeSchema,
    promptSuiteCheckSchema,
    promptSuiteEntrySchema,
    calibrationResultSchema,
    calibrationPolicySchema
};