@grekt/cli 6.42.1 → 6.43.0-beta.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +571 -8
- package/package.json +2 -2
package/dist/index.js
CHANGED
|
@@ -69100,25 +69100,43 @@ var __getProtoOf2 = Object.getPrototypeOf;
|
|
|
69100
69100
|
var __defProp2 = Object.defineProperty;
|
|
69101
69101
|
var __getOwnPropNames2 = Object.getOwnPropertyNames;
|
|
69102
69102
|
var __hasOwnProp2 = Object.prototype.hasOwnProperty;
|
|
69103
|
+
// Bundler runtime helper: shared getter body. It is always invoked with the
// source object bound as `this` (see `__accessProp2.bind(mod, key)`), and
// returns whatever that object currently holds under `key`.
function __accessProp2(key) {
  const source = this;
  return source[key];
}
|
|
69106
|
+
var __toESMCache_node2;
|
|
69107
|
+
var __toESMCache_esm2;
|
|
69103
69108
|
var __toESM2 = (mod, isNodeMode, target) => {
|
|
69109
|
+
var canCache = mod != null && typeof mod === "object";
|
|
69110
|
+
if (canCache) {
|
|
69111
|
+
var cache2 = isNodeMode ? __toESMCache_node2 ??= new WeakMap : __toESMCache_esm2 ??= new WeakMap;
|
|
69112
|
+
var cached = cache2.get(mod);
|
|
69113
|
+
if (cached)
|
|
69114
|
+
return cached;
|
|
69115
|
+
}
|
|
69104
69116
|
target = mod != null ? __create2(__getProtoOf2(mod)) : {};
|
|
69105
69117
|
const to = isNodeMode || !mod || !mod.__esModule ? __defProp2(target, "default", { value: mod, enumerable: true }) : target;
|
|
69106
69118
|
for (let key of __getOwnPropNames2(mod))
|
|
69107
69119
|
if (!__hasOwnProp2.call(to, key))
|
|
69108
69120
|
__defProp2(to, key, {
|
|
69109
|
-
get: (
|
|
69121
|
+
get: __accessProp2.bind(mod, key),
|
|
69110
69122
|
enumerable: true
|
|
69111
69123
|
});
|
|
69124
|
+
if (canCache)
|
|
69125
|
+
cache2.set(mod, to);
|
|
69112
69126
|
return to;
|
|
69113
69127
|
};
|
|
69114
69128
|
var __commonJS2 = (cb, mod) => () => (mod || cb((mod = { exports: {} }).exports, mod), mod.exports);
|
|
69129
|
+
var __returnValue2 = (v) => v;
|
|
69130
|
+
// Bundler runtime helper: setter installed on exported bindings. Invoked with
// the getter table bound as `this`; it swaps the entry for `name2` with a
// function that always returns `newValue`.
function __exportSetter2(name2, newValue) {
  const constantGetter = __returnValue2.bind(null, newValue);
  this[name2] = constantGetter;
}
|
|
69115
69133
|
var __export2 = (target, all) => {
|
|
69116
69134
|
for (var name2 in all)
|
|
69117
69135
|
__defProp2(target, name2, {
|
|
69118
69136
|
get: all[name2],
|
|
69119
69137
|
enumerable: true,
|
|
69120
69138
|
configurable: true,
|
|
69121
|
-
set: (
|
|
69139
|
+
set: __exportSetter2.bind(all, name2)
|
|
69122
69140
|
});
|
|
69123
69141
|
};
|
|
69124
69142
|
var __require2 = /* @__PURE__ */ createRequire2(import.meta.url);
|
|
@@ -85174,10 +85192,15 @@ var StoredSessionSchema = exports_external.object({
|
|
|
85174
85192
|
expires_at: exports_external.number().optional()
|
|
85175
85193
|
});
|
|
85176
85194
|
var TokensSchema = exports_external.record(exports_external.string(), exports_external.string());
|
|
85195
|
+
// Shape of the optional `eval` section of the local config.
// Both keys may be omitted; no constraints beyond the value types are applied.
var EvalLocalConfigSchema = exports_external.object({
  // List of provider identifiers (plain strings).
  providers: exports_external
    .array(exports_external.string())
    .optional(),
  // A single string; its meaning is not visible here (presumably a server address — confirm).
  server: exports_external
    .string()
    .optional()
});
|
|
85177
85199
|
var LocalConfigSchema = exports_external.object({
|
|
85178
85200
|
registries: exports_external.record(exports_external.string().regex(/^@/, "Registry scope must start with @"), RegistryEntrySchema).optional(),
|
|
85179
85201
|
session: StoredSessionSchema.optional(),
|
|
85180
|
-
tokens: TokensSchema.optional()
|
|
85202
|
+
tokens: TokensSchema.optional(),
|
|
85203
|
+
eval: EvalLocalConfigSchema.optional()
|
|
85181
85204
|
});
|
|
85182
85205
|
var ComponentTypeSchema = exports_external.enum(CATEGORIES);
|
|
85183
85206
|
var WorkspaceConfigSchema = exports_external.object({
|
|
@@ -90151,6 +90174,169 @@ function verifyTrustSignature(artifactId, signature, key) {
|
|
|
90151
90174
|
return false;
|
|
90152
90175
|
return timingSafeEqual(signatureBuffer, expectedBuffer);
|
|
90153
90176
|
}
|
|
90177
|
+
// Element types for which eval files are honoured.
var EVALUABLE_CATEGORIES = ["agents", "skills", "commands"];

// A single assertion inside a test case. Only `type` is required.
var EvalAssertionSchema = exports_external.object({
  type: exports_external.string(),
  // Either one string or a list of strings.
  value: exports_external
    .union([
      exports_external.string(),
      exports_external.array(exports_external.string())
    ])
    .optional(),
  threshold: exports_external.number().optional(),
  weight: exports_external.number().optional()
});

// One test case: string-to-string variables plus a non-empty assertion list.
var EvalTestCaseSchema = exports_external.object({
  description: exports_external.string().optional(),
  vars: exports_external.record(exports_external.string(), exports_external.string()),
  assert: exports_external
    .array(EvalAssertionSchema)
    .min(1, "At least one assertion is required")
});

// Whole eval file: an optional provider override and a non-empty test list.
var EvalFileConfigSchema = exports_external.object({
  provider: exports_external.string().optional(),
  tests: exports_external
    .array(EvalTestCaseSchema)
    .min(1, "At least one test case is required")
});

// File-name suffix that marks an eval definition.
var EVAL_EXTENSION = ".eval.yaml";
|
|
90194
|
+
/**
 * Recursively lists every non-directory entry below `dir`.
 *
 * @param fs12 Object exposing `readdir(path)` (iterable of entry names) and
 *   `stat(path)` (object whose `isDirectory` property is read as a boolean).
 * @param dir Directory to walk; joined to entry names with "/".
 * @param basePath Prefix for the returned relative paths ("" at the root).
 * @returns Relative paths in traversal order. A directory that cannot be
 *   listed contributes nothing, and an entry whose stat/recursion throws is
 *   skipped without reporting.
 */
function collectFiles2(fs12, dir, basePath = "") {
  const found = [];
  let names;
  try {
    names = fs12.readdir(dir);
  } catch {
    return found;
  }
  for (const name of names) {
    const absolute = `${dir}/${name}`;
    const relative = basePath ? `${basePath}/${name}` : name;
    try {
      if (!fs12.stat(absolute).isDirectory) {
        found.push(relative);
        continue;
      }
      const nested = collectFiles2(fs12, absolute, relative);
      found.push(...nested);
    } catch {
      // Best-effort walk: unreadable entries are intentionally ignored.
    }
  }
  return found;
}
|
|
90216
|
+
/**
 * Finds every `*.eval.yaml` file under an artifact directory and pairs it with
 * the markdown file of the same base name.
 *
 * For each eval file, in order: the sibling `.md` must exist, be readable and
 * have parseable frontmatter; its `grk-type` must be one of
 * EVALUABLE_CATEGORIES; the eval YAML must be readable and satisfy
 * EvalFileConfigSchema. Any failed step records a warning and skips the file.
 * The optional filter is applied last, so files excluded by the filter can
 * still produce warnings.
 *
 * @param fs12 File-system object providing `readdir`, `stat`, `exists`, `readFile`.
 * @param options2 `{ artifactDir, artifactId, filter }`; `filter` may carry
 *   `elementName` and/or `elementType`.
 * @returns `{ evals, warnings }`.
 */
function discoverEvals(fs12, options2) {
  const { artifactDir, artifactId, filter } = options2;
  const evals = [];
  const warnings = [];
  // Every warning shares the "Skipped <file>: <reason>" layout.
  const skip = (evalFilePath, reason) => {
    warnings.push({
      evalFilePath,
      message: `Skipped ${evalFilePath}: ${reason}`
    });
  };
  const candidates = collectFiles2(fs12, artifactDir).filter((path) => path.endsWith(EVAL_EXTENSION));
  for (const evalFilePath of candidates) {
    const stem = evalFilePath.slice(0, -EVAL_EXTENSION.length);
    const elementPath = `${stem}.md`;
    const markdownFullPath = `${artifactDir}/${elementPath}`;
    if (!fs12.exists(markdownFullPath)) {
      skip(evalFilePath, `no matching .md file found (expected ${elementPath})`);
      continue;
    }
    let markdown;
    try {
      markdown = fs12.readFile(markdownFullPath);
    } catch {
      skip(evalFilePath, `could not read ${elementPath}`);
      continue;
    }
    const parsedMarkdown = parseFrontmatter(markdown);
    if (!parsedMarkdown.success) {
      skip(evalFilePath, `${elementPath} has invalid or missing frontmatter`);
      continue;
    }
    const { frontmatter: frontmatter2, content: systemPrompt } = parsedMarkdown.parsed;
    const elementType = frontmatter2["grk-type"];
    const elementName = frontmatter2["grk-name"];
    if (!EVALUABLE_CATEGORIES.includes(elementType)) {
      skip(evalFilePath, `${elementType} is not evaluable (only agents, skills, commands)`);
      continue;
    }
    let rawYaml;
    try {
      rawYaml = fs12.readFile(`${artifactDir}/${evalFilePath}`);
    } catch {
      skip(evalFilePath, "file could not be read");
      continue;
    }
    const parsedYaml = safeParseYaml(rawYaml, EvalFileConfigSchema, evalFilePath);
    if (!parsedYaml.success) {
      const details = parsedYaml.error.details?.join(", ") ?? "";
      skip(evalFilePath, `${parsedYaml.error.message}${details ? ` (${details})` : ""}`);
      continue;
    }
    const nameExcluded = filter?.elementName && elementName !== filter.elementName;
    const typeExcluded = filter?.elementType && elementType !== filter.elementType;
    if (nameExcluded || typeExcluded) {
      continue;
    }
    evals.push({
      artifactId,
      elementName,
      elementType,
      elementPath,
      systemPrompt: systemPrompt.trim(),
      evalConfig: parsedYaml.data,
      evalFilePath
    });
  }
  return { evals, warnings };
}
|
|
90298
|
+
/**
 * Converts a pass count into a whole-number percentage.
 *
 * @param passed Number of passing tests.
 * @param total Number of tests run.
 * @returns `passed / total` as a rounded percentage, or 0 when `total` is 0.
 */
function calculateScore(passed, total) {
  return total === 0 ? 0 : Math.round((passed / total) * 100);
}
|
|
90303
|
+
/**
 * Maps a numeric score to a letter grade.
 *
 * @param score Score to classify.
 * @returns "A" (>= 95), "B" (>= 80), "C" (>= 65), "D" (>= 50), otherwise "F".
 */
function scoreToGrade(score) {
  // Ordered from the highest threshold down; the first match wins.
  const bands = [
    [95, "A"],
    [80, "B"],
    [65, "C"],
    [50, "D"]
  ];
  for (const [minimum, letter] of bands) {
    if (score >= minimum) {
      return letter;
    }
  }
  return "F";
}
|
|
90314
|
+
/**
 * Aggregates per-element eval results into one summary.
 *
 * The overall score is the average of each result's `score` weighted by its
 * `total`, rounded to an integer; it is 0 when no tests ran.
 *
 * @param results Array of objects with `passed`, `total`, `score`, `failures`.
 * @returns `{ results, overallScore, overallGrade, totalPassed, totalTests, totalIssues }`.
 *   For an empty input a fresh empty `results` array and grade "F" are returned.
 */
function summarizeResults(results) {
  if (results.length === 0) {
    return {
      results: [],
      overallScore: 0,
      overallGrade: "F",
      totalPassed: 0,
      totalTests: 0,
      totalIssues: 0
    };
  }
  let totalPassed = 0;
  let totalTests = 0;
  let totalIssues = 0;
  let weightedSum = 0;
  for (const entry of results) {
    totalPassed += entry.passed;
    totalTests += entry.total;
    totalIssues += entry.failures.length;
    weightedSum += entry.score * entry.total;
  }
  const overallScore = totalTests > 0 ? Math.round(weightedSum / totalTests) : 0;
  return {
    results,
    overallScore,
    overallGrade: scoreToGrade(overallScore),
    totalPassed,
    totalTests,
    totalIssues
  };
}
|
|
90154
90340
|
|
|
90155
90341
|
// src/constants.ts
|
|
90156
90342
|
var REGISTRY_HOST2 = "registry.grekt.com";
|
|
@@ -104184,6 +104370,382 @@ var untrustCommand = new Command("untrust").description("Remove trusted status f
|
|
|
104184
104370
|
success(`Removed trusted status from ${colors5.highlight(artifactId)}`);
|
|
104185
104371
|
});
|
|
104186
104372
|
|
|
104373
|
+
// src/commands/eval.ts
|
|
104374
|
+
import { join as join35 } from "path";
|
|
104375
|
+
|
|
104376
|
+
// src/eval/promptfoo-engine.ts
|
|
104377
|
+
/**
 * Probes how promptfoo can be launched on this machine.
 *
 * Tries `promptfoo --version` first, then `npx promptfoo --version`; the first
 * probe that exits with code 0 decides the mode. A probe that throws or exits
 * non-zero is treated as "not available this way".
 *
 * @returns "global", "npx" or "none".
 */
function detectPromptfoo() {
  const probes = [
    ["global", ["promptfoo", "--version"]],
    ["npx", ["npx", "promptfoo", "--version"]]
  ];
  for (const [mode, argv] of probes) {
    try {
      const probe = Bun.spawnSync(argv, { stdout: "pipe", stderr: "pipe" });
      if (probe.exitCode === 0) {
        return mode;
      }
    } catch {
      // Fall through to the next probe.
    }
  }
  return "none";
}
|
|
104390
|
+
/**
 * Returns the argv prefix used to launch promptfoo.
 *
 * @param mode Detection mode; only "npx" selects the npx launcher.
 * @returns `["npx", "promptfoo"]` for "npx", otherwise `["promptfoo"]`.
 */
function getCommand(mode) {
  if (mode === "npx") {
    return ["npx", "promptfoo"];
  }
  return ["promptfoo"];
}
|
|
104393
|
+
/**
 * Builds the promptfoo configuration object for one element.
 *
 * The prompt is a JSON-encoded two-message chat: the element's system prompt
 * followed by a user message containing the `{{input}}` placeholder.
 *
 * @param config `{ systemPrompt, provider, tests }`, where each test has
 *   `description`, `vars` and `assert` (array of assertions).
 * @returns `{ prompts, providers, tests }`; assertion keys `value`,
 *   `threshold` and `weight` are copied only when they are not `undefined`.
 */
function assemblePromptfooConfig(config) {
  const chatMessages = [
    { role: "system", content: config.systemPrompt },
    { role: "user", content: "{{input}}" }
  ];
  const copyAssertion = (source) => {
    const copy = { type: source.type };
    for (const key of ["value", "threshold", "weight"]) {
      if (source[key] !== undefined) {
        copy[key] = source[key];
      }
    }
    return copy;
  };
  const prompts = [{ raw: JSON.stringify(chatMessages) }];
  const providers = [config.provider];
  const tests = config.tests.map((test) => {
    return {
      description: test.description,
      vars: test.vars,
      assert: test.assert.map(copyAssertion)
    };
  });
  return { prompts, providers, tests };
}
|
|
104416
|
+
/**
 * Reduces promptfoo result rows to pass/total counts and a flat failure list.
 *
 * Every row counts as one test. For a failed row, one failure entry is
 * produced per component result whose `pass` is falsy. If that yields nothing
 * (no component results at all, or every component passed even though the row
 * failed) a single fallback entry is recorded, so a failed test is never
 * invisible in `failures`.
 *
 * @param promptfooResults Iterable of rows with `success`, and optionally
 *   `description`, `gradingResult` and `error`.
 * @returns `{ passed, total, failures }`, each failure being
 *   `{ testDescription, assertionType, expected, actual }` (all strings).
 */
function extractFailures(promptfooResults) {
  let passed = 0;
  let total = 0;
  const failures = [];
  for (const row of promptfooResults) {
    total++;
    if (row.success) {
      passed++;
      continue;
    }
    const testDescription = row.description ?? `Test ${total}`;
    const gradingResult = row.gradingResult;
    const failedComponents = (gradingResult?.componentResults ?? []).filter((component) => !component.pass);
    for (const component of failedComponents) {
      const assertion = component.assertion;
      failures.push({
        testDescription,
        assertionType: assertion?.type ?? "unknown",
        expected: String(assertion?.value ?? ""),
        actual: String(component.reason ?? "")
      });
    }
    if (failedComponents.length === 0) {
      // Nothing pinpoints the failure: fall back to the row-level reason.
      // `row.error` is assumed to carry provider/runtime errors — confirm
      // against the promptfoo result type.
      failures.push({
        testDescription,
        assertionType: "unknown",
        expected: "",
        actual: String(gradingResult?.reason ?? row.error ?? "unknown error")
      });
    }
  }
  return { passed, total, failures };
}
|
|
104452
|
+
/**
 * Creates the promptfoo-backed eval engine.
 *
 * The launch mode ("global", "npx" or "none") is detected once at creation
 * and refreshed by `ensureAvailable()`.
 *
 * @returns Engine object with `name`, `isAvailable()`, `ensureAvailable()`,
 *   `run(config)` and `openReport()`.
 */
function createPromptfooEngine() {
  let mode = detectPromptfoo();
  return {
    name: "promptfoo",
    /** True when promptfoo was detected in any launch mode. */
    isAvailable() {
      return mode !== "none";
    },
    /**
     * Tries to fetch promptfoo through npx when it was not detected.
     * Resolves to whether promptfoo is usable afterwards; never rejects
     * because the download process could not be spawned.
     */
    async ensureAvailable() {
      if (mode !== "none")
        return true;
      const spin = spinner("Downloading promptfoo via npx...");
      spin.start();
      try {
        Bun.spawnSync(["npx", "promptfoo@latest", "--version"], { stdout: "pipe", stderr: "pipe" });
      } catch {
        // Spawning itself failed; report "not available" so the caller can
        // print its manual-install hint instead of crashing.
        return false;
      } finally {
        // Always release the spinner, even when the spawn threw.
        spin.stop();
      }
      mode = detectPromptfoo();
      return mode !== "none";
    },
    /**
     * Runs the tests in `config` and returns `{ passed, total, failures }`.
     * In "global" mode the importable promptfoo module is tried first; any
     * problem there falls back to the CLI.
     */
    async run(config) {
      const promptfooConfig = assemblePromptfooConfig(config);
      if (mode === "global") {
        try {
          const promptfoo = await import("promptfoo");
          const evaluate = promptfoo.evaluate;
          if (typeof evaluate === "function") {
            const evaluateResult = await evaluate(promptfooConfig);
            const results = evaluateResult.results ?? [];
            return extractFailures(results);
          }
        } catch {
          // Deliberate fallback: the CLI path below is used instead.
        }
      }
      return runViaCli(mode, promptfooConfig);
    },
    /** Launches `promptfoo view`, sharing this process's stdout/stderr. */
    openReport() {
      const cmd = getCommand(mode);
      Bun.spawnSync([...cmd, "view"], { stdout: "inherit", stderr: "inherit" });
    }
  };
}
|
|
104490
|
+
/**
 * Runs promptfoo through its command line and parses the JSON report.
 *
 * The config is written to a private temporary directory, which is removed
 * again whether or not the run succeeds.
 *
 * @param mode Launch mode passed to `getCommand`.
 * @param promptfooConfig Config object serialised to `promptfoo-config.json`.
 * @returns The `{ passed, total, failures }` object built by `extractFailures`.
 * @throws Error when promptfoo exits with a code that signals neither success
 *   nor "some tests failed".
 */
async function runViaCli(mode, promptfooConfig) {
  // Exit code promptfoo uses for "at least one test failed" (documented
  // default; pinned below through PROMPTFOO_FAILED_TEST_EXIT_CODE so a user
  // override cannot make failing tests look like a crash).
  const failedTestsExitCode = 100;
  const { mkdtempSync: makeTempDir, writeFileSync: writeFileSync2, readFileSync: readFileSync2, rmSync: rmSync2 } = await import("fs");
  const { tmpdir: systemTempDir } = await import("os");
  const { join: joinPath } = await import("path");
  // mkdtemp gives a unique, unpredictable directory on every platform instead
  // of a guessable "<TMPDIR>/grekt-eval-<timestamp>" path.
  const tempDir = makeTempDir(joinPath(systemTempDir(), "grekt-eval-"));
  const configPath = joinPath(tempDir, "promptfoo-config.json");
  const outputPath = joinPath(tempDir, "output.json");
  try {
    writeFileSync2(configPath, JSON.stringify(promptfooConfig, null, 2));
    const cmd = getCommand(mode);
    const result = Bun.spawnSync([...cmd, "eval", "--config", configPath, "--output", outputPath, "--no-cache"], {
      stdout: "pipe",
      stderr: "pipe",
      env: { ...process.env, PROMPTFOO_FAILED_TEST_EXIT_CODE: String(failedTestsExitCode) }
    });
    if (result.exitCode !== 0 && result.exitCode !== failedTestsExitCode) {
      const stderr = result.stderr.toString();
      throw new Error(`promptfoo eval failed: ${stderr}`);
    }
    const output = JSON.parse(readFileSync2(outputPath, "utf-8"));
    // The report may expose the rows directly (`results: [...]`) or nested in
    // a summary object (`results: { results: [...] }`); accept both.
    const results = Array.isArray(output.results) ? output.results : output.results?.results ?? [];
    return extractFailures(results);
  } finally {
    rmSync2(tempDir, { recursive: true, force: true });
  }
}
|
|
104511
|
+
|
|
104512
|
+
// src/eval/engine-resolver.ts
|
|
104513
|
+
// Registry of supported eval engines; `create` builds a fresh engine instance.
var AVAILABLE_ENGINES = [
  {
    name: "promptfoo",
    description: "Open source LLM eval framework. Runs via npx if not installed",
    create: createPromptfooEngine
  }
];

/**
 * Picks the eval engine to use.
 *
 * Returns the first engine that reports itself available. When none is, the
 * user is asked to choose one, and that engine gets a chance to set itself up
 * through `ensureAvailable()`.
 *
 * Each engine is constructed only once: the instance probed in the first pass
 * is reused after the prompt, so its availability detection is not repeated.
 *
 * @returns The ready engine, or `null` when the selection is unknown or the
 *   setup failed (manual install hints are printed in the latter case).
 */
async function resolveEvalEngine() {
  const probedEngines = new Map();
  for (const entry2 of AVAILABLE_ENGINES) {
    const engine3 = entry2.create();
    if (engine3.isAvailable())
      return engine3;
    probedEngines.set(entry2.name, engine3);
  }
  newline();
  info("No eval engine detected");
  newline();
  const selected = await esm_default6({
    message: "Select an eval engine to use:",
    choices: AVAILABLE_ENGINES.map((entry2) => ({
      name: `${entry2.name} - ${entry2.description}`,
      value: entry2.name
    }))
  });
  const entry = AVAILABLE_ENGINES.find((e) => e.name === selected);
  if (!entry)
    return null;
  const engine2 = probedEngines.get(entry.name) ?? entry.create();
  const ready = await engine2.ensureAvailable();
  if (!ready) {
    newline();
    error(`Failed to set up ${entry.name}`);
    newline();
    log(" Install it manually with one of:");
    log(` ${colors5.dim("$")} npm install -g ${entry.name}`);
    log(` ${colors5.dim("$")} brew install ${entry.name}`);
    return null;
  }
  return engine2;
}
|
|
104552
|
+
|
|
104553
|
+
// src/eval/runner.ts
|
|
104554
|
+
/**
 * Runs the eval of one discovered element and grades it.
 *
 * @param discovered Entry produced by eval discovery (`artifactId`,
 *   `elementName`, `elementType`, `systemPrompt`, `evalConfig`).
 * @param options2 `{ engine, defaultProvider }`; the eval file's own
 *   `provider` wins over `defaultProvider` unless it is null/undefined.
 * @returns Identification fields plus `passed`, `total`, `score`, `grade`
 *   and `failures`.
 */
async function runEval(discovered, options2) {
  const { artifactId, elementName, elementType, systemPrompt, evalConfig } = discovered;
  const outcome = await options2.engine.run({
    systemPrompt,
    tests: evalConfig.tests,
    provider: evalConfig.provider ?? options2.defaultProvider
  });
  const { passed, total, failures } = outcome;
  const score = calculateScore(passed, total);
  return {
    artifactId,
    elementName,
    elementType,
    passed,
    total,
    score,
    grade: scoreToGrade(score),
    failures
  };
}
|
|
104574
|
+
/**
 * Runs every discovered eval one after another.
 *
 * `options2.onProgress`, when provided, is called before each eval with
 * `(index, count, "<elementType>/<elementName>")` and once at the end with
 * `(count, count, "done")`.
 *
 * @param discovered Array of discovered evals.
 * @param options2 Options forwarded to `runEval`, plus optional `onProgress`.
 * @returns Results in the same order as `discovered`.
 */
async function runAllEvals(discovered, options2) {
  const collected = [];
  for (let position = 0; position < discovered.length; position++) {
    const current = discovered[position];
    options2.onProgress?.(position, discovered.length, `${current.elementType}/${current.elementName}`);
    collected.push(await runEval(current, options2));
  }
  options2.onProgress?.(discovered.length, discovered.length, "done");
  return collected;
}
|
|
104584
|
+
|
|
104585
|
+
// src/eval/display.ts
|
|
104586
|
+
/**
 * Chooses the colour function used to print a grade.
 *
 * @param grade Letter grade.
 * @returns `colors5.success` for A/B, `colors5.warning` for C/D, and
 *   `colors5.error` for F or anything else.
 */
function gradeColor(grade) {
  if (grade === "A" || grade === "B") {
    return colors5.success;
  }
  if (grade === "C" || grade === "D") {
    return colors5.warning;
  }
  return colors5.error;
}
|
|
104599
|
+
/**
 * Builds the dimmed filler printed between an element name and its result.
 *
 * The filler is sized so that name + filler is always `maxWidth + 1`
 * characters for any name up to `maxWidth` long, which keeps the result
 * column aligned across rows. Longer names get a single space.
 *
 * @param name2 Text printed before the filler.
 * @param maxWidth Width of the longest name in the listing.
 * @returns The filler passed through `colors5.dim`.
 */
function formatDots(name2, maxWidth) {
  const gap = maxWidth - name2.length;
  // One space on each side of the dots; with no room for dots at all, a
  // single space still separates the two columns.
  const filler = gap >= 1 ? ` ${".".repeat(gap - 1)} ` : " ";
  return colors5.dim(filler);
}
|
|
104604
|
+
/**
 * Prints the per-artifact result table followed by the overall grade.
 *
 * Results are grouped by `artifactId` in first-seen order. When the summary
 * reports issues, the issue count and a hint about `--details` are printed
 * as well.
 *
 * @param summary Object with `results`, `overallGrade`, `overallScore`
 *   and `totalIssues`.
 */
function displaySummary(summary) {
  const labelOf = (entry) => `${entry.elementType}/${entry.elementName}`;
  const grouped = new Map;
  for (const entry of summary.results) {
    if (!grouped.has(entry.artifactId)) {
      grouped.set(entry.artifactId, []);
    }
    grouped.get(entry.artifactId).push(entry);
  }
  // Column width: the longest label, but never narrower than 20.
  const maxNameWidth = summary.results.reduce((widest, entry) => Math.max(widest, labelOf(entry).length), 20);
  grouped.forEach((entries, artifactId) => {
    log(colors5.bold(artifactId));
    entries.forEach((entry) => {
      const elementPath = labelOf(entry);
      const dots = formatDots(elementPath, maxNameWidth);
      const passText = `${entry.passed}/${entry.total} passed`;
      const paint = gradeColor(entry.grade);
      const gradeText = paint(entry.grade);
      log(` ${elementPath}${dots}${passText} ${gradeText}`);
    });
    newline();
  });
  const paintOverall = gradeColor(summary.overallGrade);
  log(`Overall: ${paintOverall(summary.overallGrade)} (${summary.overallScore}/100)`);
  if (summary.totalIssues <= 0) {
    return;
  }
  log(`${summary.totalIssues} issue${summary.totalIssues === 1 ? "" : "s"} found`);
  log(`Run ${colors5.highlight("grekt eval --details")} for more info`);
}
|
|
104630
|
+
/**
 * Prints the summary table and then, for every result that has failures, the
 * failing test descriptions with assertion type, expected and actual values.
 * Empty `expected` / `actual` values are left out.
 *
 * @param summary Same summary object accepted by `displaySummary`.
 */
function displayDetails(summary) {
  displaySummary(summary);
  const withFailures = summary.results.filter((entry) => entry.failures.length > 0);
  if (withFailures.length === 0) {
    return;
  }
  newline();
  log(colors5.bold("Failures:"));
  newline();
  withFailures.forEach((entry) => {
    const heading = colors5.bold(`${entry.elementType}/${entry.elementName}`);
    log(` ${heading} (${entry.artifactId})`);
    entry.failures.forEach(({ testDescription, assertionType, expected, actual }) => {
      log(` ${symbols.error} ${testDescription}`);
      log(` ${colors5.dim("assertion:")} ${assertionType}`);
      if (expected) {
        log(` ${colors5.dim("expected:")} ${expected}`);
      }
      if (actual) {
        log(` ${colors5.dim("actual:")} ${actual}`);
      }
    });
    newline();
  });
}
|
|
104653
|
+
/**
 * Prints the summary as JSON indented with two spaces.
 *
 * @param summary Any JSON-serialisable summary object.
 */
function displayJson(summary) {
  const serialized = JSON.stringify(summary, null, 2);
  log(serialized);
}
|
|
104656
|
+
|
|
104657
|
+
// src/commands/eval.ts
|
|
104658
|
+
/**
 * Translates the `--skill`, `--agent` and `--command` options into a
 * discovery filter.
 *
 * The options are checked in that order and the first truthy one wins.
 *
 * @param options2 Parsed command options.
 * @returns `{ elementName, elementType }`, or `undefined` when none is set.
 */
function buildFilter(options2) {
  const selectors = [
    ["skill", "skills"],
    ["agent", "agents"],
    ["command", "commands"]
  ];
  for (const [flag, elementType] of selectors) {
    const elementName = options2[flag];
    if (elementName) {
      return { elementName, elementType };
    }
  }
  return undefined;
}
|
|
104667
|
+
/**
 * Returns the default eval provider from the local config.
 *
 * The first entry of `eval.providers` is used. When it is missing or falsy,
 * setup instructions are printed and the process exits with status 1.
 *
 * @param projectRoot Project directory handed to `getLocalConfig`.
 * @returns The configured provider string.
 */
function requireProvider(projectRoot) {
  const configured = getLocalConfig(projectRoot)?.eval?.providers?.[0];
  if (!configured) {
    error("No eval provider configured");
    newline();
    log(" Add a provider to .grekt/config.yaml:");
    newline();
    for (const sampleLine of ["eval:", " providers:", " - openai:gpt-4.1-mini"]) {
      log(` ${colors5.dim(sampleLine)}`);
    }
    newline();
    return process.exit(1);
  }
  return configured;
}
|
|
104684
|
+
// `grekt eval`: discovers eval files of the installed artifacts, runs them
// through the resolved engine and prints a summary.
//
// Exit status: 1 when no engine could be resolved, when no provider is
// configured, or when any test failed; 0 when there is nothing to run.
var evalCommand = new Command("eval")
  .description("Run eval tests against artifact elements (skills, agents, commands)")
  .option("--artifact <name>", "Run evals for a specific artifact only")
  .option("--skill <name>", "Run evals for a specific skill only")
  .option("--agent <name>", "Run evals for a specific agent only")
  .option("--command <name>", "Run evals for a specific command only")
  .option("--details", "Show failure details")
  .option("--report", "Open eval dashboard in browser")
  .option("--format <format>", "Output format: text (default), json")
  .action(async (options2) => {
    const projectRoot = process.cwd();
    requireInitialized(projectRoot);
    const engine2 = await resolveEvalEngine();
    if (!engine2)
      process.exit(1);
    // --report only opens the engine's dashboard; no evals are run.
    if (options2.report) {
      engine2.openReport?.();
      return;
    }
    const defaultProvider = requireProvider(projectRoot);
    const lockfile = getLockfile2(projectRoot);
    const artifactIds = Object.keys(lockfile.artifacts);
    if (artifactIds.length === 0) {
      info("No artifacts installed");
      process.exit(0);
    }
    const filter = buildFilter(options2);
    const allDiscovered = [];
    const allWarnings = [];
    for (const artifactId of artifactIds) {
      if (options2.artifact && artifactId !== options2.artifact)
        continue;
      const artifactDir = join35(projectRoot, ARTIFACTS_DIR, artifactId);
      const result = discoverEvals(fs, { artifactDir, artifactId, filter });
      allDiscovered.push(...result.evals);
      allWarnings.push(...result.warnings);
    }
    for (const w of allWarnings) {
      warning(w.message);
    }
    if (allDiscovered.length === 0) {
      info("No eval files found");
      if (!filter && !options2.artifact) {
        info("Create a .eval.yaml file next to any skill, agent, or command");
      }
      process.exit(0);
    }
    const spin = spinner("Running evals...");
    spin.start();
    // Guard so the spinner is stopped exactly once, on success or failure.
    let spinning = true;
    const stopSpinner = () => {
      if (spinning) {
        spinning = false;
        spin.stop();
      }
    };
    let results;
    try {
      results = await runAllEvals(allDiscovered, {
        engine: engine2,
        defaultProvider,
        onProgress(completed, total, current) {
          if (current === "done") {
            stopSpinner();
          } else {
            spin.text = `Running evals... (${completed + 1}/${total}) ${current}`;
          }
        }
      });
    } finally {
      // A rejected run must not leave the spinner animating while the error
      // propagates to the caller.
      stopSpinner();
    }
    const summary = summarizeResults(results);
    newline();
    if (options2.format === "json") {
      displayJson(summary);
    } else if (options2.details) {
      displayDetails(summary);
    } else {
      displaySummary(summary);
    }
    // Fail on any failed test, even one that produced no failure entry.
    if (summary.totalIssues > 0 || summary.totalPassed < summary.totalTests) {
      process.exit(1);
    }
  });
|
|
104748
|
+
|
|
104187
104749
|
// src/auth/oauth/oauth.ts
|
|
104188
104750
|
import { spawn } from "child_process";
|
|
104189
104751
|
import { randomUUID as randomUUID4 } from "crypto";
|
|
@@ -104409,7 +104971,7 @@ var whoamiCommand = new Command("whoami").description("Show current user").actio
|
|
|
104409
104971
|
// package.json
|
|
104410
104972
|
var package_default = {
|
|
104411
104973
|
name: "@grekt/cli",
|
|
104412
|
-
version: "6.
|
|
104974
|
+
version: "6.43.0-beta.2",
|
|
104413
104975
|
description: "AI tools versioned, synced, and shared across tools and teams",
|
|
104414
104976
|
type: "module",
|
|
104415
104977
|
bin: {
|
|
@@ -104444,7 +105006,7 @@ var package_default = {
|
|
|
104444
105006
|
},
|
|
104445
105007
|
dependencies: {
|
|
104446
105008
|
"@aws-sdk/client-s3": "^3.971.0",
|
|
104447
|
-
"@grekt/engine": "6.
|
|
105009
|
+
"@grekt/engine": "6.2.0-beta.1",
|
|
104448
105010
|
"@inquirer/prompts": "^7.2.0",
|
|
104449
105011
|
"@supabase/supabase-js": "^2.91.0",
|
|
104450
105012
|
chalk: "^5.4.1",
|
|
@@ -104478,13 +105040,13 @@ var package_default = {
|
|
|
104478
105040
|
// src/update-check/update-check.ts
|
|
104479
105041
|
import { existsSync as existsSync2, mkdirSync as mkdirSync3, readFileSync as readFileSync2, writeFileSync as writeFileSync2 } from "fs";
|
|
104480
105042
|
import { homedir as homedir3 } from "os";
|
|
104481
|
-
import { join as
|
|
105043
|
+
import { join as join36 } from "path";
|
|
104482
105044
|
var CACHE_FILENAME = ".update-check";
|
|
104483
105045
|
var STALENESS_MS = 24 * 60 * 60 * 1000;
|
|
104484
105046
|
var FETCH_TIMEOUT_MS = 1500;
|
|
104485
105047
|
var GITHUB_RELEASES_URL = "https://api.github.com/repos/grekt-labs/cli/releases/latest";
|
|
104486
105048
|
function getCachePath() {
|
|
104487
|
-
return
|
|
105049
|
+
return join36(homedir3(), ".grekt", CACHE_FILENAME);
|
|
104488
105050
|
}
|
|
104489
105051
|
function isOptedOut() {
|
|
104490
105052
|
return process.env.GREKT_NO_UPDATE_CHECK === "1";
|
|
@@ -104503,7 +105065,7 @@ function readCache() {
|
|
|
104503
105065
|
}
|
|
104504
105066
|
function writeCache(cache2) {
|
|
104505
105067
|
try {
|
|
104506
|
-
const dir =
|
|
105068
|
+
const dir = join36(homedir3(), ".grekt");
|
|
104507
105069
|
if (!existsSync2(dir)) {
|
|
104508
105070
|
mkdirSync3(dir, { recursive: true });
|
|
104509
105071
|
}
|
|
@@ -104617,6 +105179,7 @@ program2.addCommand(worktreeCommand);
|
|
|
104617
105179
|
program2.addCommand(scanCommand);
|
|
104618
105180
|
program2.addCommand(trustCommand);
|
|
104619
105181
|
program2.addCommand(untrustCommand);
|
|
105182
|
+
program2.addCommand(evalCommand);
|
|
104620
105183
|
setupUpdateCheck(package_default.version);
|
|
104621
105184
|
try {
|
|
104622
105185
|
await program2.parseAsync();
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@grekt/cli",
|
|
3
|
-
"version": "6.
|
|
3
|
+
"version": "6.43.0-beta.2",
|
|
4
4
|
"description": "AI tools versioned, synced, and shared across tools and teams",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
@@ -35,7 +35,7 @@
|
|
|
35
35
|
},
|
|
36
36
|
"dependencies": {
|
|
37
37
|
"@aws-sdk/client-s3": "^3.971.0",
|
|
38
|
-
"@grekt/engine": "6.
|
|
38
|
+
"@grekt/engine": "6.2.0-beta.1",
|
|
39
39
|
"@inquirer/prompts": "^7.2.0",
|
|
40
40
|
"@supabase/supabase-js": "^2.91.0",
|
|
41
41
|
"chalk": "^5.4.1",
|