@mutagent/cli 0.1.36 → 0.1.37
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/bin/cli.js +134 -5
- package/dist/bin/cli.js.map +5 -4
- package/dist/index.js +134 -5
- package/dist/index.js.map +5 -4
- package/package.json +1 -1
package/dist/bin/cli.js
CHANGED
|
@@ -317,6 +317,125 @@ var init_errors = __esm(() => {
|
|
|
317
317
|
};
|
|
318
318
|
});
|
|
319
319
|
|
|
320
|
+
// src/lib/scorecard-extraction.ts
|
|
321
|
+
/**
 * Derives the optional scorecard fields from an optimization state payload.
 *
 * "Before" results are read from
 * `rawState.globalContext.context.executions.results`; "after" results are
 * read from `iterCtx.executionResults.executions`.
 *
 * The payload is not validated anywhere in this function, so every list is
 * normalised defensively: a value that is not an array is treated as empty
 * and `null`/`undefined` entries are dropped instead of raising a TypeError.
 *
 * @param {object|null|undefined} rawState - Raw state object; may be nullish.
 * @param {object|null|undefined} iterCtx - Iteration context; when falsy an
 *   empty object is returned.
 * @returns {object} `{}` when `iterCtx` is falsy, otherwise an object with the
 *   keys `originalScore`, `criteriaScores`, `datasetResults`, `failureModes`,
 *   `mutations` and `evaluationDetails`. Each value is `undefined` when the
 *   payload holds no data for it.
 */
function extractScorecardDetails(rawState, iterCtx) {
  if (!iterCtx) {
    return {};
  }

  // Accept only real arrays and ignore empty slots such as null entries.
  const toList = (value) => (Array.isArray(value) ? value.filter((entry) => entry != null) : []);
  // Empty collections are reported as `undefined`, never as `[]`.
  const nonEmpty = (list) => (list.length > 0 ? list : undefined);

  const beforeResults = toList(rawState?.globalContext?.context?.executions?.results);
  const afterResults = toList(iterCtx.executionResults?.executions);

  // Index the "before" results by id so each "after" result can be paired up.
  const beforeById = new Map();
  for (const result of beforeResults) {
    if (result.id) {
      beforeById.set(result.id, result);
    }
  }

  // Mean score of the "before" results; a result without a score counts as 0.
  let originalScore;
  if (beforeResults.length > 0) {
    const total = beforeResults.reduce((acc, result) => acc + (result.evaluation?.score ?? 0), 0);
    originalScore = total / beforeResults.length;
  }

  const datasetResults = nonEmpty(afterResults.map((result) => {
    const id = result.id || "unknown";
    return {
      id,
      beforeScore: beforeById.get(id)?.evaluation?.score,
      afterScore: result.evaluation?.score ?? 0
    };
  }));

  const criteriaScores = extractCriteriaScores(beforeResults, afterResults);

  // Failure modes are only reported when both the category list and the
  // per-category lookup are present; an empty category list yields `[]`.
  const rawFailureModes = iterCtx.failureModes;
  const failuresByCategory = rawFailureModes?.failures;
  let failureModes;
  if (Array.isArray(rawFailureModes?.categories) && failuresByCategory) {
    failureModes = rawFailureModes.categories.map((category) => ({
      category,
      failures: toList(failuresByCategory[category]).map((failure) => ({
        description: failure.description ?? failure.label,
        summary: failure.summary
      }))
    }));
  }

  const mutations = nonEmpty(toList(iterCtx.mutations).map((mutation) => ({
    label: mutation.label ?? "Unknown mutation",
    status: mutation.status ?? "pending",
    priority: mutation.priority,
    rationale: mutation.target?.rationale
  })));

  const evaluationDetails = nonEmpty(afterResults.map((result) => {
    const evaluation = result.evaluation;
    const metrics = toList(evaluation?.evaluations).map((metric) => {
      const criteria = toList(metric.evaluationChecklist?.items).map((item) => ({
        name: item.evaluationParameter ?? item.criteria ?? "unknown",
        score: item.llmScore ?? 0,
        success: item.success ?? false
      }));
      return {
        name: metric.name ?? "unknown",
        score: metric.score ?? 0,
        success: metric.success ?? false,
        failureMode: metric.failureMode,
        reasoning: metric.reasoning,
        criteria: nonEmpty(criteria)
      };
    });
    return {
      itemId: result.id || "unknown",
      score: evaluation?.score ?? 0,
      success: evaluation?.success ?? false,
      metrics: nonEmpty(metrics)
    };
  }));

  return {
    originalScore,
    criteriaScores,
    datasetResults,
    failureModes,
    mutations,
    evaluationDetails
  };
}
|
|
405
|
+
/**
 * Averages metric scores per metric name across two sets of results.
 *
 * Each result is expected to expose `evaluation.evaluations`, a list of
 * metrics with optional `name` and `score`. A metric without a name is grouped
 * under "unknown" and a metric without a score counts as 0.
 *
 * Nullish result lists are treated as empty, and `null`/`undefined` result or
 * metric entries are skipped instead of raising a TypeError.
 *
 * @param {Iterable<object>|null|undefined} beforeResults - "Before" results.
 * @param {Iterable<object>|null|undefined} afterResults - "After" results.
 * @returns {Array<{name: string, before: (number|undefined), after: (number|undefined)}>|undefined}
 *   One entry per metric name, in first-seen order ("before" names first), or
 *   `undefined` when no metric was found in either input. `before`/`after` is
 *   `undefined` when the metric does not occur on that side.
 */
function extractCriteriaScores(beforeResults, afterResults) {
  // Groups every metric score of one result list by metric name.
  const collectScores = (results) => {
    const scoresByName = new Map();
    for (const result of results ?? []) {
      for (const metric of result?.evaluation?.evaluations ?? []) {
        if (metric == null) {
          continue;
        }
        const name = metric.name ?? "unknown";
        const scores = scoresByName.get(name);
        if (scores) {
          scores.push(metric.score ?? 0);
        } else {
          scoresByName.set(name, [metric.score ?? 0]);
        }
      }
    }
    return scoresByName;
  };

  const beforeScores = collectScores(beforeResults);
  const afterScores = collectScores(afterResults);

  // Map keys iterate in insertion order, so "before" names come first and
  // names seen only in the "after" results follow.
  const metricNames = new Set([...beforeScores.keys(), ...afterScores.keys()]);
  if (metricNames.size === 0) {
    return undefined;
  }

  // Every stored list has at least one score, so the division is safe.
  const average = (scores) => scores.reduce((sum, score) => sum + score, 0) / scores.length;
  const averageFor = (scoresByName, name) =>
    scoresByName.has(name) ? average(scoresByName.get(name)) : undefined;

  return Array.from(metricNames, (name) => ({
    name,
    before: averageFor(beforeScores, name),
    after: averageFor(afterScores, name)
  }));
}
|
|
438
|
+
|
|
320
439
|
// src/lib/sdk-client.ts
|
|
321
440
|
var exports_sdk_client = {};
|
|
322
441
|
__export(exports_sdk_client, {
|
|
@@ -646,9 +765,13 @@ class SDKClientWrapper {
|
|
|
646
765
|
const prompt = await this.getPrompt(String(job.promptId ?? ""));
|
|
647
766
|
const statesRes = await this.request(`/api/optimization/${jobId}/states`).catch(() => ({ states: [] }));
|
|
648
767
|
const latestState = statesRes.states[statesRes.states.length - 1];
|
|
649
|
-
const
|
|
650
|
-
const
|
|
651
|
-
const
|
|
768
|
+
const rawState = latestState?.state ?? {};
|
|
769
|
+
const iterCtx = rawState.iterationContext ?? rawState.current?.context;
|
|
770
|
+
const basePromptObj = iterCtx?.basePrompt;
|
|
771
|
+
const currentPromptObj = iterCtx?.currentPrompt;
|
|
772
|
+
const mutatedPromptText = typeof currentPromptObj?.prompt === "string" ? currentPromptObj.prompt : undefined;
|
|
773
|
+
const originalPromptText = typeof basePromptObj?.prompt === "string" ? basePromptObj.prompt : undefined;
|
|
774
|
+
const extracted = extractScorecardDetails(rawState, iterCtx);
|
|
652
775
|
return {
|
|
653
776
|
job: {
|
|
654
777
|
id: job.id ?? jobId,
|
|
@@ -658,10 +781,16 @@ class SDKClientWrapper {
|
|
|
658
781
|
},
|
|
659
782
|
prompt,
|
|
660
783
|
bestScore: job.bestScore,
|
|
784
|
+
originalScore: extracted.originalScore,
|
|
661
785
|
iterationsCompleted: job.currentIteration,
|
|
662
786
|
scoreProgression: Array.isArray(progress.progression) ? progress.progression.map((p) => typeof p.score === "number" ? p.score : 0) : undefined,
|
|
663
787
|
mutatedPromptText,
|
|
664
|
-
originalPromptText
|
|
788
|
+
originalPromptText,
|
|
789
|
+
criteriaScores: extracted.criteriaScores,
|
|
790
|
+
datasetResults: extracted.datasetResults,
|
|
791
|
+
failureModes: extracted.failureModes,
|
|
792
|
+
mutations: extracted.mutations,
|
|
793
|
+
evaluationDetails: extracted.evaluationDetails
|
|
665
794
|
};
|
|
666
795
|
} catch (error) {
|
|
667
796
|
this.handleError(error);
|
|
@@ -7451,5 +7580,5 @@ program.addCommand(createSkillsCommand());
|
|
|
7451
7580
|
program.addCommand(createUsageCommand());
|
|
7452
7581
|
program.parse();
|
|
7453
7582
|
|
|
7454
|
-
//# debugId=
|
|
7583
|
+
//# debugId=873FF0DF0E58222164756E2164756E21
|
|
7455
7584
|
//# sourceMappingURL=cli.js.map
|