@mutagent/cli 0.1.36 → 0.1.37

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/bin/cli.js CHANGED
@@ -317,6 +317,125 @@ var init_errors = __esm(() => {
317
317
  };
318
318
  });
319
319
 
320
+ // src/lib/scorecard-extraction.ts
321
/**
 * Derives the optional scorecard sections from an optimization state payload.
 *
 * Baseline ("before") executions are read from
 * `rawState.globalContext.context.executions.results`; the iteration's
 * ("after") executions are read from `iterCtx.executionResults.executions`.
 *
 * @param {object|null|undefined} rawState - State object holding the baseline executions.
 * @param {object|null|undefined} iterCtx - Iteration context; when falsy an empty object is returned.
 * @returns {{
 *   originalScore?: number,
 *   criteriaScores?: Array<object>,
 *   datasetResults?: Array<{id: *, beforeScore: *, afterScore: *}>,
 *   failureModes?: Array<{category: *, failures: Array<object>}>,
 *   mutations?: Array<object>,
 *   evaluationDetails?: Array<object>
 * }} Each section is `undefined` when its source data is absent or empty.
 */
function extractScorecardDetails(rawState, iterCtx) {
  if (!iterCtx)
    return {};

  // The payload is untyped JSON, so anything that is not an array is treated
  // as "no entries", and null/undefined entries are dropped rather than
  // dereferenced.
  const asList = (value) => Array.isArray(value) ? value.filter((entry) => entry != null) : [];
  // Only real numbers may enter arithmetic; a string score would otherwise turn
  // the sum below into string concatenation and the average into NaN.
  const asNumber = (value) => Number.isFinite(value) ? value : 0;

  const beforeResults = asList(rawState?.globalContext?.context?.executions?.results);
  const afterResults = asList(iterCtx.executionResults?.executions);

  // Index baseline results by id so each "after" result can find its counterpart.
  const beforeById = new Map();
  for (const result of beforeResults) {
    if (result.id)
      beforeById.set(result.id, result);
  }

  // Mean baseline score; results without a usable score count as 0.
  const originalScore = beforeResults.length > 0
    ? beforeResults.reduce((sum, result) => sum + asNumber(result.evaluation?.score), 0) / beforeResults.length
    : undefined;

  const datasetResults = afterResults.length > 0
    ? afterResults.map((result) => {
        const id = result.id || "unknown";
        return {
          id,
          // Stays undefined when the baseline has no result with this id.
          beforeScore: beforeById.get(id)?.evaluation?.score,
          afterScore: result.evaluation?.score ?? 0
        };
      })
    : undefined;

  const criteriaScores = extractCriteriaScores(beforeResults, afterResults);

  const rawFailureModes = iterCtx.failureModes;
  const failureModes = Array.isArray(rawFailureModes?.categories) && rawFailureModes.failures
    ? rawFailureModes.categories.map((category) => ({
        category,
        failures: asList(rawFailureModes.failures[category]).map((failure) => ({
          description: failure.description ?? failure.label,
          summary: failure.summary
        }))
      }))
    : undefined;

  const rawMutations = asList(iterCtx.mutations);
  const mutations = rawMutations.length > 0
    ? rawMutations.map((mutation) => ({
        label: mutation.label ?? "Unknown mutation",
        status: mutation.status ?? "pending",
        priority: mutation.priority,
        rationale: mutation.target?.rationale
      }))
    : undefined;

  const evaluationDetails = afterResults.length > 0
    ? afterResults.map((result) => {
        const evaluation = result.evaluation;
        const metrics = asList(evaluation?.evaluations).map((metric) => {
          const criteria = asList(metric.evaluationChecklist?.items).map((item) => ({
            name: item.evaluationParameter ?? item.criteria ?? "unknown",
            score: item.llmScore ?? 0,
            success: item.success ?? false
          }));
          return {
            name: metric.name ?? "unknown",
            score: metric.score ?? 0,
            success: metric.success ?? false,
            failureMode: metric.failureMode,
            reasoning: metric.reasoning,
            criteria: criteria.length > 0 ? criteria : undefined
          };
        });
        return {
          itemId: result.id || "unknown",
          score: evaluation?.score ?? 0,
          success: evaluation?.success ?? false,
          metrics: metrics.length > 0 ? metrics : undefined
        };
      })
    : undefined;

  return {
    originalScore,
    criteriaScores,
    datasetResults,
    failureModes,
    mutations,
    evaluationDetails
  };
}
405
/**
 * Averages each named metric's score across the baseline ("before") results
 * and across the iteration ("after") results.
 *
 * Metrics are read from `result.evaluation.evaluations`; a metric without a
 * name is grouped under "unknown", and a metric without a numeric score
 * counts as 0.
 *
 * @param {Array<object>|null|undefined} beforeResults - Baseline execution results.
 * @param {Array<object>|null|undefined} afterResults - Execution results of the iteration.
 * @returns {Array<{name: *, before: (number|undefined), after: (number|undefined)}>|undefined}
 *   One entry per metric name in first-seen order (baseline first), with
 *   `before`/`after` left undefined when that side never reported the metric;
 *   `undefined` when no metric was found at all.
 */
function extractCriteriaScores(beforeResults, afterResults) {
  // Groups every metric score of one result set by metric name. Non-array
  // inputs and null entries are skipped instead of throwing.
  const collectScores = (results) => {
    const scoresByMetric = new Map();
    if (!Array.isArray(results))
      return scoresByMetric;
    for (const result of results) {
      const evaluations = result?.evaluation?.evaluations;
      if (!Array.isArray(evaluations))
        continue;
      for (const metric of evaluations) {
        if (metric == null)
          continue;
        const name = metric.name ?? "unknown";
        // Only real numbers may be averaged; a string score would turn the sum
        // into string concatenation and the average into NaN.
        const score = Number.isFinite(metric.score) ? metric.score : 0;
        const bucket = scoresByMetric.get(name);
        if (bucket)
          bucket.push(score);
        else
          scoresByMetric.set(name, [score]);
      }
    }
    return scoresByMetric;
  };

  const beforeScores = collectScores(beforeResults);
  const afterScores = collectScores(afterResults);
  // A Set keeps insertion order: baseline names first, then names only seen afterwards.
  const metricNames = new Set([...beforeScores.keys(), ...afterScores.keys()]);
  if (metricNames.size === 0)
    return undefined;

  // Buckets are never empty (they are created with their first score), so the
  // division is safe; an absent bucket means the side never saw the metric.
  const average = (scores) => scores === undefined
    ? undefined
    : scores.reduce((sum, score) => sum + score, 0) / scores.length;

  return Array.from(metricNames, (name) => ({
    name,
    before: average(beforeScores.get(name)),
    after: average(afterScores.get(name))
  }));
}
438
+
320
439
  // src/lib/sdk-client.ts
321
440
  var exports_sdk_client = {};
322
441
  __export(exports_sdk_client, {
@@ -646,9 +765,13 @@ class SDKClientWrapper {
646
765
  const prompt = await this.getPrompt(String(job.promptId ?? ""));
647
766
  const statesRes = await this.request(`/api/optimization/${jobId}/states`).catch(() => ({ states: [] }));
648
767
  const latestState = statesRes.states[statesRes.states.length - 1];
649
- const iterCtx = latestState?.state.iterationContext;
650
- const mutatedPromptText = iterCtx?.currentPrompt?.prompt;
651
- const originalPromptText = iterCtx?.basePrompt?.prompt;
768
+ const rawState = latestState?.state ?? {};
769
+ const iterCtx = rawState.iterationContext ?? rawState.current?.context;
770
+ const basePromptObj = iterCtx?.basePrompt;
771
+ const currentPromptObj = iterCtx?.currentPrompt;
772
+ const mutatedPromptText = typeof currentPromptObj?.prompt === "string" ? currentPromptObj.prompt : undefined;
773
+ const originalPromptText = typeof basePromptObj?.prompt === "string" ? basePromptObj.prompt : undefined;
774
+ const extracted = extractScorecardDetails(rawState, iterCtx);
652
775
  return {
653
776
  job: {
654
777
  id: job.id ?? jobId,
@@ -658,10 +781,16 @@ class SDKClientWrapper {
658
781
  },
659
782
  prompt,
660
783
  bestScore: job.bestScore,
784
+ originalScore: extracted.originalScore,
661
785
  iterationsCompleted: job.currentIteration,
662
786
  scoreProgression: Array.isArray(progress.progression) ? progress.progression.map((p) => typeof p.score === "number" ? p.score : 0) : undefined,
663
787
  mutatedPromptText,
664
- originalPromptText
788
+ originalPromptText,
789
+ criteriaScores: extracted.criteriaScores,
790
+ datasetResults: extracted.datasetResults,
791
+ failureModes: extracted.failureModes,
792
+ mutations: extracted.mutations,
793
+ evaluationDetails: extracted.evaluationDetails
665
794
  };
666
795
  } catch (error) {
667
796
  this.handleError(error);
@@ -7451,5 +7580,5 @@ program.addCommand(createSkillsCommand());
7451
7580
  program.addCommand(createUsageCommand());
7452
7581
  program.parse();
7453
7582
 
7454
- //# debugId=B35CD49159FCE51364756E2164756E21
7583
+ //# debugId=873FF0DF0E58222164756E2164756E21
7455
7584
  //# sourceMappingURL=cli.js.map