clementine-agent 1.18.162 → 1.18.164
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent/self-improve.js +14 -0
- package/dist/cli/dashboard.js +133 -2
- package/dist/gateway/failure-clustering.d.ts +94 -0
- package/dist/gateway/failure-clustering.js +190 -0
- package/dist/memory/skill-quality.d.ts +96 -0
- package/dist/memory/skill-quality.js +231 -0
- package/package.json +1 -1
|
@@ -19,6 +19,7 @@ import { listAllGoals } from '../tools/shared.js';
|
|
|
19
19
|
import { MemoryStore } from '../memory/store.js';
|
|
20
20
|
import { ANTHROPIC_SKILL_NAME_PATTERN } from './skill-store.js';
|
|
21
21
|
import { recordApprovalSignal, formatApprovalSignalsForHypothesizer } from './approval-signals.js';
|
|
22
|
+
import { clusterBrokenJobs, formatClustersForHypothesizer } from '../gateway/failure-clustering.js';
|
|
22
23
|
const logger = pino({ name: 'clementine.self-improve' });
|
|
23
24
|
// ── Defaults ─────────────────────────────────────────────────────────
|
|
24
25
|
const DEFAULT_CONFIG = {
|
|
@@ -1102,6 +1103,18 @@ export class SelfImproveLoop {
|
|
|
1102
1103
|
// owner has approved, away from those they've denied. Empty string for
|
|
1103
1104
|
// fresh installs, which keeps the prompt clean.
|
|
1104
1105
|
const approvalSignalsText = formatApprovalSignalsForHypothesizer();
|
|
1106
|
+
// Cross-job failure clusters (1.18.163) — when ≥3 jobs hit the same
|
|
1107
|
+
// normalized error pattern in 48h, surface ONE cluster summary so
|
|
1108
|
+
// the hypothesizer proposes a root-cause fix instead of N per-job
|
|
1109
|
+
// patches. Empty string when no cluster meets the threshold.
|
|
1110
|
+
let failureClusterText = '';
|
|
1111
|
+
try {
|
|
1112
|
+
const clusters = clusterBrokenJobs();
|
|
1113
|
+
failureClusterText = formatClustersForHypothesizer(clusters);
|
|
1114
|
+
}
|
|
1115
|
+
catch (err) {
|
|
1116
|
+
logger.warn({ err }, 'Failed to compute failure clusters — proceeding without them');
|
|
1117
|
+
}
|
|
1105
1118
|
// ── Step 1: Analysis — identify top opportunities from metrics (no config dumps) ──
|
|
1106
1119
|
const analysisPrompt = `You are Clementine's self-improvement strategist. Analyze the performance data below and identify the top 3 improvement opportunities.\n\n` +
|
|
1107
1120
|
`## Recent Performance Data (last 7 days)\n` +
|
|
@@ -1119,6 +1132,7 @@ export class SelfImproveLoop {
|
|
|
1119
1132
|
diversityConstraint +
|
|
1120
1133
|
agentFocusText +
|
|
1121
1134
|
soulCandidatesText +
|
|
1135
|
+
(failureClusterText ? `\n${failureClusterText}` : '') +
|
|
1122
1136
|
(approvalSignalsText ? `\n${approvalSignalsText}` : '') +
|
|
1123
1137
|
`\n## Instructions\n` +
|
|
1124
1138
|
`Propose **1-3 concrete, high-impact improvements** the owner should review today — no fewer (aim for at least one actionable suggestion when data warrants it), no more (the owner reads each proposal manually and you'll overwhelm them). Rank by expected impact; drop anything below "solid idea".\n\n` +
|
package/dist/cli/dashboard.js
CHANGED
|
@@ -4546,6 +4546,38 @@ export async function cmdDashboard(opts) {
|
|
|
4546
4546
|
res.status(500).json({ ok: false, error: String(err) });
|
|
4547
4547
|
}
|
|
4548
4548
|
});
|
|
4549
|
+
// 1.18.164 — skill quality scoring per Anthropic metrics. Computed on
|
|
4550
|
+
// demand from the cron run log; no schema, no persistence. Bulk
|
|
4551
|
+
// endpoint for the Skills page table; per-skill endpoint for the
|
|
4552
|
+
// detail pane. Both registered BEFORE /api/skills/:name to win route
|
|
4553
|
+
// precedence (Express matches routes in registration order, so the literal 'quality' path must come first or :name would capture it).
|
|
4554
|
+
app.get('/api/skills/quality', async (req, res) => {
    // Bulk quality scores for every skill. Accepts an optional
    // ?windowDays=N query parameter, clamped to the range 1..365.
    try {
        const { computeAllSkillQuality } = await import('../memory/skill-quality.js');
        const options = {};
        const requestedWindow = req.query.windowDays;
        if (requestedWindow) {
            const clampedWindow = Math.max(1, Math.min(365, Number(requestedWindow)));
            // A non-numeric value clamps to NaN; in that case no window is
            // passed and the scorer is called with an empty options object.
            if (!Number.isNaN(clampedWindow)) {
                options.windowDays = clampedWindow;
            }
        }
        const scores = computeAllSkillQuality(options);
        res.json({ ok: true, count: scores.length, scores });
    }
    catch (err) {
        res.status(500).json({ ok: false, error: String(err) });
    }
});
|
|
4565
|
+
app.get('/api/skills/:name/quality', async (req, res) => {
    // Quality score for a single skill. Accepts an optional
    // ?windowDays=N query parameter, clamped to the range 1..365.
    try {
        const skillName = req.params.name;
        if (!skillName) {
            res.status(400).json({ ok: false, error: 'name required' });
            return;
        }
        const { computeSkillQuality } = await import('../memory/skill-quality.js');
        const options = {};
        const requestedWindow = req.query.windowDays;
        if (requestedWindow) {
            const clampedWindow = Math.max(1, Math.min(365, Number(requestedWindow)));
            // A non-numeric value clamps to NaN; in that case no window is
            // passed and the scorer is called with an empty options object.
            if (!Number.isNaN(clampedWindow)) {
                options.windowDays = clampedWindow;
            }
        }
        const score = computeSkillQuality(skillName, options);
        res.json({ ok: true, score });
    }
    catch (err) {
        res.status(500).json({ ok: false, error: String(err) });
    }
});
|
|
4549
4581
|
app.get('/api/skills/:name', async (req, res) => {
|
|
4550
4582
|
try {
|
|
4551
4583
|
const name = req.params.name;
|
|
@@ -11407,7 +11439,7 @@ If the tool returns nothing or errors, return an empty array \`[]\`.`,
|
|
|
11407
11439
|
res.status(500).json({ error: String(err) });
|
|
11408
11440
|
}
|
|
11409
11441
|
});
|
|
11410
|
-
app.get('/api/self-improve', (_req, res) => {
|
|
11442
|
+
app.get('/api/self-improve', async (_req, res) => {
|
|
11411
11443
|
const siDir = path.join(BASE_DIR, 'self-improve');
|
|
11412
11444
|
const stateFile = path.join(siDir, 'state.json');
|
|
11413
11445
|
const logFile = path.join(siDir, 'experiment-log.jsonl');
|
|
@@ -11472,7 +11504,18 @@ If the tool returns nothing or errors, return an empty array \`[]\`.`,
|
|
|
11472
11504
|
}
|
|
11473
11505
|
catch { /* ignore */ }
|
|
11474
11506
|
}
|
|
11475
|
-
|
|
11507
|
+
// 1.18.163 — cross-job failure clusters (≥3 jobs hitting the same
|
|
11508
|
+
// normalized error pattern in 48h). Computed on demand from
|
|
11509
|
+
// computeBrokenJobs(); no schema, no persistence. The Self-Improve
|
|
11510
|
+
// tab surfaces this so the owner sees "5 jobs hit X — propose one
|
|
11511
|
+
// root-cause fix" instead of N per-job rows.
|
|
11512
|
+
let clusters = [];
|
|
11513
|
+
try {
|
|
11514
|
+
const { clusterBrokenJobs } = await import('../gateway/failure-clustering.js');
|
|
11515
|
+
clusters = clusterBrokenJobs();
|
|
11516
|
+
}
|
|
11517
|
+
catch { /* non-fatal — empty clusters list */ }
|
|
11518
|
+
res.json({ state, experiments, pending, triggers, verifications, clusters });
|
|
11476
11519
|
});
|
|
11477
11520
|
app.post('/api/self-improve/run', async (_req, res) => {
|
|
11478
11521
|
try {
|
|
@@ -19940,6 +19983,13 @@ if('serviceWorker' in navigator){navigator.serviceWorker.getRegistrations().then
|
|
|
19940
19983
|
<div class="empty-state" style="padding:14px">No active failures — nothing has tripped 3+ consecutive errors.</div>
|
|
19941
19984
|
</div>
|
|
19942
19985
|
</div>
|
|
19986
|
+
<div class="card" style="margin-top:16px" id="si-clusters-card" hidden>
|
|
19987
|
+
<div class="card-header" style="display:flex;align-items:center;justify-content:space-between">
|
|
19988
|
+
<span>Cross-job failure clusters <span style="font-weight:normal;font-size:11px;color:var(--text-muted)">· 3+ jobs hitting the same error pattern (last 48h)</span></span>
|
|
19989
|
+
<span class="tab-badge" id="tab-si-clusters" style="background:#a855f7;color:#fff">0</span>
|
|
19990
|
+
</div>
|
|
19991
|
+
<div class="card-body" id="si-clusters-list" style="padding:0"></div>
|
|
19992
|
+
</div>
|
|
19943
19993
|
<div class="card" style="margin-top:16px">
|
|
19944
19994
|
<div class="card-header" style="display:flex;align-items:center;justify-content:space-between">
|
|
19945
19995
|
<span>Verifying fixes</span>
|
|
@@ -29603,11 +29653,53 @@ async function showSkillDetail(name) {
|
|
|
29603
29653
|
detailEl.innerHTML = renderSkillDetail(d.skill);
|
|
29604
29654
|
if (typeof loadSkillSuppressionState === 'function') loadSkillSuppressionState(name);
|
|
29605
29655
|
if (typeof loadSkillScheduleState === 'function') loadSkillScheduleState(name);
|
|
29656
|
+
if (typeof loadSkillQualityState === 'function') loadSkillQualityState(name);
|
|
29606
29657
|
} catch (e) {
|
|
29607
29658
|
detailEl.innerHTML = '<div style="padding:24px;color:var(--red);font-size:12px">Error: ' + esc(String(e)) + '</div>';
|
|
29608
29659
|
}
|
|
29609
29660
|
}
|
|
29610
29661
|
|
|
29662
|
+
// 1.18.164 — fetch + render the skill quality scorecard. Best-effort:
// does nothing if the container is absent. If the fetch fails or the
// response carries no score, the "Loading quality…" placeholder is
// replaced with a short "unavailable" note so the pane never looks
// stuck. We hit the per-skill endpoint here (fast, single skill) instead
// of the bulk one — the Skills page already has its own bulk render path.
async function loadSkillQualityState(skillName) {
  // The id must mirror the placeholder id emitted by renderSkillDetail().
  var container = document.getElementById('skill-quality-' + encodeURIComponent(skillName));
  if (!container) return;
  // Swap the loading placeholder for a neutral message on any failure.
  var showUnavailable = function() { container.textContent = 'Quality data unavailable'; };
  try {
    var r = await apiFetch('/api/skills/' + encodeURIComponent(skillName) + '/quality');
    var d = await r.json();
    if (!r.ok || d.ok === false || !d.score) {
      showUnavailable();
      return;
    }
    var s = d.score;
    var gradeColors = { good: '#10b981', underperforming: '#ef4444', stale: '#f59e0b', 'no-data': '#6b7280' };
    var gradeLabel = (s.grade || 'no-data').replace(/-/g, ' ');
    var color = gradeColors[s.grade] || '#6b7280';
    // Formatters render a dash for null/undefined so empty metrics stay readable.
    var pct = function(v) { return v === null || v === undefined ? '—' : (v * 100).toFixed(0) + '%'; };
    // Sub-second averages are rounded so fractional values do not print long decimals.
    var ms = function(v) { return v === null || v === undefined ? '—' : (v < 1000 ? Math.round(v) + 'ms' : (v / 1000).toFixed(1) + 's'); };
    var usd = function(v) { return v === null || v === undefined ? '—' : '$' + v.toFixed(4); };
    var rows = [
      ['Total runs', s.totalRuns],
      ['Pinned / auto', s.pinnedRuns + ' / ' + s.autoRuns],
      ['Success rate', pct(s.successRate)],
      ['Trigger accuracy', s.triggerAccuracy === null ? '— (no auto-matched runs)' : pct(s.triggerAccuracy)],
      ['Avg duration', ms(s.avgDurationMs)],
      ['Avg cost', usd(s.avgCostUsd)],
    ];
    // Every server-provided value goes through esc() before reaching innerHTML.
    container.innerHTML =
      '<div style="font-weight:600;margin-bottom:6px;display:flex;align-items:center;gap:8px">' +
        '<span>Quality (' + esc(String(s.windowDays)) + 'd)</span>' +
        '<span style="font-size:11px;padding:2px 8px;border-radius:10px;background:' + color + ';color:#fff;text-transform:uppercase">' + esc(gradeLabel) + '</span>' +
      '</div>' +
      '<div style="font-size:12px;color:var(--text-muted);margin-bottom:8px">' + esc(s.gradeReason || '') + '</div>' +
      '<table style="width:100%;font-size:12px;border-collapse:collapse">' +
        rows.map(function(row) {
          return '<tr><td style="padding:3px 0;color:var(--text-muted);width:50%">' + esc(row[0]) + '</td>' +
            '<td style="padding:3px 0;text-align:right;font-family:ui-monospace,monospace">' + esc(String(row[1])) + '</td></tr>';
        }).join('') +
      '</table>';
  } catch (e) {
    // Best-effort: a failed fetch or render must not break the detail pane.
    showUnavailable();
  }
}
|
|
29702
|
+
|
|
29611
29703
|
// 1.18.127 — fetch the current suppression state and wire the checkbox.
|
|
29612
29704
|
// Cached per call; the file is small enough that re-fetching on every
|
|
29613
29705
|
// detail open is fine (and ensures consistency if the user just toggled
|
|
@@ -29965,6 +30057,14 @@ function renderSkillDetail(s) {
|
|
|
29965
30057
|
html += '</div>';
|
|
29966
30058
|
html += '</div>';
|
|
29967
30059
|
|
|
30060
|
+
// 1.18.164 — Quality scorecard (per Anthropic skill metrics).
|
|
30061
|
+
// Lazy-loaded: rendered as a placeholder, populated by
|
|
30062
|
+
// loadSkillQualityState() right after the detail pane mounts. The
|
|
30063
|
+
// grade chip (good / underperforming / stale / no-data) tells the
|
|
30064
|
+
// owner at a glance whether this skill is pulling its weight; the
|
|
30065
|
+
// table beneath has the supporting numbers for drilling in.
|
|
30066
|
+
html += '<div id="skill-quality-' + encodeURIComponent(fm.name) + '" style="margin-top:10px;padding:12px 14px;background:var(--bg-secondary);border:1px solid var(--border);border-radius:6px;font-size:12px;color:var(--text-muted)">Loading quality…</div>';
|
|
30067
|
+
|
|
29968
30068
|
// ── 2. Validation warnings (if any)
|
|
29969
30069
|
if (Array.isArray(s.validation) && s.validation.length > 0) {
|
|
29970
30070
|
var errors = s.validation.filter(function(v) { return v.severity === 'error'; });
|
|
@@ -40500,6 +40600,7 @@ async function refreshSelfImprove() {
|
|
|
40500
40600
|
const pending = d.pending || [];
|
|
40501
40601
|
const triggers = d.triggers || [];
|
|
40502
40602
|
const verifications = d.verifications || [];
|
|
40603
|
+
const clusters = d.clusters || [];
|
|
40503
40604
|
|
|
40504
40605
|
// Update tab badge — combine human-attention queues so the sidebar
|
|
40505
40606
|
// count reflects "things that need you to look at", not just proposals.
|
|
@@ -40537,6 +40638,36 @@ async function refreshSelfImprove() {
|
|
|
40537
40638
|
}
|
|
40538
40639
|
}
|
|
40539
40640
|
|
|
40641
|
+
// 1.18.163 — cross-job failure clusters (≥3 jobs hitting the same
|
|
40642
|
+
// normalized pattern). Hidden when the list is empty so the card
|
|
40643
|
+
// doesn't take up space on a healthy install.
|
|
40644
|
+
const clustersCard = document.getElementById('si-clusters-card');
|
|
40645
|
+
const clustersList = document.getElementById('si-clusters-list');
|
|
40646
|
+
const clustersBadge = document.getElementById('tab-si-clusters');
|
|
40647
|
+
if (clustersCard && clustersList) {
|
|
40648
|
+
if (clusters.length === 0) {
|
|
40649
|
+
clustersCard.hidden = true;
|
|
40650
|
+
} else {
|
|
40651
|
+
clustersCard.hidden = false;
|
|
40652
|
+
if (clustersBadge) clustersBadge.textContent = clusters.length;
|
|
40653
|
+
clustersList.innerHTML = clusters.map(function(c) {
|
|
40654
|
+
var rep = String(c.representative || '').slice(0, 200);
|
|
40655
|
+
var jobsList = (c.jobs || []).slice(0, 5).map(function(j) {
|
|
40656
|
+
return '<span class="badge" style="margin-right:4px;font-size:11px">' + esc(j.jobName) + ' ×' + (j.errorCount48h || 0) + '</span>';
|
|
40657
|
+
}).join('');
|
|
40658
|
+
var more = (c.jobs && c.jobs.length > 5) ? '<span style="font-size:11px;color:var(--text-muted)">+' + (c.jobs.length - 5) + ' more</span>' : '';
|
|
40659
|
+
return '<div style="padding:12px;border-bottom:1px solid var(--border)">' +
|
|
40660
|
+
'<div style="display:flex;justify-content:space-between;align-items:baseline;gap:8px;flex-wrap:wrap">' +
|
|
40661
|
+
'<div><strong>' + (c.jobs ? c.jobs.length : 0) + ' jobs</strong> · ' +
|
|
40662
|
+
'<span style="font-size:11px;color:var(--text-muted)">' + (c.totalErrors || 0) + ' total errors (48h)</span></div>' +
|
|
40663
|
+
'</div>' +
|
|
40664
|
+
'<div style="margin-top:6px;font-size:12px;color:var(--text-secondary);font-family:ui-monospace,monospace">' + esc(rep) + '</div>' +
|
|
40665
|
+
'<div style="margin-top:8px">' + jobsList + ' ' + more + '</div>' +
|
|
40666
|
+
'</div>';
|
|
40667
|
+
}).join('');
|
|
40668
|
+
}
|
|
40669
|
+
}
|
|
40670
|
+
|
|
40540
40671
|
// Pending fix verifications — auto-fixes soaking through the 3-run window.
|
|
40541
40672
|
const verifyEl = document.getElementById('si-verifying-list');
|
|
40542
40673
|
if (verifyEl) {
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Cross-job failure clustering (1.18.163).
|
|
3
|
+
*
|
|
4
|
+
* Today the failure pipeline is per-job:
|
|
5
|
+
* broken-job(jobName) → classifyFailure(lastErrors) → 1 fix proposal
|
|
6
|
+
*
|
|
7
|
+
* That means when 5 different cron jobs all hit the same root cause
|
|
8
|
+
* (e.g. all 5 fail with "Prompt is too long"), the system generates
|
|
9
|
+
* 5 isolated patches instead of 1 root-cause fix. The owner sees
|
|
10
|
+
* 5 separate proposals in the Self-Improve tab and either approves all
|
|
11
|
+
* 5 (busywork) or denies them (and the underlying issue persists).
|
|
12
|
+
*
|
|
13
|
+
* This module groups recent broken jobs by *normalized error pattern*.
|
|
14
|
+
* When ≥3 distinct jobs hit the same cluster, the owner gets ONE
|
|
15
|
+
* "5 jobs all hit X — propose Y for all of them" suggestion instead of
|
|
16
|
+
* N separate ones.
|
|
17
|
+
*
|
|
18
|
+
* This is purely a *suggestion / presentation* layer — clusters are
|
|
19
|
+
* surfaced as a hint to the hypothesizer + dashboard. The existing
|
|
20
|
+
* per-job `failure-fix-consumer` continues to handle individual patches
|
|
21
|
+
* unchanged. Clustering is additive observability, not a replacement
|
|
22
|
+
* for per-job fixes.
|
|
23
|
+
*
|
|
24
|
+
* Reads from the existing `computeBrokenJobs()` source — no new schema,
|
|
25
|
+
* no new persistence, computed on demand.
|
|
26
|
+
*/
|
|
27
|
+
import type { BrokenJob } from './failure-monitor.js';
|
|
28
|
+
/**
|
|
29
|
+
* Minimum distinct jobs required to form a cluster. Below this we don't
|
|
30
|
+
* bother — a single repeated error is just a per-job problem.
|
|
31
|
+
*
|
|
32
|
+
* 3 is conservative: 2 looks coincidental, 3 is "this is a systemic
|
|
33
|
+
* thing." Tunable if we get noise.
|
|
34
|
+
*/
|
|
35
|
+
export declare const MIN_CLUSTER_SIZE = 3;
|
|
36
|
+
/**
|
|
37
|
+
* Normalize an error message into a clustering key.
|
|
38
|
+
*
|
|
39
|
+
* Goals:
|
|
40
|
+
* - "Prompt is too long (12345 tokens)" and "Prompt is too long (45678
|
|
41
|
+
* tokens)" should collapse to the same key.
|
|
42
|
+
* - Job-specific tokens (UUIDs, timestamps, paths with the job name)
|
|
43
|
+
* should be stripped.
|
|
44
|
+
* - The result should still be human-readable (we surface it in the UI).
|
|
45
|
+
*
|
|
46
|
+
* Strategy:
|
|
47
|
+
* 1. Lowercase + trim (whitespace runs are collapsed last, just before truncation)
|
|
48
|
+
* 2. Strip ISO timestamps + UNIX epochs
|
|
49
|
+
* 3. Strip UUIDs and long hex tokens
|
|
50
|
+
* 4. Strip parenthesized numbers ("(12345 tokens)" → "(N tokens)")
|
|
51
|
+
* 5. Strip absolute paths
|
|
52
|
+
* 6. Truncate to ERROR_NORMALIZE_LEN
|
|
53
|
+
*/
|
|
54
|
+
export declare function normalizeErrorMessage(raw: string): string;
|
|
55
|
+
export interface FailureCluster {
|
|
56
|
+
/** The normalized pattern key. Stable across jobs/runs. */
|
|
57
|
+
pattern: string;
|
|
58
|
+
/** A representative human-readable error message (one of the original
|
|
59
|
+
* uncleaned strings, picked by frequency). */
|
|
60
|
+
representative: string;
|
|
61
|
+
/** Distinct jobs hitting this cluster, sorted by error count desc. */
|
|
62
|
+
jobs: Array<{
|
|
63
|
+
jobName: string;
|
|
64
|
+
agentSlug?: string;
|
|
65
|
+
errorCount48h: number;
|
|
66
|
+
lastErrorAt: string | null;
|
|
67
|
+
}>;
|
|
68
|
+
/** Total errors across all jobs in the cluster (last 48h). */
|
|
69
|
+
totalErrors: number;
|
|
70
|
+
}
|
|
71
|
+
/**
|
|
72
|
+
* Group the current broken jobs by normalized error pattern. Only
|
|
73
|
+
* returns clusters with ≥ MIN_CLUSTER_SIZE distinct jobs. Returns
|
|
74
|
+
* largest clusters first (by distinct-job count, then total error
|
|
75
|
+
* count).
|
|
76
|
+
*
|
|
77
|
+
* Each broken job contributes UP TO 3 patterns (its `lastErrors[]`).
|
|
78
|
+
* A job that hits two distinct patterns counts toward both clusters
|
|
79
|
+
* — that's by design, since a job with two root causes really does
|
|
80
|
+
* need both fixes.
|
|
81
|
+
*/
|
|
82
|
+
export declare function clusterBrokenJobs(jobs?: BrokenJob[]): FailureCluster[];
|
|
83
|
+
/**
|
|
84
|
+
* Render a cluster summary for the hypothesizer prompt block. Empty
|
|
85
|
+
* string when no clusters meet the threshold.
|
|
86
|
+
*
|
|
87
|
+
* Format:
|
|
88
|
+
* ### Cross-job failure clusters (last 48h)
|
|
89
|
+
* - "Prompt is too long (N tokens)" — 5 jobs: insight-check, outcome-grader, route-classifier, ...
|
|
90
|
+
* - "Reached maximum number of turns (N)" — 3 jobs: ...
|
|
91
|
+
* Bias one root-cause proposal toward the largest cluster instead of N per-job ones.
|
|
92
|
+
*/
|
|
93
|
+
export declare function formatClustersForHypothesizer(clusters: FailureCluster[]): string;
|
|
94
|
+
//# sourceMappingURL=failure-clustering.d.ts.map
|
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Cross-job failure clustering (1.18.163).
|
|
3
|
+
*
|
|
4
|
+
* Today the failure pipeline is per-job:
|
|
5
|
+
* broken-job(jobName) → classifyFailure(lastErrors) → 1 fix proposal
|
|
6
|
+
*
|
|
7
|
+
* That means when 5 different cron jobs all hit the same root cause
|
|
8
|
+
* (e.g. all 5 fail with "Prompt is too long"), the system generates
|
|
9
|
+
* 5 isolated patches instead of 1 root-cause fix. The owner sees
|
|
10
|
+
* 5 separate proposals in the Self-Improve tab and either approves all
|
|
11
|
+
* 5 (busywork) or denies them (and the underlying issue persists).
|
|
12
|
+
*
|
|
13
|
+
* This module groups recent broken jobs by *normalized error pattern*.
|
|
14
|
+
* When ≥3 distinct jobs hit the same cluster, the owner gets ONE
|
|
15
|
+
* "5 jobs all hit X — propose Y for all of them" suggestion instead of
|
|
16
|
+
* N separate ones.
|
|
17
|
+
*
|
|
18
|
+
* This is purely a *suggestion / presentation* layer — clusters are
|
|
19
|
+
* surfaced as a hint to the hypothesizer + dashboard. The existing
|
|
20
|
+
* per-job `failure-fix-consumer` continues to handle individual patches
|
|
21
|
+
* unchanged. Clustering is additive observability, not a replacement
|
|
22
|
+
* for per-job fixes.
|
|
23
|
+
*
|
|
24
|
+
* Reads from the existing `computeBrokenJobs()` source — no new schema,
|
|
25
|
+
* no new persistence, computed on demand.
|
|
26
|
+
*/
|
|
27
|
+
import pino from 'pino';
|
|
28
|
+
import { computeBrokenJobs } from './failure-monitor.js';
|
|
29
|
+
const logger = pino({ name: 'clementine.failure-clustering' });
|
|
30
|
+
// ── Tunables ─────────────────────────────────────────────────────────
|
|
31
|
+
/**
|
|
32
|
+
* Minimum distinct jobs required to form a cluster. Below this we don't
|
|
33
|
+
* bother — a single repeated error is just a per-job problem.
|
|
34
|
+
*
|
|
35
|
+
* 3 is conservative: 2 looks coincidental, 3 is "this is a systemic
|
|
36
|
+
* thing." Tunable if we get noise.
|
|
37
|
+
*/
|
|
38
|
+
export const MIN_CLUSTER_SIZE = 3;
|
|
39
|
+
/** Max chars of the normalized key we keep. The important signal is in
 *  the first ~200 chars; longer suffixes are usually stack traces or
 *  per-call IDs that destroy clustering. */
const ERROR_NORMALIZE_LEN = 200;
// ── Normalization ────────────────────────────────────────────────────
/**
 * Normalize an error message into a clustering key.
 *
 * Goals:
 *   - "Prompt is too long (12345 tokens)" and "Prompt is too long (45678
 *     tokens)" should collapse to the same key.
 *   - Job-specific tokens (UUIDs, timestamps, paths with the job name)
 *     should be replaced with placeholders.
 *   - The result should still be human-readable (we surface it in the UI).
 *
 * Strategy (in execution order):
 *   1. Lowercase + trim
 *   2. Replace ISO timestamps and 13-/10-digit UNIX epochs with <ts>
 *   3. Replace UUIDs with <uuid> and long hex tokens (16+ chars) with <hex>
 *   4. Replace parenthesized numbers ("(12345 tokens)" → "(N tokens)")
 *   5. Reduce slash-separated paths to <path>/basename
 *   6. Replace remaining standalone numbers of 4+ digits with <N>
 *   7. Collapse whitespace, then truncate to ERROR_NORMALIZE_LEN
 *
 * @param {string} raw - the error text. Non-string truthy values are
 *   tolerated: an object with a string `message` (e.g. an Error)
 *   contributes that message, anything else is stringified.
 * @returns {string} the normalized key, or '' for falsy input.
 */
export function normalizeErrorMessage(raw) {
    if (!raw)
        return '';
    // Coerce defensively so one malformed log entry cannot throw and
    // abort clustering for every other job.
    let text = raw;
    if (typeof text !== 'string') {
        text = typeof text.message === 'string' ? text.message : String(text);
    }
    let s = text.toLowerCase().trim();
    // ISO timestamps: 2026-05-10T14:23:00.000Z (with optional millis/tz).
    // The input is already lowercased, hence the lowercase 't' and 'z'.
    s = s.replace(/\d{4}-\d{2}-\d{2}t\d{2}:\d{2}:\d{2}(\.\d+)?(z|[+-]\d{2}:?\d{2})?/g, '<ts>');
    // Unix epoch ms (13-digit) + sec (10-digit) — must come BEFORE plain numbers
    s = s.replace(/\b\d{13}\b/g, '<ts>');
    s = s.replace(/\b\d{10}\b/g, '<ts>');
    // UUIDs
    s = s.replace(/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/g, '<uuid>');
    // Long hex (16+ chars, like commit SHAs / session ids)
    s = s.replace(/\b[0-9a-f]{16,}\b/g, '<hex>');
    // Parenthesized numbers: (12345) → (N) ; (12345 tokens) → (N tokens)
    s = s.replace(/\(\s*\d[\d,_.]*\s*([a-z]*)\s*\)/g, (_m, suffix) => suffix ? `(N ${suffix})` : '(N)');
    // Absolute paths — keep just the basename
    s = s.replace(/\/[\w./-]+\/([\w.-]+)/g, '<path>/$1');
    // Generic standalone large numbers
    s = s.replace(/\b\d{4,}\b/g, '<N>');
    // Collapse whitespace
    s = s.replace(/\s+/g, ' ').trim();
    return s.slice(0, ERROR_NORMALIZE_LEN);
}
|
|
85
|
+
// ── Clusterer ────────────────────────────────────────────────────────
|
|
86
|
+
/**
 * Bucket broken jobs by their normalized error pattern and return the
 * buckets that span at least MIN_CLUSTER_SIZE distinct jobs.
 *
 * Ordering: most distinct jobs first, then highest total error count,
 * then pattern (ascending) as the final tie-break.
 *
 * Every entry in a job's `lastErrors` is normalized. A job joins each
 * distinct pattern it produced, but at most once per pattern. A job that
 * lands in two clusters contributes its full `errorCount48h` to both —
 * that is intentional, since two root causes need two fixes.
 *
 * @param {BrokenJob[]} [jobs] - jobs to cluster; when null/undefined the
 *   list is obtained from computeBrokenJobs().
 * @returns {FailureCluster[]} qualifying clusters, largest first.
 */
export function clusterBrokenJobs(jobs) {
    const brokenJobs = jobs ?? computeBrokenJobs();
    if (brokenJobs.length === 0) {
        return [];
    }
    // pattern → { representative, rawCounts (raw text → tally), jobs (jobName → entry) }
    const byPattern = new Map();
    for (const job of brokenJobs) {
        const patternsSeen = new Set();
        for (const rawMessage of job.lastErrors ?? []) {
            const pattern = normalizeErrorMessage(rawMessage);
            // Skip empty keys, and never count one job twice for the same
            // pattern even when lastErrors holds near-identical messages.
            if (!pattern || patternsSeen.has(pattern)) {
                continue;
            }
            patternsSeen.add(pattern);
            if (!byPattern.has(pattern)) {
                byPattern.set(pattern, { representative: rawMessage, rawCounts: new Map(), jobs: new Map() });
            }
            const entry = byPattern.get(pattern);
            // Track the most common raw spelling as the representative.
            const tally = (entry.rawCounts.get(rawMessage) ?? 0) + 1;
            entry.rawCounts.set(rawMessage, tally);
            const leaderTally = entry.rawCounts.get(entry.representative) ?? 0;
            if (tally > leaderTally) {
                entry.representative = rawMessage;
            }
            const known = entry.jobs.get(job.jobName);
            if (known) {
                known.errorCount48h += job.errorCount48h;
                continue;
            }
            const member = { jobName: job.jobName };
            if (job.agentSlug) {
                member.agentSlug = job.agentSlug;
            }
            member.errorCount48h = job.errorCount48h;
            member.lastErrorAt = job.lastErrorAt;
            entry.jobs.set(job.jobName, member);
        }
    }
    const clusters = [];
    byPattern.forEach((entry, pattern) => {
        if (entry.jobs.size < MIN_CLUSTER_SIZE) {
            return;
        }
        const members = Array.from(entry.jobs.values());
        members.sort((a, b) => b.errorCount48h - a.errorCount48h);
        let totalErrors = 0;
        for (const member of members) {
            totalErrors += member.errorCount48h;
        }
        clusters.push({ pattern, representative: entry.representative, jobs: members, totalErrors });
    });
    // Sort: distinct-job count desc, then total errors desc, then pattern asc
    clusters.sort((a, b) => {
        const byJobCount = b.jobs.length - a.jobs.length;
        if (byJobCount !== 0) {
            return byJobCount;
        }
        if (b.totalErrors !== a.totalErrors) {
            return b.totalErrors - a.totalErrors;
        }
        return a.pattern.localeCompare(b.pattern);
    });
    if (clusters.length > 0) {
        const [largest] = clusters;
        logger.info({ count: clusters.length, top: largest?.pattern.slice(0, 80), topJobs: largest?.jobs.length }, 'Failure clusters detected');
    }
    return clusters;
}
|
|
165
|
+
/**
 * Render a cluster summary for the hypothesizer prompt block. Empty
 * string when there are no clusters.
 *
 * At most 5 clusters are listed, each naming at most 5 jobs (the rest
 * are summarized as "+K more"). The representative error is flattened
 * to a single line and cut to 100 characters so a multi-line message
 * (e.g. one carrying a stack trace) cannot break the bullet list.
 *
 * Format:
 *   ### Cross-job failure clusters (last 48h)
 *   - "Prompt is too long (12345 tokens)" — 5 jobs (17 total errors): insight-check, outcome-grader, ...
 *   - "Reached maximum number of turns (25)" — 3 jobs (9 total errors): ...
 *   When a cluster of 3+ jobs hits the same pattern, prefer ONE root-cause proposal ...
 *
 * @param {FailureCluster[]} clusters - clusters, already sorted by the caller.
 * @returns {string} the markdown block followed by a blank line, or ''.
 */
export function formatClustersForHypothesizer(clusters) {
    if (!clusters || clusters.length === 0)
        return '';
    const MAX_CLUSTERS = 5;
    const MAX_JOB_NAMES = 5;
    const MAX_REPRESENTATIVE_LEN = 100;
    const lines = ['### Cross-job failure clusters (last 48h)'];
    for (const c of clusters.slice(0, MAX_CLUSTERS)) {
        const jobs = c.jobs ?? [];
        const jobNames = jobs.slice(0, MAX_JOB_NAMES).map(j => j.jobName).join(', ');
        const more = jobs.length > MAX_JOB_NAMES ? `, +${jobs.length - MAX_JOB_NAMES} more` : '';
        // Raw error text may span several lines; keep each cluster on one bullet.
        const flat = String(c.representative ?? '').replace(/\s+/g, ' ').trim();
        const rep = flat.length > MAX_REPRESENTATIVE_LEN ? flat.slice(0, MAX_REPRESENTATIVE_LEN) + '…' : flat;
        lines.push(`- "${rep}" — ${jobs.length} jobs (${c.totalErrors} total errors): ${jobNames}${more}`);
    }
    lines.push('When a cluster of 3+ jobs hits the same pattern, prefer ONE root-cause proposal ' +
        '(e.g. an advisor-rule, a prompt-override at agent or global scope, or a shared ' +
        'config change) over N per-job patches.');
    return lines.join('\n') + '\n\n';
}
|
|
190
|
+
//# sourceMappingURL=failure-clustering.js.map
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Skill quality scoring per Anthropic skill metrics (1.18.164).
|
|
3
|
+
*
|
|
4
|
+
* Anthropic's skill spec calls for tracking per-skill quality with a few
|
|
5
|
+
* specific metrics: trigger accuracy, success rate, average tool calls,
|
|
6
|
+
* average tokens, failure rate per workflow. Today we have the raw data
|
|
7
|
+
* (CronRunEntry stamps `skillsApplied: [{name, source}]` on every run
|
|
8
|
+
* since 1.18.85) but never aggregate it into a "how is this skill
|
|
9
|
+
* actually performing?" view.
|
|
10
|
+
*
|
|
11
|
+
* This module computes the metrics on demand from the existing run log —
|
|
12
|
+
* no new schema, no new persistence. The Skills page card surfaces the
|
|
13
|
+
* scores so the owner can spot:
|
|
14
|
+
* - Skills that auto-trigger but don't help (low trigger accuracy)
|
|
15
|
+
* - Skills that are pinned but consistently fail (low success rate)
|
|
16
|
+
* - Skills with no recent activity ("stale")
|
|
17
|
+
* - Skills with no data at all ("no-data" — fresh; may be unused)
|
|
18
|
+
*
|
|
19
|
+
* The grade is a coarse 4-bucket label optimized for "what should the
|
|
20
|
+
* owner do about this skill?" rather than a precise number. Detailed
|
|
21
|
+
* stats accompany so the owner can drill in.
|
|
22
|
+
*
|
|
23
|
+
* Why no SQLite table:
|
|
24
|
+
* - The data already exists in CronRunLog jsonl files
|
|
25
|
+
* - Recompute is cheap (one-time scan over recent jsonl)
|
|
26
|
+
* - Avoids a new schema migration + the risk of double-counting if
|
|
27
|
+
* we forget to write to it from one of the run paths
|
|
28
|
+
* - Owner isn't running this 100×/sec — the dashboard hits it once
|
|
29
|
+
* when the Skills page renders
|
|
30
|
+
*
|
|
31
|
+
* If the volume ever grows past ~50 skills × 500 runs/day, we can
|
|
32
|
+
* promote to SQLite. Until then, keep it simple.
|
|
33
|
+
*/
|
|
34
|
+
/**
 * Default rolling window, in days, for quality computation. Used whenever
 * a caller omits `options.windowDays`. Anthropic suggests a 30-day
 * evaluation horizon for skill metrics; that matches our cron-run-log
 * retention so we read what we have.
 */
export declare const DEFAULT_WINDOW_DAYS = 30;
|
|
38
|
+
/**
 * Aggregated quality metrics for one skill over a rolling window of cron
 * runs, plus a coarse grade meant to tell the owner whether the skill
 * needs attention.
 */
export interface SkillQualityScore {
    /** Skill identifier (the `name` field from frontmatter). */
    name: string;
    /** Window, in days, that the metrics cover. */
    windowDays: number;
    /** Total runs in the window where this skill was applied. */
    totalRuns: number;
    /** Of those, runs where the skill was explicitly pinned by the cron
     *  (applied with source 'pinned'). */
    pinnedRuns: number;
    /** Of those, runs where the skill was auto-matched by the search layer
     *  (applied with source 'auto'). */
    autoRuns: number;
    /** Runs we count as successful (status='ok' AND goalCheck didn't fail). */
    successRuns: number;
    /** Runs we count as failed (status in error/timeout/lost OR goalCheck.fail).
     *  Runs in any other status are counted in neither bucket, so
     *  successRuns + failureRuns may be smaller than totalRuns. */
    failureRuns: number;
    /** successRuns / totalRuns — null when totalRuns is 0. */
    successRate: number | null;
    /** Among auto-matched runs only, what fraction succeeded. Anthropic's
     *  "trigger accuracy" — how often the auto-match was the right call.
     *  null when there are no auto-matched runs in the window. */
    triggerAccuracy: number | null;
    /** Average duration in ms, rounded to an integer, across runs that
     *  completed (not 'running') and reported a positive duration.
     *  null when no run qualifies. */
    avgDurationMs: number | null;
    /** Average cost in USD across runs that report it; null when none do. */
    avgCostUsd: number | null;
    /** Most recent ISO timestamp this skill was applied to a run. */
    lastUsedAt: string | null;
    /**
     * Coarse 4-bucket label for owner attention, decided in this order:
     *  - 'no-data' — fewer than MIN_RUNS_FOR_GRADE runs in the window
     *  - 'stale' — enough runs, but the latest one is older than STALE_DAYS
     *  - 'underperforming' — enough runs, success rate below threshold
     *  - 'good' — enough runs, success rate at or above threshold
     */
    grade: 'good' | 'underperforming' | 'stale' | 'no-data';
    /** One-sentence reason for the grade — surfaces under the badge. */
    gradeReason: string;
}
|
|
76
|
+
/**
 * Compute quality scores for a single skill. Returns the aggregate even
 * when there's no data — graded 'no-data' so the dashboard can render
 * a clean empty state.
 *
 * @param skillName Skill name, matched exactly against the `name` of each
 *   run's `skillsApplied` entries.
 * @param options.windowDays Rolling window in days; defaults to
 *   DEFAULT_WINDOW_DAYS.
 * @param options.baseDir Directory that contains `cron/runs/*.jsonl`;
 *   defaults to the configured base directory.
 * @returns The aggregate metrics and grade for the skill.
 */
export declare function computeSkillQuality(skillName: string, options?: {
    windowDays?: number;
    baseDir?: string;
}): SkillQualityScore;
|
|
85
|
+
/**
 * Compute scores for every skill that appeared in *any* run within the
 * window. Returns one score per skill name, sorted by totalRuns desc
 * (most-used first), with ties broken by skill name. Skills that exist in
 * the vault but never ran will not appear — callers that need "every
 * skill" should merge with the skill-store listing themselves.
 *
 * @param options.windowDays Rolling window in days; defaults to
 *   DEFAULT_WINDOW_DAYS.
 * @param options.baseDir Directory that contains `cron/runs/*.jsonl`;
 *   defaults to the configured base directory.
 * @returns One score per skill name seen in the window; empty when no run
 *   in the window applied any skill.
 */
export declare function computeAllSkillQuality(options?: {
    windowDays?: number;
    baseDir?: string;
}): SkillQualityScore[];
|
|
96
|
+
//# sourceMappingURL=skill-quality.d.ts.map
|
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Skill quality scoring per Anthropic skill metrics (1.18.164).
|
|
3
|
+
*
|
|
4
|
+
* Anthropic's skill spec calls for tracking per-skill quality with a few
|
|
5
|
+
* specific metrics: trigger accuracy, success rate, average tool calls,
|
|
6
|
+
* average tokens, failure rate per workflow. Today we have the raw data
|
|
7
|
+
* (CronRunEntry stamps `skillsApplied: [{name, source}]` on every run
|
|
8
|
+
* since 1.18.85) but never aggregate it into a "how is this skill
|
|
9
|
+
* actually performing?" view.
|
|
10
|
+
*
|
|
11
|
+
* This module computes the metrics on demand from the existing run log —
|
|
12
|
+
* no new schema, no new persistence. The Skills page card surfaces the
|
|
13
|
+
* scores so the owner can spot:
|
|
14
|
+
* - Skills that auto-trigger but don't help (low trigger accuracy)
|
|
15
|
+
* - Skills that are pinned but consistently fail (low success rate)
|
|
16
|
+
* - Skills with no recent activity ("stale")
|
|
17
|
+
* - Skills with no data at all ("no-data" — fresh; may be unused)
|
|
18
|
+
*
|
|
19
|
+
* The grade is a coarse 4-bucket label optimized for "what should the
|
|
20
|
+
* owner do about this skill?" rather than a precise number. Detailed
|
|
21
|
+
* stats accompany so the owner can drill in.
|
|
22
|
+
*
|
|
23
|
+
* Why no SQLite table:
|
|
24
|
+
* - The data already exists in CronRunLog jsonl files
|
|
25
|
+
* - Recompute is cheap (one-time scan over recent jsonl)
|
|
26
|
+
* - Avoids a new schema migration + the risk of double-counting if
|
|
27
|
+
* we forget to write to it from one of the run paths
|
|
28
|
+
* - Owner isn't running this 100×/sec — the dashboard hits it once
|
|
29
|
+
* when the Skills page renders
|
|
30
|
+
*
|
|
31
|
+
* If the volume ever grows past ~50 skills × 500 runs/day, we can
|
|
32
|
+
* promote to SQLite. Until then, keep it simple.
|
|
33
|
+
*/
|
|
34
|
+
import path from 'node:path';
|
|
35
|
+
import pino from 'pino';
|
|
36
|
+
import { existsSync, readdirSync, readFileSync } from 'node:fs';
|
|
37
|
+
import { BASE_DIR } from '../config.js';
|
|
38
|
+
const logger = pino({ name: 'clementine.skill-quality' });
|
|
39
|
+
// ── Tunables ─────────────────────────────────────────────────────────
|
|
40
|
+
/** Default rolling window, in days, for quality computation. Used when a
 *  caller omits `options.windowDays`. Anthropic suggests a 30-day
 *  evaluation horizon for skill metrics; that matches our cron-run-log
 *  retention so we read what we have. */
export const DEFAULT_WINDOW_DAYS = 30;
/** Minimum runs before we hand out a grade. Below this, the skill is
 *  marked 'no-data' regardless of pass/fail to avoid grading from a
 *  sample of 1. */
const MIN_RUNS_FOR_GRADE = 3;
/** Stale threshold, in days — if the skill's most recent run is older
 *  than this, the grade becomes 'stale' regardless of past stats.
 *  NOTE(review): this equals DEFAULT_WINDOW_DAYS, and the run scanner stops
 *  at entries older than the window, so 'stale' looks reachable only when a
 *  caller passes a larger `windowDays` — confirm that is intended. */
const STALE_DAYS = 30;
/** Below this success-rate threshold a skill with enough runs is graded
 *  'underperforming'. 0.6 = "fails 4 in 10" — a reasonable trigger for
 *  the owner to investigate. */
const UNDERPERFORMING_SUCCESS_RATE = 0.6;
|
|
55
|
+
// ── Internals ────────────────────────────────────────────────────────
|
|
56
|
+
/**
 * Scan every per-job run log (`<baseDir>/cron/runs/*.jsonl`) and lazily
 * yield each parsed entry that falls inside the rolling window.
 *
 * Best-effort by design: a missing or unreadable directory, an unreadable
 * file, or an unparseable line is skipped silently so one bad log cannot
 * take down the dashboard.
 *
 * @param {number} windowDays Size of the window, in days, counted back from now.
 * @param {string} [baseDir=BASE_DIR] Directory that contains `cron/runs`.
 * @yields {object} Run entries, newest-first within each file. Entries whose
 *   `startedAt` cannot be parsed are still yielded, since their age is unknown.
 */
function* iterRecentRuns(windowDays, baseDir = BASE_DIR) {
    const runsDir = path.join(baseDir, 'cron', 'runs');
    if (!existsSync(runsDir))
        return;
    const cutoff = Date.now() - windowDays * 24 * 60 * 60 * 1000;
    let files;
    try {
        files = readdirSync(runsDir).filter(f => f.endsWith('.jsonl'));
    }
    catch {
        return;
    }
    for (const file of files) {
        let lines;
        try {
            lines = readFileSync(path.join(runsDir, file), 'utf-8').trim().split('\n').filter(Boolean);
        }
        catch {
            continue;
        }
        // Iterate newest-first; bail once we cross the cutoff (assumes
        // append-only writes, i.e. older entries sit earlier in the file).
        for (let i = lines.length - 1; i >= 0; i--) {
            let entry;
            try {
                entry = JSON.parse(lines[i]);
            }
            catch {
                continue;
            }
            // Valid JSON is not necessarily a run record: `null` would throw on
            // the property read below, and bare numbers/strings/arrays would be
            // handed to callers as if they were runs. Skip anything that is not
            // a plain object.
            if (entry === null || typeof entry !== 'object' || Array.isArray(entry))
                continue;
            const ts = Date.parse(entry.startedAt);
            if (Number.isFinite(ts) && ts < cutoff)
                break;
            yield entry;
        }
    }
}
|
|
94
|
+
/**
 * Success predicate used for skill scoring: the run must have finished
 * with status 'ok', and its goalCheck (when one is attached) must not
 * report 'fail'.
 *
 * @param {object} entry Parsed run-log entry.
 * @returns {boolean} true only for an 'ok' run without a failing goalCheck.
 */
function isRunSuccess(entry) {
    return entry.status === 'ok' && entry.goalCheck?.status !== 'fail';
}
|
|
103
|
+
/**
 * Terminal-failure predicate used for skill scoring. A run counts as failed
 * when it ended in 'error', 'timeout' or 'lost', or when it ended 'ok' but
 * its goalCheck reports 'fail'. Every other status (e.g. 'running',
 * 'skipped') is neither a success nor a failure, so it pulls neither ratio.
 *
 * @param {object} entry Parsed run-log entry.
 * @returns {boolean} true when the run terminally failed.
 */
function isRunFailure(entry) {
    switch (entry.status) {
        case 'error':
        case 'timeout':
        case 'lost':
            return true;
        case 'ok':
            return entry.goalCheck?.status === 'fail';
        default:
            return false;
    }
}
|
|
112
|
+
// ── Public API ────────────────────────────────────────────────────────
|
|
113
|
+
/**
 * Compute quality scores for a single skill. Returns the aggregate even
 * when there's no data — graded 'no-data' so the dashboard can render
 * a clean empty state.
 *
 * @param {string} skillName Skill name, matched exactly against the `name`
 *   of each run's `skillsApplied` entries.
 * @param {{windowDays?: number, baseDir?: string}} [options]
 *   `windowDays` defaults to DEFAULT_WINDOW_DAYS; `baseDir` is forwarded to
 *   the run-log scanner (which falls back to the configured base directory).
 * @returns {object} Counts, rates, averages, `lastUsedAt` (ISO string or
 *   null), plus `grade` and a one-sentence `gradeReason`.
 */
export function computeSkillQuality(skillName, options = {}) {
    const windowDays = options.windowDays ?? DEFAULT_WINDOW_DAYS;
    let total = 0, pinned = 0, auto = 0, success = 0, failure = 0;
    let durationSumMs = 0, durationN = 0;
    let costSum = 0, costN = 0;
    let autoSuccess = 0;
    let lastUsedAt = null;
    for (const entry of iterRecentRuns(windowDays, options.baseDir)) {
        // Tolerate malformed log lines: `skillsApplied` may be missing, not an
        // array, or contain null elements. `s?.name` mirrors the lookup used by
        // computeAllSkillQuality so both functions agree on what counts.
        const appliedList = Array.isArray(entry.skillsApplied) ? entry.skillsApplied : [];
        const applied = appliedList.find(s => s?.name === skillName);
        if (!applied)
            continue;
        total++;
        const succeeded = isRunSuccess(entry);
        if (succeeded)
            success++;
        if (isRunFailure(entry))
            failure++;
        if (applied.source === 'pinned') {
            pinned++;
        }
        else if (applied.source === 'auto') {
            // Auto-matched runs are the denominator for trigger accuracy.
            auto++;
            if (succeeded)
                autoSuccess++;
        }
        if (typeof entry.durationMs === 'number' && entry.durationMs > 0 && entry.status !== 'running') {
            durationSumMs += entry.durationMs;
            durationN++;
        }
        // Number.isFinite rejects NaN/Infinity, which would poison the average.
        if (Number.isFinite(entry.totalCostUsd)) {
            costSum += entry.totalCostUsd;
            costN++;
        }
        // Only string timestamps are comparable; this also keeps lastUsedAt
        // `string | null` (never undefined) when an entry lacks startedAt.
        if (typeof entry.startedAt === 'string' && (!lastUsedAt || entry.startedAt > lastUsedAt)) {
            lastUsedAt = entry.startedAt;
        }
    }
    const successRate = total > 0 ? success / total : null;
    const triggerAccuracy = auto > 0 ? autoSuccess / auto : null;
    const avgDurationMs = durationN > 0 ? Math.round(durationSumMs / durationN) : null;
    const avgCostUsd = costN > 0 ? costSum / costN : null;
    // Grade decision — order matters: 'no-data' beats everything for
    // small samples; 'stale' beats 'underperforming' for skills that
    // historically did fine but stopped firing.
    let grade = 'no-data';
    let gradeReason = `Only ${total} run${total === 1 ? '' : 's'} in the last ${windowDays}d — not enough to grade.`;
    if (total >= MIN_RUNS_FOR_GRADE) {
        if (lastUsedAt) {
            const lastMs = Date.parse(lastUsedAt);
            if (Number.isFinite(lastMs) && Date.now() - lastMs > STALE_DAYS * 24 * 60 * 60 * 1000) {
                grade = 'stale';
                gradeReason = `No runs in the last ${STALE_DAYS} days. Consider archiving or revisiting triggers.`;
            }
            else if (successRate !== null && successRate < UNDERPERFORMING_SUCCESS_RATE) {
                grade = 'underperforming';
                gradeReason = `${(successRate * 100).toFixed(0)}% success over ${total} runs — investigate failures + tighten triggers or body.`;
            }
            else {
                grade = 'good';
                gradeReason = `${successRate !== null ? (successRate * 100).toFixed(0) : '?'}% success over ${total} runs.`;
            }
        }
        else {
            // Defensive — only reachable when no matching entry carries a
            // startedAt string. Keep 'no-data', but do not claim the sample is
            // too small when it is not.
            grade = 'no-data';
            gradeReason = `${total} runs in the last ${windowDays}d, but none has a start timestamp — cannot grade.`;
        }
    }
    return {
        name: skillName,
        windowDays,
        totalRuns: total,
        pinnedRuns: pinned,
        autoRuns: auto,
        successRuns: success,
        failureRuns: failure,
        successRate,
        triggerAccuracy,
        avgDurationMs,
        avgCostUsd,
        lastUsedAt,
        grade,
        gradeReason,
    };
}
|
|
202
|
+
/**
 * Score every skill that was applied to at least one run inside the window.
 *
 * The result holds one score per distinct skill name, ordered by totalRuns
 * descending and then by name. Skills that live in the vault but never ran
 * are absent — callers that need "every skill" should merge this with the
 * skill-store listing themselves.
 *
 * @param {{windowDays?: number, baseDir?: string}} [options]
 *   `windowDays` defaults to DEFAULT_WINDOW_DAYS; `baseDir` is forwarded to
 *   the run-log scanner.
 * @returns {object[]} Scores as produced by computeSkillQuality.
 */
export function computeAllSkillQuality(options = {}) {
    const horizonDays = options.windowDays ?? DEFAULT_WINDOW_DAYS;
    // Pass 1: gather the distinct skill names seen in the window.
    const skillNames = new Set();
    for (const run of iterRecentRuns(horizonDays, options.baseDir)) {
        for (const applied of run.skillsApplied ?? []) {
            const appliedName = applied?.name;
            if (appliedName) {
                skillNames.add(appliedName);
            }
        }
    }
    // Pass 2: score each name. Re-scanning the logs per skill is wasteful
    // but simple; with ~50 skills × 2000-line files this is ms-cheap.
    const ranked = Array.from(skillNames, (appliedName) => computeSkillQuality(appliedName, options));
    ranked.sort((left, right) => {
        const byRuns = right.totalRuns - left.totalRuns;
        return byRuns || left.name.localeCompare(right.name);
    });
    if (ranked.length > 0) {
        const [top] = ranked;
        logger.debug({ count: ranked.length, top: top?.name, topRuns: top?.totalRuns }, 'Skill quality scored');
    }
    return ranked;
}
|
|
231
|
+
//# sourceMappingURL=skill-quality.js.map
|