clementine-agent 1.18.161 → 1.18.163

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,135 @@
1
+ /**
2
+ * runSkill — the canonical Skill execution primitive (1.18.162).
3
+ *
4
+ * Closes Skills Runtime C-2 from the Skills-First redesign.
5
+ *
6
+ * Today (pre-1.18.162) a pinned skill is fed into a cron prompt as a
7
+ * markdown context block and its `clementine.tools.allow` list is UNIONED
8
+ * into the cron's allowedTools (1.18.121 widening). That is permissive,
9
+ * not enforced — a skill that says "I only use Bash + WebFetch" can still
10
+ * call any tool the surrounding cron allows.
11
+ *
12
+ * `runSkill(name, options)` is the alternative path: a sub-call where the
13
+ * skill's `tools.allow` is a HARD allowlist (only those tools, plus a
14
+ * minimal core set), `{{var}}` placeholders in the body are substituted
15
+ * from `options.inputs`, and `clementine.success.schema` is ajv-validated
16
+ * post-run.
17
+ *
18
+ * Why a separate primitive (and not a flag on the existing widening path):
19
+ * - Caller intent is different. Pinned-skills-as-context is "give the LLM
20
+ * reference material"; runSkill is "do this specific procedure now."
21
+ * - Hard enforcement requires constructing the SDK call ourselves, not
22
+ * reusing a cron-job's effective allowlist.
23
+ * - Inputs/success are skill-call concepts, not cron concepts.
24
+ *
25
+ * Surfaced as the MCP tool `run_skill(name, inputs?)` so chat + cron +
26
+ * sub-agents converge on one primitive.
27
+ */
28
+ import type { Skill } from '../types.js';
29
+ export interface RunSkillOptions {
30
+ /** Mustache-style `{{var}}` substitutions for the skill body. */
31
+ inputs?: Record<string, string | number | boolean>;
32
+ /** Optional caller context appended after the skill body
33
+ * (e.g. the user's request, the cron firing context). */
34
+ context?: string;
35
+ /** Stable session key for transcript mirroring. Defaults to a synthesized
36
+ * key derived from the skill name + timestamp. */
37
+ sessionKey?: string;
38
+ /** Source classification for telemetry. Defaults to 'skill'. */
39
+ source?: string;
40
+ /** Optional model override. */
41
+ model?: string;
42
+ /** Hard turn cap. Falls back to `clementine.limits.maxTurns` if set. */
43
+ maxTurns?: number;
44
+ /** Hard budget cap (USD). Falls back to `clementine.limits.maxBudgetUsd`. */
45
+ maxBudgetUsd?: number;
46
+ /** Project work dir for per-project skill precedence (mirrors getSkill's
47
+ * `projectWorkDir` parameter — when set, project-scoped skills shadow
48
+ * global ones with the same name). */
49
+ projectWorkDir?: string;
50
+ /** Skip success.schema validation even if the skill declares one. */
51
+ skipValidation?: boolean;
52
+ /** Streaming callback for partial assistant text. */
53
+ onText?: (chunk: string) => void | Promise<void>;
54
+ /** Abort signal — cancels the SDK stream when triggered. */
55
+ abortSignal?: AbortSignal;
56
+ }
57
+ export interface RunSkillResult {
58
+ ok: boolean;
59
+ /** Final text response from the SDK. */
60
+ output: string;
61
+ /** Cost in USD. */
62
+ cost?: number;
63
+ /** Number of agentic turns. */
64
+ turns?: number;
65
+ /** SDK session id — capture for resume. */
66
+ sessionId?: string;
67
+ /** SDK runId — joins to the Event store. */
68
+ runId?: string;
69
+ /** Schema validation result when the skill declared `clementine.success.schema`. */
70
+ validation?: {
71
+ /** True when validation actually ran (schema present + JSON extractable). */
72
+ tried: boolean;
73
+ /** True when the response validated against the schema. */
74
+ pass: boolean;
75
+ /** First few ajv error messages. */
76
+ errors: string[];
77
+ };
78
+ /** The hard allowlist that was passed to the SDK. */
79
+ effectiveTools?: string[];
80
+ /** Failure reason when ok=false. */
81
+ error?: string;
82
+ }
83
+ /**
84
+ * Substitute `{{var}}` placeholders in `body` from `inputs`. Missing
85
+ * keys are left as-is (so the LLM still sees the placeholder and can
86
+ * complain) rather than silently dropped — a missing input is more
87
+ * recoverable as visible text than as a stripped string.
88
+ */
89
+ export declare function applyMustache(body: string, inputs: Record<string, string | number | boolean> | undefined): string;
90
+ /**
91
+ * Compute the HARD allowlist for a skill call.
92
+ *
93
+ * Combines, in order:
94
+ * 1. The skill's `clementine.tools.allow` list (or [] if absent)
95
+ * 2. Tool names auto-extracted from the skill body matching `mcp__*__*`
96
+ * 3. SKILL_BASELINE_TOOLS so the SDK can read files / dispatch subagents
97
+ *
98
+ * Then subtracts anything in `clementine.tools.deny` (deny wins).
99
+ *
100
+ * Returns a deduped array. With no `tools.allow` and no MCP refs the result
101
+ * is just SKILL_BASELINE_TOOLS minus denials (an empty list is treated by the
102
+ * SDK as "deny everything"); callers should still set a sensible `tools.allow`.
103
+ */
104
+ export declare function computeSkillAllowlist(skill: Skill): string[];
105
+ /**
106
+ * Build the prompt the SDK actually executes for a skill call.
107
+ *
108
+ * Format:
109
+ * <skill body, with mustache substitutions applied>
110
+ *
111
+ * ## Caller context
112
+ * <options.context> ← when provided
113
+ *
114
+ * The skill body itself becomes the procedure; the optional context is
115
+ * the immediate "what triggered this call" frame. Bundled files (other
116
+ * .md siblings under the skill folder) are NOT inlined — the SDK can
117
+ * read them via `Read` (a baseline tool, available unless the skill denies it).
118
+ */
119
+ export declare function buildSkillPrompt(skill: Skill, inputs: Record<string, string | number | boolean> | undefined, context: string | undefined): string;
120
+ /**
121
+ * Run a skill as a hard-allowlisted sub-call. Returns a structured result.
122
+ *
123
+ * The skill is loaded via `getSkill()` (project-precedence honored when
124
+ * `options.projectWorkDir` is set). Its body is mustache-rendered
125
+ * with `inputs`, then sent to the SDK with an allowlist computed from
126
+ * `clementine.tools.allow` + auto-extracted MCP refs + a small baseline.
127
+ * After the SDK returns, `clementine.success.schema` (when set) is
128
+ * ajv-validated against the response.
129
+ *
130
+ * This function never throws — failures (skill not found, SDK error,
131
+ * timeout) are returned as `{ ok: false, error }`. The caller (chat,
132
+ * cron, sub-agent, MCP tool) decides how to surface that.
133
+ */
134
+ export declare function runSkill(name: string, options?: RunSkillOptions): Promise<RunSkillResult>;
135
+ //# sourceMappingURL=run-skill.d.ts.map
@@ -0,0 +1,267 @@
1
+ /**
2
+ * runSkill — the canonical Skill execution primitive (1.18.162).
3
+ *
4
+ * Closes Skills Runtime C-2 from the Skills-First redesign.
5
+ *
6
+ * Today (pre-1.18.162) a pinned skill is fed into a cron prompt as a
7
+ * markdown context block and its `clementine.tools.allow` list is UNIONED
8
+ * into the cron's allowedTools (1.18.121 widening). That is permissive,
9
+ * not enforced — a skill that says "I only use Bash + WebFetch" can still
10
+ * call any tool the surrounding cron allows.
11
+ *
12
+ * `runSkill(name, options)` is the alternative path: a sub-call where the
13
+ * skill's `tools.allow` is a HARD allowlist (only those tools, plus a
14
+ * minimal core set), `{{var}}` placeholders in the body are substituted
15
+ * from `options.inputs`, and `clementine.success.schema` is ajv-validated
16
+ * post-run.
17
+ *
18
+ * Why a separate primitive (and not a flag on the existing widening path):
19
+ * - Caller intent is different. Pinned-skills-as-context is "give the LLM
20
+ * reference material"; runSkill is "do this specific procedure now."
21
+ * - Hard enforcement requires constructing the SDK call ourselves, not
22
+ * reusing a cron-job's effective allowlist.
23
+ * - Inputs/success are skill-call concepts, not cron concepts.
24
+ *
25
+ * Surfaced as the MCP tool `run_skill(name, inputs?)` so chat + cron +
26
+ * sub-agents converge on one primitive.
27
+ */
28
+ import path from 'node:path';
29
+ import pino from 'pino';
30
+ import { getSkill } from './skill-store.js';
31
+ import { runAgent } from './run-agent.js';
32
+ const logger = pino({ name: 'clementine.run-skill' });
33
+ // ── Mustache substitution ─────────────────────────────────────────────
34
/** Pattern for a `{{var_name}}` placeholder; whitespace inside the braces
 * is tolerated. The captured name has the shape `[a-zA-Z_][a-zA-Z0-9_-]*`,
 * the same identifier shape used for YAML frontmatter `inputs:` keys. */
const MUSTACHE = /\{\{\s*([a-zA-Z_][a-zA-Z0-9_-]*)\s*\}\}/g;
/**
 * Fill `{{var}}` placeholders in a skill body from the supplied inputs.
 *
 * A placeholder whose name is not an own key of `inputs` is emitted
 * unchanged, so the model still sees it and can report the missing value
 * instead of receiving silently stripped text.
 *
 * @param body   Skill body text containing zero or more placeholders.
 * @param inputs Map of placeholder name to value; values are stringified.
 * @returns The body with every known placeholder replaced. When `inputs`
 *          is absent or has no keys, `body` is returned as-is.
 */
export function applyMustache(body, inputs) {
    if (!inputs) {
        return body;
    }
    if (Object.keys(inputs).length === 0) {
        return body;
    }
    // Own-property check so inherited names such as `toString` are never
    // treated as supplied inputs.
    const resolve = (placeholder, name) => Object.prototype.hasOwnProperty.call(inputs, name)
        ? String(inputs[name])
        : placeholder;
    return body.replace(MUSTACHE, resolve);
}
54
+ // ── Allowlist computation ─────────────────────────────────────────────
55
/** Tools added to every skill's allowlist regardless of its `tools.allow`
 * (a skill can still remove them via `tools.deny`). Read/Glob/Grep are
 * non-mutating; Agent is included so the SDK can dispatch its own internal
 * subagents. */
const SKILL_BASELINE_TOOLS = ['Agent', 'Read', 'Glob', 'Grep'];
/** Matches `mcp__<server>__<tool>` references in skill bodies. Used to
 * auto-include MCP tool names the skill body mentions but which the author
 * did not list under `tools.allow`. Same pattern as run-agent-cron.ts:150. */
const MCP_TOOL_REF = /mcp__([A-Za-z0-9-]+(?:_[A-Za-z0-9-]+)*)__[A-Za-z0-9_-]+/g;
/**
 * Compute the HARD allowlist for a skill call.
 *
 * Combines, in order:
 *   1. The skill's `clementine.tools.allow` list (or [] if absent)
 *   2. Tool names auto-extracted from the skill body matching `mcp__*__*`
 *   3. SKILL_BASELINE_TOOLS
 *
 * Then subtracts anything in `clementine.tools.deny` (deny wins, including
 * over baseline tools).
 *
 * Because the baseline is always merged in, a skill with no `tools.allow`
 * and no MCP references still gets the baseline tools; the result is only
 * empty when every baseline tool is denied.
 *
 * Entries in `allow` / `deny` that are not non-empty strings (e.g. a stray
 * number or null in the frontmatter) are ignored rather than forwarded.
 *
 * @param skill Skill object; reads `frontmatter.clementine.tools` and `body`.
 * @returns Deduplicated tool names in first-seen order.
 */
export function computeSkillAllowlist(skill) {
    const tools = skill.frontmatter?.clementine?.tools;
    // Frontmatter is author-written, so keep only usable string names.
    const toolNames = (list) => Array.isArray(list)
        ? list.filter((entry) => typeof entry === 'string' && entry.length > 0)
        : [];
    const declared = toolNames(tools?.allow);
    const denied = new Set(toolNames(tools?.deny));
    // A missing body is treated as empty text instead of being coerced to
    // the string "undefined".
    const body = typeof skill.body === 'string' ? skill.body : '';
    // matchAll iterates over its own copy of the regex, so the shared /g
    // pattern's lastIndex is never advanced between calls.
    const fromBody = Array.from(body.matchAll(MCP_TOOL_REF), (match) => match[0]);
    const merged = new Set([
        ...declared,
        ...fromBody,
        ...SKILL_BASELINE_TOOLS,
    ]);
    for (const name of denied) {
        merged.delete(name);
    }
    return [...merged];
}
100
+ // ── Prompt builder ────────────────────────────────────────────────────
101
/**
 * Assemble the prompt text that is executed for a skill call.
 *
 * Layout:
 *   <skill body with `{{var}}` placeholders rendered from `inputs`>
 *
 *   ## Caller context
 *   <trimmed context>            (section present only when context is non-blank)
 *
 * The rendered body is the procedure; the optional context describes what
 * triggered this particular call. Sibling files bundled with the skill are
 * not inlined here.
 *
 * @param skill   Skill whose `body` is rendered.
 * @param inputs  Placeholder values forwarded to `applyMustache`.
 * @param context Optional caller context; ignored when empty or whitespace.
 * @returns The prompt string.
 */
export function buildSkillPrompt(skill, inputs, context) {
    const procedure = applyMustache(skill.body, inputs);
    const frame = context ? context.trim() : '';
    if (!frame) {
        return procedure;
    }
    return `${procedure}\n\n## Caller context\n\n${frame}\n`;
}
121
+ // ── Schema validation ─────────────────────────────────────────────────
122
/**
 * Best-effort JSON extraction from free-form model output.
 *
 * Candidates are tried in this order and the first one that parses wins:
 *   1. the whole text, trimmed;
 *   2. the contents of the first fenced ```json block;
 *   3. the span from the first `{` to the last `}`.
 *
 * Mirrors goal-evaluator.ts so skill authors get the same forgiving
 * behavior as goalCheck.
 *
 * @param text Raw response text.
 * @returns The parsed value, or `null` when the text is empty or no
 *          candidate parses.
 */
function extractJson(text) {
    if (!text) {
        return null;
    }
    // Parse one candidate, reporting success separately from the value so
    // that legitimately falsy JSON (false, 0, null) is still returned.
    const attempt = (candidate) => {
        try {
            return { parsed: true, value: JSON.parse(candidate) };
        }
        catch {
            return { parsed: false, value: null };
        }
    };
    const whole = attempt(text.trim());
    if (whole.parsed) {
        return whole.value;
    }
    const fenceMatch = /```json\s*([\s\S]*?)```/i.exec(text);
    if (fenceMatch?.[1]) {
        const fromFence = attempt(fenceMatch[1]);
        if (fromFence.parsed) {
            return fromFence.value;
        }
    }
    const open = text.indexOf('{');
    const close = text.lastIndexOf('}');
    if (open !== -1 && close > open) {
        const fromBraces = attempt(text.slice(open, close + 1));
        if (fromBraces.parsed) {
            return fromBraces.value;
        }
    }
    return null;
}
150
/**
 * Validate a skill's text output against its declared success schema.
 *
 * @param output Raw response text; JSON is pulled out via `extractJson`.
 * @param schema JSON schema handed to ajv's `compile`.
 * @returns `{ tried, pass, errors }`:
 *   - `tried: false` when no JSON could be extracted from `output`;
 *   - otherwise `tried: true`, with `pass` reflecting the validator result
 *     and `errors` holding at most five "<instancePath> <message>" strings;
 *   - if loading ajv or compiling/running the schema throws, `pass` is
 *     false and `errors` carries a single "schema compile error" entry.
 */
async function validateSkillOutput(output, schema) {
    const candidate = extractJson(output);
    if (candidate === null) {
        return { tried: false, pass: false, errors: [] };
    }
    // One readable line per ajv error object.
    const describe = (issue) => {
        const where = issue.instancePath || '';
        const what = issue.message || 'invalid';
        return where ? `${where} ${what}` : what;
    };
    try {
        // Lazy import: ajv pulls in ~150KB and most callers won't have a schema.
        // Default-export interop matches goal-evaluator.ts:75 — ajv@8 is CJS
        // and the ESM bridge sometimes lands the constructor on .default.
        const ajvModule = await import('ajv');
        const Ajv = ajvModule.default ?? ajvModule;
        const ajv = new Ajv({ allErrors: true, strict: false });
        const validate = ajv.compile(schema);
        const accepted = validate(candidate);
        const issues = validate.errors ?? ajv.errors ?? [];
        return {
            tried: true,
            pass: Boolean(accepted),
            errors: issues.slice(0, 5).map((issue) => describe(issue)),
        };
    }
    catch (err) {
        return { tried: true, pass: false, errors: [`schema compile error: ${err}`] };
    }
}
178
+ // ── The primitive ─────────────────────────────────────────────────────
179
/**
 * Run a skill as a hard-allowlisted sub-call. Returns a structured result.
 *
 * The skill is loaded via `getSkill()` (project precedence is honored when
 * `options.projectWorkDir` is set). Its body is mustache-rendered with
 * `options.inputs`, then sent to `runAgent` with an allowlist computed from
 * `clementine.tools.allow` + auto-extracted MCP refs + a small baseline.
 * After the call returns, `clementine.success.schema` (when set and
 * `options.skipValidation` is not) is validated against the response.
 *
 * This function is meant never to reject: skill lookup, prompt/allowlist
 * preparation, the SDK call and schema validation are each guarded, and
 * failures are returned as `{ ok: false, error }`. A schema mismatch does
 * NOT flip `ok` to false; it is reported through `validation` so the
 * caller (chat, cron, sub-agent, MCP tool) decides how to surface it.
 *
 * @param name    Skill name to look up.
 * @param options Optional inputs, context, limits, model and callbacks.
 * @returns A result carrying the output text, cost/turn/session metadata,
 *          the effective allowlist, and the validation outcome if one ran.
 */
export async function runSkill(name, options = {}) {
    // Lookup is guarded so a throwing skill store becomes a returned error
    // instead of a rejected promise.
    let skill;
    try {
        skill = getSkill(name, {
            ...(options.projectWorkDir ? { projectWorkDir: options.projectWorkDir } : {}),
        });
    }
    catch (err) {
        logger.error({ err, skill: name }, 'runSkill: skill lookup failed');
        return {
            ok: false,
            output: '',
            error: `Skill load error: ${err}`,
        };
    }
    if (!skill) {
        return {
            ok: false,
            output: '',
            error: `Skill not found: ${name}`,
        };
    }
    // A malformed skill (e.g. a non-string body) can make preparation throw.
    let effectiveTools;
    let prompt;
    try {
        effectiveTools = computeSkillAllowlist(skill);
        prompt = buildSkillPrompt(skill, options.inputs, options.context);
    }
    catch (err) {
        logger.error({ err, skill: name }, 'runSkill: failed to prepare skill call');
        return {
            ok: false,
            output: '',
            error: `Skill preparation error: ${err}`,
        };
    }
    // Caller-supplied limits take precedence over the skill's own limits.
    const limits = skill.frontmatter?.clementine?.limits;
    const maxTurns = options.maxTurns ?? limits?.maxTurns;
    const maxBudgetUsd = options.maxBudgetUsd ?? limits?.maxBudgetUsd;
    const sessionKey = options.sessionKey
        ?? `skill:${name}:${Date.now().toString(36)}`;
    // Surface the skill folder to the SDK via additionalDirectories so
    // bundled scripts (skill/scripts/*.py) are reachable for `Bash` calls.
    // Folder-form skills only — flat skills have no siblings worth surfacing.
    const additionalDirectories = skill.layout === 'folder' ? [path.dirname(skill.filePath)] : undefined;
    logger.info({
        skill: name,
        tools: effectiveTools,
        maxTurns,
        maxBudgetUsd,
        inputKeys: Object.keys(options.inputs ?? {}),
        hasContext: !!options.context,
    }, 'runSkill: invoking');
    let runResult;
    try {
        // Optional settings are only included when actually provided.
        const sdkOpts = {
            sessionKey,
            source: options.source ?? 'skill',
            allowedTools: effectiveTools,
            ...(options.model ? { model: options.model } : {}),
            ...(typeof maxTurns === 'number' ? { maxTurns } : {}),
            ...(typeof maxBudgetUsd === 'number' ? { maxBudgetUsd } : {}),
            ...(additionalDirectories ? { additionalDirectories } : {}),
            ...(options.onText ? { onText: options.onText } : {}),
            ...(options.abortSignal ? { abortSignal: options.abortSignal } : {}),
        };
        runResult = await runAgent(prompt, sdkOpts);
    }
    catch (err) {
        logger.error({ err, skill: name }, 'runSkill: SDK call failed');
        return {
            ok: false,
            output: '',
            effectiveTools,
            error: `SDK error: ${err}`,
        };
    }
    // Tolerate a missing result / missing text so the declared `output: string`
    // contract holds and reading the result cannot throw.
    const output = runResult?.text ?? '';
    // Schema validation — only when the skill declared one and the caller
    // didn't opt out. We do not flip ok=false on schema fail; we surface
    // the result so the caller can decide. (A cron may want to retry; a
    // chat user just sees a "schema mismatch" badge.)
    let validation;
    const successSchema = skill.frontmatter?.clementine?.success?.schema;
    if (!options.skipValidation && successSchema) {
        try {
            validation = await validateSkillOutput(output, successSchema);
        }
        catch (err) {
            // Report an unexpected validator failure as a failed validation
            // rather than letting it reject the whole call.
            validation = { tried: true, pass: false, errors: [`validation error: ${err}`] };
        }
    }
    return {
        ok: true,
        output,
        cost: runResult?.totalCostUsd,
        turns: runResult?.numTurns,
        sessionId: runResult?.sessionId,
        runId: runResult?.runId,
        effectiveTools,
        ...(validation ? { validation } : {}),
    };
}
267
+ //# sourceMappingURL=run-skill.js.map
@@ -19,6 +19,7 @@ import { listAllGoals } from '../tools/shared.js';
19
19
  import { MemoryStore } from '../memory/store.js';
20
20
  import { ANTHROPIC_SKILL_NAME_PATTERN } from './skill-store.js';
21
21
  import { recordApprovalSignal, formatApprovalSignalsForHypothesizer } from './approval-signals.js';
22
+ import { clusterBrokenJobs, formatClustersForHypothesizer } from '../gateway/failure-clustering.js';
22
23
  const logger = pino({ name: 'clementine.self-improve' });
23
24
  // ── Defaults ─────────────────────────────────────────────────────────
24
25
  const DEFAULT_CONFIG = {
@@ -1102,6 +1103,18 @@ export class SelfImproveLoop {
1102
1103
  // owner has approved, away from those they've denied. Empty string for
1103
1104
  // fresh installs, which keeps the prompt clean.
1104
1105
  const approvalSignalsText = formatApprovalSignalsForHypothesizer();
1106
+ // Cross-job failure clusters (1.18.163) — when ≥3 jobs hit the same
1107
+ // normalized error pattern in 48h, surface ONE cluster summary so
1108
+ // the hypothesizer proposes a root-cause fix instead of N per-job
1109
+ // patches. Empty string when no cluster meets the threshold.
1110
+ let failureClusterText = '';
1111
+ try {
1112
+ const clusters = clusterBrokenJobs();
1113
+ failureClusterText = formatClustersForHypothesizer(clusters);
1114
+ }
1115
+ catch (err) {
1116
+ logger.warn({ err }, 'Failed to compute failure clusters — proceeding without them');
1117
+ }
1105
1118
  // ── Step 1: Analysis — identify top opportunities from metrics (no config dumps) ──
1106
1119
  const analysisPrompt = `You are Clementine's self-improvement strategist. Analyze the performance data below and identify the top 3 improvement opportunities.\n\n` +
1107
1120
  `## Recent Performance Data (last 7 days)\n` +
@@ -1119,6 +1132,7 @@ export class SelfImproveLoop {
1119
1132
  diversityConstraint +
1120
1133
  agentFocusText +
1121
1134
  soulCandidatesText +
1135
+ (failureClusterText ? `\n${failureClusterText}` : '') +
1122
1136
  (approvalSignalsText ? `\n${approvalSignalsText}` : '') +
1123
1137
  `\n## Instructions\n` +
1124
1138
  `Propose **1-3 concrete, high-impact improvements** the owner should review today — no fewer (aim for at least one actionable suggestion when data warrants it), no more (the owner reads each proposal manually and you'll overwhelm them). Rank by expected impact; drop anything below "solid idea".\n\n` +
@@ -11407,7 +11407,7 @@ If the tool returns nothing or errors, return an empty array \`[]\`.`,
11407
11407
  res.status(500).json({ error: String(err) });
11408
11408
  }
11409
11409
  });
11410
- app.get('/api/self-improve', (_req, res) => {
11410
+ app.get('/api/self-improve', async (_req, res) => {
11411
11411
  const siDir = path.join(BASE_DIR, 'self-improve');
11412
11412
  const stateFile = path.join(siDir, 'state.json');
11413
11413
  const logFile = path.join(siDir, 'experiment-log.jsonl');
@@ -11472,7 +11472,18 @@ If the tool returns nothing or errors, return an empty array \`[]\`.`,
11472
11472
  }
11473
11473
  catch { /* ignore */ }
11474
11474
  }
11475
- res.json({ state, experiments, pending, triggers, verifications });
11475
+ // 1.18.163 cross-job failure clusters (≥3 jobs hitting the same
11476
+ // normalized error pattern in 48h). Computed on demand from
11477
+ // computeBrokenJobs(); no schema, no persistence. The Self-Improve
11478
+ // tab surfaces this so the owner sees "5 jobs hit X — propose one
11479
+ // root-cause fix" instead of N per-job rows.
11480
+ let clusters = [];
11481
+ try {
11482
+ const { clusterBrokenJobs } = await import('../gateway/failure-clustering.js');
11483
+ clusters = clusterBrokenJobs();
11484
+ }
11485
+ catch { /* non-fatal — empty clusters list */ }
11486
+ res.json({ state, experiments, pending, triggers, verifications, clusters });
11476
11487
  });
11477
11488
  app.post('/api/self-improve/run', async (_req, res) => {
11478
11489
  try {
@@ -19940,6 +19951,13 @@ if('serviceWorker' in navigator){navigator.serviceWorker.getRegistrations().then
19940
19951
  <div class="empty-state" style="padding:14px">No active failures &mdash; nothing has tripped 3+ consecutive errors.</div>
19941
19952
  </div>
19942
19953
  </div>
19954
+ <div class="card" style="margin-top:16px" id="si-clusters-card" hidden>
19955
+ <div class="card-header" style="display:flex;align-items:center;justify-content:space-between">
19956
+ <span>Cross-job failure clusters <span style="font-weight:normal;font-size:11px;color:var(--text-muted)">&middot; 3+ jobs hitting the same error pattern (last 48h)</span></span>
19957
+ <span class="tab-badge" id="tab-si-clusters" style="background:#a855f7;color:#fff">0</span>
19958
+ </div>
19959
+ <div class="card-body" id="si-clusters-list" style="padding:0"></div>
19960
+ </div>
19943
19961
  <div class="card" style="margin-top:16px">
19944
19962
  <div class="card-header" style="display:flex;align-items:center;justify-content:space-between">
19945
19963
  <span>Verifying fixes</span>
@@ -40500,6 +40518,7 @@ async function refreshSelfImprove() {
40500
40518
  const pending = d.pending || [];
40501
40519
  const triggers = d.triggers || [];
40502
40520
  const verifications = d.verifications || [];
40521
+ const clusters = d.clusters || [];
40503
40522
 
40504
40523
  // Update tab badge — combine human-attention queues so the sidebar
40505
40524
  // count reflects "things that need you to look at", not just proposals.
@@ -40537,6 +40556,36 @@ async function refreshSelfImprove() {
40537
40556
  }
40538
40557
  }
40539
40558
 
40559
+ // 1.18.163 — cross-job failure clusters (≥3 jobs hitting the same
40560
+ // normalized pattern). Hidden when the list is empty so the card
40561
+ // doesn't take up space on a healthy install.
40562
+ const clustersCard = document.getElementById('si-clusters-card');
40563
+ const clustersList = document.getElementById('si-clusters-list');
40564
+ const clustersBadge = document.getElementById('tab-si-clusters');
40565
+ if (clustersCard && clustersList) {
40566
+ if (clusters.length === 0) {
40567
+ clustersCard.hidden = true;
40568
+ } else {
40569
+ clustersCard.hidden = false;
40570
+ if (clustersBadge) clustersBadge.textContent = clusters.length;
40571
+ clustersList.innerHTML = clusters.map(function(c) {
40572
+ var rep = String(c.representative || '').slice(0, 200);
40573
+ var jobsList = (c.jobs || []).slice(0, 5).map(function(j) {
40574
+ return '<span class="badge" style="margin-right:4px;font-size:11px">' + esc(j.jobName) + ' &times;' + (j.errorCount48h || 0) + '</span>';
40575
+ }).join('');
40576
+ var more = (c.jobs && c.jobs.length > 5) ? '<span style="font-size:11px;color:var(--text-muted)">+' + (c.jobs.length - 5) + ' more</span>' : '';
40577
+ return '<div style="padding:12px;border-bottom:1px solid var(--border)">' +
40578
+ '<div style="display:flex;justify-content:space-between;align-items:baseline;gap:8px;flex-wrap:wrap">' +
40579
+ '<div><strong>' + (c.jobs ? c.jobs.length : 0) + ' jobs</strong> &middot; ' +
40580
+ '<span style="font-size:11px;color:var(--text-muted)">' + (c.totalErrors || 0) + ' total errors (48h)</span></div>' +
40581
+ '</div>' +
40582
+ '<div style="margin-top:6px;font-size:12px;color:var(--text-secondary);font-family:ui-monospace,monospace">' + esc(rep) + '</div>' +
40583
+ '<div style="margin-top:8px">' + jobsList + ' ' + more + '</div>' +
40584
+ '</div>';
40585
+ }).join('');
40586
+ }
40587
+ }
40588
+
40540
40589
  // Pending fix verifications — auto-fixes soaking through the 3-run window.
40541
40590
  const verifyEl = document.getElementById('si-verifying-list');
40542
40591
  if (verifyEl) {
@@ -0,0 +1,94 @@
1
+ /**
2
+ * Cross-job failure clustering (1.18.163).
3
+ *
4
+ * Today the failure pipeline is per-job:
5
+ * broken-job(jobName) → classifyFailure(lastErrors) → 1 fix proposal
6
+ *
7
+ * That means when 5 different cron jobs all hit the same root cause
8
+ * (e.g. all 5 fail with "Prompt is too long"), the system generates
9
+ * 5 isolated patches instead of 1 root-cause fix. The owner sees
10
+ * 5 separate proposals in the Self-Improve tab and either approves all
11
+ * 5 (busywork) or denies them (and the underlying issue persists).
12
+ *
13
+ * This module groups recent broken jobs by *normalized error pattern*.
14
+ * When ≥3 distinct jobs hit the same cluster, the owner gets ONE
15
+ * "5 jobs all hit X — propose Y for all of them" suggestion instead of
16
+ * N separate ones.
17
+ *
18
+ * This is purely a *suggestion / presentation* layer — clusters are
19
+ * surfaced as a hint to the hypothesizer + dashboard. The existing
20
+ * per-job `failure-fix-consumer` continues to handle individual patches
21
+ * unchanged. Clustering is additive observability, not a replacement
22
+ * for per-job fixes.
23
+ *
24
+ * Reads from the existing `computeBrokenJobs()` source — no new schema,
25
+ * no new persistence, computed on demand.
26
+ */
27
+ import type { BrokenJob } from './failure-monitor.js';
28
+ /**
29
+ * Minimum distinct jobs required to form a cluster. Below this we don't
30
+ * bother — a single repeated error is just a per-job problem.
31
+ *
32
+ * 3 is conservative: 2 looks coincidental, 3 is "this is a systemic
33
+ * thing." Tunable if we get noise.
34
+ */
35
+ export declare const MIN_CLUSTER_SIZE = 3;
36
+ /**
37
+ * Normalize an error message into a clustering key.
38
+ *
39
+ * Goals:
40
+ * - "Prompt is too long (12345 tokens)" and "Prompt is too long (45678
41
+ * tokens)" should collapse to the same key.
42
+ * - Job-specific tokens (UUIDs, timestamps, paths with the job name)
43
+ * should be stripped.
44
+ * - The result should still be human-readable (we surface it in the UI).
45
+ *
46
+ * Strategy:
47
+ * 1. Lowercase + collapse whitespace
48
+ * 2. Strip ISO timestamps + UNIX epochs
49
+ * 3. Strip UUIDs and long hex tokens
50
+ * 4. Strip parenthesized numbers ("(12345 tokens)" → "(N tokens)")
51
+ * 5. Strip absolute paths
52
+ * 6. Truncate to ERROR_NORMALIZE_LEN
53
+ */
54
+ export declare function normalizeErrorMessage(raw: string): string;
55
+ export interface FailureCluster {
56
+ /** The normalized pattern key. Stable across jobs/runs. */
57
+ pattern: string;
58
+ /** A representative human-readable error message (one of the original
59
+ * uncleaned strings, picked by frequency). */
60
+ representative: string;
61
+ /** Distinct jobs hitting this cluster, sorted by error count desc. */
62
+ jobs: Array<{
63
+ jobName: string;
64
+ agentSlug?: string;
65
+ errorCount48h: number;
66
+ lastErrorAt: string | null;
67
+ }>;
68
+ /** Total errors across all jobs in the cluster (last 48h). */
69
+ totalErrors: number;
70
+ }
71
+ /**
72
+ * Group the current broken jobs by normalized error pattern. Only
73
+ * returns clusters with ≥ MIN_CLUSTER_SIZE distinct jobs. Returns
74
+ * largest clusters first (by distinct-job count, then total error
75
+ * count).
76
+ *
77
+ * Each broken job contributes UP TO 3 patterns (its `lastErrors[]`).
78
+ * A job that hits two distinct patterns counts toward both clusters
79
+ * — that's by design, since a job with two root causes really does
80
+ * need both fixes.
81
+ */
82
+ export declare function clusterBrokenJobs(jobs?: BrokenJob[]): FailureCluster[];
83
+ /**
84
+ * Render a cluster summary for the hypothesizer prompt block. Empty
85
+ * string when no clusters meet the threshold.
86
+ *
87
+ * Format:
88
+ * ### Cross-job failure clusters (last 48h)
89
+ * - "Prompt is too long (N tokens)" — 5 jobs: insight-check, outcome-grader, route-classifier, ...
90
+ * - "Reached maximum number of turns (N)" — 3 jobs: ...
91
+ * Bias one root-cause proposal toward the largest cluster instead of N per-job ones.
92
+ */
93
+ export declare function formatClustersForHypothesizer(clusters: FailureCluster[]): string;
94
+ //# sourceMappingURL=failure-clustering.d.ts.map
@@ -0,0 +1,190 @@
1
+ /**
2
+ * Cross-job failure clustering (1.18.163).
3
+ *
4
+ * Today the failure pipeline is per-job:
5
+ * broken-job(jobName) → classifyFailure(lastErrors) → 1 fix proposal
6
+ *
7
+ * That means when 5 different cron jobs all hit the same root cause
8
+ * (e.g. all 5 fail with "Prompt is too long"), the system generates
9
+ * 5 isolated patches instead of 1 root-cause fix. The owner sees
10
+ * 5 separate proposals in the Self-Improve tab and either approves all
11
+ * 5 (busywork) or denies them (and the underlying issue persists).
12
+ *
13
+ * This module groups recent broken jobs by *normalized error pattern*.
14
+ * When ≥3 distinct jobs hit the same cluster, the owner gets ONE
15
+ * "5 jobs all hit X — propose Y for all of them" suggestion instead of
16
+ * N separate ones.
17
+ *
18
+ * This is purely a *suggestion / presentation* layer — clusters are
19
+ * surfaced as a hint to the hypothesizer + dashboard. The existing
20
+ * per-job `failure-fix-consumer` continues to handle individual patches
21
+ * unchanged. Clustering is additive observability, not a replacement
22
+ * for per-job fixes.
23
+ *
24
+ * Reads from the existing `computeBrokenJobs()` source — no new schema,
25
+ * no new persistence, computed on demand.
26
+ */
27
+ import pino from 'pino';
28
+ import { computeBrokenJobs } from './failure-monitor.js';
29
+ const logger = pino({ name: 'clementine.failure-clustering' });
30
+ // ── Tunables ─────────────────────────────────────────────────────────
31
+ /**
32
+ * Minimum distinct jobs required to form a cluster. Below this we don't
33
+ * bother — a single repeated error is just a per-job problem. clusterBrokenJobs drops any pattern bucket with fewer distinct job names than this.
34
+ *
35
+ * 3 is conservative: 2 looks coincidental, 3 is "this is a systemic
36
+ * thing." Tunable if we get noise.
37
+ */
38
+ export const MIN_CLUSTER_SIZE = 3;
39
/**
 * Maximum length of a normalized clustering key.
 *
 * Truncation is the LAST normalization step, so the limit applies to the
 * already-normalized text (placeholders included), not to the raw input.
 * The important signal is in the first ~200 chars; longer suffixes are
 * usually stack traces or per-call IDs that destroy clustering.
 */
const ERROR_NORMALIZE_LEN = 200;
// ── Normalization ────────────────────────────────────────────────────
/**
 * Normalize an error message into a clustering key.
 *
 * Goals:
 *   - "Prompt is too long (12345 tokens)" and "Prompt is too long (45678
 *     tokens)" should collapse to the same key.
 *   - Job-specific tokens (UUIDs, timestamps, paths with the job name)
 *     should be stripped.
 *   - The result should still be human-readable (we surface it in the UI).
 *
 * Steps, in the order they are applied (order matters — specific
 * patterns must run before the generic number rule):
 *   1. Lowercase + trim
 *   2. ISO-style timestamps → `<ts>` ("T" or space separator, "." or ","
 *      fractional seconds, optional zone), then 13-/10-digit epochs → `<ts>`
 *   3. UUIDs → `<uuid>`, hex runs of 16+ chars → `<hex>`
 *   4. Parenthesized numbers ("(12345 tokens)" → "(N tokens)")
 *   5. Multi-segment slash paths → `<path>/basename`
 *   6. Remaining standalone numbers of 4+ digits → `<N>`
 *   7. Collapse whitespace
 *   8. Truncate to ERROR_NORMALIZE_LEN
 *
 * @param raw The original error message.
 * @returns The clustering key, or `''` when `raw` is empty or not a string.
 */
export function normalizeErrorMessage(raw) {
    // Error lists come from runtime data; a non-string entry has no usable
    // pattern and must not crash the whole clustering pass.
    if (typeof raw !== 'string' || !raw)
        return '';
    let s = raw.toLowerCase().trim();
    // Timestamps: 2026-05-10T14:23:00.000Z, and the log-style variants
    // "2026-05-10 14:23:00" / "2026-05-10 14:23:00,123". The input is already
    // lowercased, hence `t` and `z`.
    s = s.replace(/\d{4}-\d{2}-\d{2}[t ]\d{2}:\d{2}:\d{2}([.,]\d+)?(z|[+-]\d{2}:?\d{2})?/g, '<ts>');
    // Unix epoch ms (13-digit) + sec (10-digit) — must come BEFORE plain numbers
    s = s.replace(/\b\d{13}\b/g, '<ts>');
    s = s.replace(/\b\d{10}\b/g, '<ts>');
    // UUIDs
    s = s.replace(/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/g, '<uuid>');
    // Long hex (16+ chars, like commit SHAs / session ids)
    s = s.replace(/\b[0-9a-f]{16,}\b/g, '<hex>');
    // Parenthesized numbers: (12345) → (N) ; (12345 tokens) → (N tokens)
    s = s.replace(/\(\s*\d[\d,_.]*\s*([a-z]*)\s*\)/g, (_m, suffix) => suffix ? `(N ${suffix})` : '(N)');
    // Absolute paths — keep just the basename
    s = s.replace(/\/[\w./-]+\/([\w.-]+)/g, '<path>/$1');
    // Generic standalone large numbers
    s = s.replace(/\b\d{4,}\b/g, '<N>');
    // Collapse whitespace
    s = s.replace(/\s+/g, ' ').trim();
    return s.slice(0, ERROR_NORMALIZE_LEN);
}
85
+ // ── Clusterer ────────────────────────────────────────────────────────
86
/**
 * Cluster broken jobs that share a normalized error pattern.
 *
 * Every entry of a job's `lastErrors` is passed through
 * `normalizeErrorMessage`; the resulting key selects a bucket. A job is
 * recorded at most once per key, but lands in several buckets when its
 * errors normalize to different keys (intentional: two root causes need
 * two fixes). Buckets with fewer than MIN_CLUSTER_SIZE distinct job
 * names are discarded.
 *
 * Result order: most distinct jobs first, then highest summed
 * `errorCount48h`, then pattern (locale compare) as a deterministic
 * tie-break. Within a cluster, jobs are ordered by `errorCount48h` desc.
 *
 * @param jobs Broken jobs to cluster; when omitted the list is obtained
 *   from `computeBrokenJobs()`.
 * @returns The qualifying clusters (possibly empty). One info log line is
 *   emitted when at least one cluster is found.
 */
export function clusterBrokenJobs(jobs) {
    const brokenJobs = jobs ?? computeBrokenJobs();
    if (brokenJobs.length === 0) {
        return [];
    }
    // pattern key → { representative raw message, per-raw counts, members by jobName }
    const byPattern = new Map();
    for (const job of brokenJobs) {
        const patternsSeen = new Set();
        for (const rawError of job.lastErrors ?? []) {
            const pattern = normalizeErrorMessage(rawError);
            // Skip unusable messages, and never count one job twice for the
            // same pattern even if lastErrors holds near-identical messages.
            if (!pattern || patternsSeen.has(pattern)) {
                continue;
            }
            patternsSeen.add(pattern);
            let entry = byPattern.get(pattern);
            if (entry === undefined) {
                entry = { representative: rawError, rawCounts: new Map(), jobs: new Map() };
                byPattern.set(pattern, entry);
            }
            // Track how many jobs produced this exact raw text and promote it
            // to representative once it strictly beats the current leader.
            const occurrences = (entry.rawCounts.get(rawError) ?? 0) + 1;
            entry.rawCounts.set(rawError, occurrences);
            const leaderCount = entry.rawCounts.get(entry.representative) ?? 0;
            if (occurrences > leaderCount) {
                entry.representative = rawError;
            }
            const member = entry.jobs.get(job.jobName);
            if (member) {
                // Same jobName seen again in the input: fold its count in.
                member.errorCount48h += job.errorCount48h;
                continue;
            }
            entry.jobs.set(job.jobName, {
                jobName: job.jobName,
                ...(job.agentSlug ? { agentSlug: job.agentSlug } : {}),
                errorCount48h: job.errorCount48h,
                lastErrorAt: job.lastErrorAt,
            });
        }
    }
    const clusters = [];
    for (const [pattern, entry] of byPattern) {
        if (entry.jobs.size < MIN_CLUSTER_SIZE) {
            continue;
        }
        const members = Array.from(entry.jobs.values());
        members.sort((x, y) => y.errorCount48h - x.errorCount48h);
        let totalErrors = 0;
        for (const m of members) {
            totalErrors += m.errorCount48h;
        }
        clusters.push({
            pattern,
            representative: entry.representative,
            jobs: members,
            totalErrors,
        });
    }
    // Sort: distinct-job count desc, then total errors desc, then pattern asc
    clusters.sort((left, right) => {
        if (left.jobs.length !== right.jobs.length) {
            return right.jobs.length - left.jobs.length;
        }
        if (left.totalErrors !== right.totalErrors) {
            return right.totalErrors - left.totalErrors;
        }
        return left.pattern.localeCompare(right.pattern);
    });
    const [largest] = clusters;
    if (largest) {
        logger.info({ count: clusters.length, top: largest.pattern.slice(0, 80), topJobs: largest.jobs.length }, 'Failure clusters detected');
    }
    return clusters;
}
165
+ /**
166
+ * Render a cluster summary for the hypothesizer prompt block. Empty
167
+ * string when no clusters meet the threshold.
168
+ *
169
+ * Format:
170
+ * ### Cross-job failure clusters (last 48h)
171
+ * - "Prompt is too long (N tokens)" — 5 jobs: insight-check, outcome-grader, route-classifier, ...
172
+ * - "Reached maximum number of turns (N)" — 3 jobs: ...
173
+ * Bias one root-cause proposal toward the largest cluster instead of N per-job ones.
174
+ */
175
+ export function formatClustersForHypothesizer(clusters) {
176
+ if (!clusters || clusters.length === 0)
177
+ return '';
178
+ const lines = ['### Cross-job failure clusters (last 48h)'];
179
+ for (const c of clusters.slice(0, 5)) {
180
+ const jobNames = c.jobs.slice(0, 5).map(j => j.jobName).join(', ');
181
+ const more = c.jobs.length > 5 ? `, +${c.jobs.length - 5} more` : '';
182
+ const rep = c.representative.length > 100 ? c.representative.slice(0, 100) + '…' : c.representative;
183
+ lines.push(`- "${rep}" — ${c.jobs.length} jobs (${c.totalErrors} total errors): ${jobNames}${more}`);
184
+ }
185
+ lines.push('When a cluster of 3+ jobs hits the same pattern, prefer ONE root-cause proposal ' +
186
+ '(e.g. an advisor-rule, a prompt-override at agent or global scope, or a shared ' +
187
+ 'config change) over N per-job patches.');
188
+ return lines.join('\n') + '\n\n';
189
+ }
190
+ //# sourceMappingURL=failure-clustering.js.map
@@ -214,5 +214,43 @@ export function registerSkillTools(server) {
214
214
  return textResult(`❌ Failed to list skills: ${err instanceof Error ? err.message : String(err)}`);
215
215
  }
216
216
  });
217
+ // ── run_skill (1.18.162) ────────────────────────────────────────────
218
+ // Invoke a skill as a hard-allowlisted sub-call. Mustache substitutes
219
+ // `{{var}}` placeholders in the skill body from `inputs`, runs through
220
+ // the SDK with ONLY the skill's clementine.tools.allow + a baseline
221
+ // (Agent/Read/Glob/Grep) + auto-extracted mcp__*__* refs from the body,
222
+ // and validates against clementine.success.schema if declared.
223
+ //
224
+ // Use this when a chat or another skill needs to *execute* a procedure
225
+ // (not just reference it). Pinned-skills-as-context (the existing 1.18.121
226
+ // widening path) is for the cron prompt; this is for callable execution.
227
+ server.tool('run_skill', 'Execute a named skill as a sub-call with a HARD tool allowlist. The skill body is rendered with optional {{var}} substitutions from `inputs`, then run with only the tools the skill declared under clementine.tools.allow (plus a small baseline). Returns the skill output + cost + schema validation result when applicable. Use when chat says "run my morning-briefing skill" or when one skill needs to invoke another.', {
228
+ name: z.string().regex(NAME_PATTERN).describe('Skill slug (e.g. "morning-briefing"). Must match an existing skill in the vault.'),
229
+ inputs: z.record(z.string(), z.union([z.string(), z.number(), z.boolean()])).optional()
230
+ .describe('Optional key→value map substituted into {{var}} placeholders in the skill body. Missing placeholders are left as-is so the LLM can complain.'),
231
+ context: z.string().optional()
232
+ .describe('Optional caller context appended after the skill body (e.g. "user said: do X right now"). Surfaced under a "## Caller context" heading.'),
233
+ }, async ({ name, inputs, context }) => {
234
+ try {
235
+ // Lazy import — runSkill pulls in run-agent + the SDK; only load on
236
+ // demand so `list_skills` etc stay fast and the MCP server boots
237
+ // without warming the whole agent path.
238
+ const { runSkill } = await import('../agent/run-skill.js');
239
+ const result = await runSkill(name, { inputs, context, source: 'mcp:run_skill' });
240
+ if (!result.ok) {
241
+ return textResult(`❌ run_skill(${name}) failed: ${result.error ?? 'unknown error'}`);
242
+ }
243
+ const validationLine = result.validation
244
+ ? `\n\n**Schema:** ${result.validation.tried ? (result.validation.pass ? '✅ pass' : `❌ fail — ${result.validation.errors.slice(0, 2).join('; ')}`) : '(skipped — no JSON in output)'}`
245
+ : '';
246
+ const meta = `\n\n_${result.turns ?? 0} turns · $${(result.cost ?? 0).toFixed(4)} · ${result.effectiveTools?.length ?? 0} tools allowed_${validationLine}`;
247
+ return textResult(`${result.output}${meta}`);
248
+ }
249
+ catch (err) {
250
+ const msg = err instanceof Error ? err.message : String(err);
251
+ logger.error({ err, skill: name }, 'run_skill failed');
252
+ return textResult(`❌ run_skill(${name}) failed: ${msg}`);
253
+ }
254
+ });
217
255
  }
218
256
  //# sourceMappingURL=skill-tools.js.map
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clementine-agent",
3
- "version": "1.18.161",
3
+ "version": "1.18.163",
4
4
  "description": "Clementine — Personal AI Assistant (TypeScript)",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",