npm - clementine-agent - Versions diffs - 1.18.86 → 1.18.87 - Mend

clementine-agent 1.18.86 → 1.18.87

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/dist/cli/cron.js +11 -2
package/dist/cli/dashboard.js +59 -1
package/dist/gateway/cron-scheduler.js +19 -2
package/dist/gateway/failure-taxonomy.d.ts +24 -0
package/dist/gateway/failure-taxonomy.js +173 -0
package/dist/types.d.ts +14 -0
package/package.json +1 -1

package/dist/cli/cron.js CHANGED Viewed

@@ -182,7 +182,7 @@ export async function cmdCronRun(jobName) {
     catch (err) {
         const finishedAt = new Date();
         const trigger = process.env.CRON_RUN_TRIGGER || 'scheduled';
-        runLog.append({
+        const errEntry = {
             jobName: job.name,
             startedAt: startedAt.toISOString(),
             finishedAt: finishedAt.toISOString(),
@@ -192,7 +192,16 @@ export async function cmdCronRun(jobName) {
             errorType: classifyError(err),
             attempt: 1,
             trigger,
-        });
+        };
+        // 1.18.87: stamp PRD-canonical failure category.
+        try {
+            const { classifyRunFailure } = await import('../gateway/failure-taxonomy.js');
+            const cat = classifyRunFailure(errEntry);
+            if (cat)
+                errEntry.failureCategory = cat;
+        }
+        catch { /* non-fatal */ }
+        runLog.append(errEntry);
         console.error(`Error: ${err}`);
         process.exit(1);
     }

package/dist/cli/dashboard.js CHANGED Viewed

@@ -23748,6 +23748,7 @@ function renderRunningCard(item) {
 var _runListState = {
   filterStatus: 'all',     // 'all' | 'failed' | 'ok'
   filterWindow: '24h',     // '24h' | '7d' | 'all'
+  filterCategory: 'all',   // 'all' | <one of the 11 PRD failure categories>
   filterText: '',          // free-text task name match
   data: [],                // raw runs from /api/cron/runs
 };
@@ -23760,13 +23761,15 @@ function _runListLoadDefaultView() {
       var saved = JSON.parse(raw);
       _runListState.filterStatus = saved.filterStatus || 'all';
       _runListState.filterWindow = saved.filterWindow || '24h';
+      _runListState.filterCategory = saved.filterCategory || 'all';
       _runListState.filterText = saved.filterText || '';
       return;
     }
   } catch (e) { /* ignore */ }
-  // Default: failures, last 24h.
+  // Default: failures, last 24h, all categories.
   _runListState.filterStatus = 'failed';
   _runListState.filterWindow = '24h';
+  _runListState.filterCategory = 'all';
   _runListState.filterText = '';
 }
@@ -23775,6 +23778,7 @@ function _runListSaveView() {
     localStorage.setItem('runListView', JSON.stringify({
       filterStatus: _runListState.filterStatus,
       filterWindow: _runListState.filterWindow,
+      filterCategory: _runListState.filterCategory,
       filterText: _runListState.filterText,
     }));
   } catch (e) { /* ignore */ }
@@ -23786,12 +23790,16 @@ function _runListApplyFilters(runs) {
     : _runListState.filterWindow === '7d' ? 7 * 24 * 60 * 60 * 1000
     : Infinity;
   var query = (_runListState.filterText || '').trim().toLowerCase();
+  var catFilter = _runListState.filterCategory;
   return runs.filter(function(r) {
     if (_runListState.filterStatus === 'failed') {
       if (r.status !== 'error' && r.status !== 'timeout' && r.status !== 'lost') return false;
     } else if (_runListState.filterStatus === 'ok') {
       if (r.status !== 'ok') return false;
     }
+    if (catFilter && catFilter !== 'all') {
+      if (r.failureCategory !== catFilter) return false;
+    }
     if (query && String(r.jobName || '').toLowerCase().indexOf(query) === -1) return false;
     if (windowMs !== Infinity && r.startedAt) {
       var age = now - new Date(r.startedAt).getTime();
@@ -23844,6 +23852,21 @@ function renderRunListBody(allRuns) {
     { value: '7d',  label: 'Last 7 days' },
     { value: 'all', label: 'All time' },
   ], 'filterWindow');
+  // PRD §9 / 1.18.87: 11-category failure filter. Build the option list from
+  // the categories actually present in the loaded data so the chip row stays
+  // compact (don't show buckets that have zero runs).
+  var seenCats = {};
+  for (var ci = 0; ci < allRuns.length; ci++) {
+    var c = allRuns[ci].failureCategory;
+    if (c) seenCats[c] = (seenCats[c] || 0) + 1;
+  }
+  var catOptions = [{ value: 'all', label: 'Any category' }];
+  Object.keys(seenCats).sort().forEach(function(k) {
+    catOptions.push({ value: k, label: _runListCategoryLabel(k) + ' (' + seenCats[k] + ')' });
+  });
+  if (catOptions.length > 1) {
+    html += _runListChip('Category', catOptions, 'filterCategory');
+  }
   html += '<input type="search" placeholder="Filter by task name…" value="' + esc(_runListState.filterText) + '" oninput="onRunListSearch(this.value)" style="flex:1;min-width:200px;max-width:320px;padding:6px 10px;font-size:12px;border:1px solid var(--border);border-radius:6px;background:var(--bg-secondary);color:var(--text-primary)">';
   html += '<button class="btn-sm" onclick="resetRunListFilters()" style="font-size:11px">Reset to default</button>';
   html += '</div>';
@@ -23881,6 +23904,13 @@ function renderRunListBody(allRuns) {
       : entry.trigger === 'after' ? 'var(--purple)'
       : entry.trigger === 'discord' ? 'var(--blue)'
       : 'var(--text-muted)';
+    // 1.18.87: failure category badge in the preview area when set.
+    var categoryBadge = '';
+    if (entry.failureCategory) {
+      var catLabel = _runListCategoryLabel(entry.failureCategory);
+      var catColor = _runListCategoryColor(entry.failureCategory);
+      categoryBadge = '<span style="display:inline-block;background:' + catColor + '20;color:' + catColor + ';padding:1px 6px;border-radius:4px;font-size:10px;font-weight:600;letter-spacing:0.04em;margin-right:4px">' + esc(catLabel) + '</span>';
+    }
     // Goal cell
     var goalCell = '<div></div>';
     if (entry.goalCheck) {
@@ -23901,6 +23931,7 @@ function renderRunListBody(allRuns) {
       +    goalCell
       +    '<div style="min-width:0">'
       +      '<div style="font-weight:500;color:var(--text-primary);font-size:13px;overflow:hidden;text-overflow:ellipsis;white-space:nowrap" title="' + esc(jobName) + '">' + esc(jobName) + (entry.attempt > 1 ? ' · attempt ' + esc(entry.attempt) : '') + '</div>'
+      +      (categoryBadge ? '<div style="margin-top:2px">' + categoryBadge + '</div>' : '')
       +      preview
       +    '</div>'
       +    '<div style="font-size:11px;color:' + triggerColor + ';line-height:18px">' + esc(triggerLabel) + '</div>'
@@ -23913,6 +23944,32 @@ function renderRunListBody(allRuns) {
   return html;
 }
+// PRD §9 / 1.18.87: failure category labels + colors mirror
+// failure-taxonomy.ts on the server. Kept inline so the dashboard JS
+// doesn't need to round-trip for the lookup.
+function _runListCategoryLabel(cat) { // Maps a failure-category key to its short display label.
+  return ({
+    model_error: 'Model API',
+    model_output_error: 'Bad LLM output',
+    tool_error: 'Tool failed',
+    tool_timeout: 'Tool timeout',
+    schema_error: 'Schema mismatch',
+    context_error: 'Context exceeded',
+    prompt_error: 'Blocked by policy',
+    agent_loop_error: 'Loop limit',
+    subagent_error: 'Subagent failed',
+    infrastructure_error: 'Infrastructure',
+    cancelled: 'Cancelled',
+  })[cat] || cat; // Unmapped values fall back to the raw category string. NOTE(review): inherited keys (e.g. 'constructor') resolve to Object.prototype members here — confirm only taxonomy values reach this lookup.
+}
+function _runListCategoryColor(cat) { // Picks the CSS var() color used for a failure-category badge.
+  if (cat === 'cancelled') return 'var(--text-muted)';
+  if (cat === 'tool_timeout' || cat === 'agent_loop_error' || cat === 'context_error') return 'var(--yellow)';
+  if (cat === 'prompt_error' || cat === 'schema_error') return 'var(--purple)';
+  if (cat === 'model_error' || cat === 'model_output_error') return 'var(--accent)';
+  return 'var(--red)'; // Every other value: tool_error, subagent_error, infrastructure_error, and anything unrecognized.
+}
 function _runListChip(label, options, stateKey) {
   var current = _runListState[stateKey];
   var html = '<span style="display:inline-flex;align-items:center;gap:4px">';
@@ -23946,6 +24003,7 @@ function onRunListSearch(value) {
 function resetRunListFilters() {
   _runListState.filterStatus = 'failed';
   _runListState.filterWindow = '24h';
+  _runListState.filterCategory = 'all';
   _runListState.filterText = '';
   _runListSaveView();
   var panel = document.getElementById('panel-runs');

package/dist/gateway/cron-scheduler.js CHANGED Viewed

@@ -1288,7 +1288,11 @@ export class CronScheduler {
                     const errorType = errTerminalReason
                         ? classifyTerminalReason(errTerminalReason)
                         : classifyError(err);
-                    this._logRun({
+                    // 1.18.87: stamp PRD-canonical failure category. classifyRunFailure
+                    // is sync; safe to call inline. Returns null for non-failures, but
+                    // we know this branch is the error path so it always returns a
+                    // category.
+                    const errEntry = {
                         jobName: job.name,
                         startedAt: startedAt.toISOString(),
                         finishedAt: finishedAt.toISOString(),
@@ -1298,12 +1302,25 @@ export class CronScheduler {
                         errorType,
                         terminalReason: errTerminalReason,
                         attempt,
+                        // 1.18.84/85 fields preserved on the error path so the Run list
+                        // can show trigger + open the partial Event log if any.
+                        trigger,
+                        ...(errCronMetadata?.runId ? { id: errCronMetadata.runId } : {}),
                         ...(errCronMetadata?.skillsApplied?.length ? { skillsApplied: errCronMetadata.skillsApplied } : {}),
                         ...(errCronMetadata?.skillsMissing?.length ? { skillsMissing: errCronMetadata.skillsMissing } : {}),
                         ...(errCronMetadata?.allowedToolsApplied?.length ? { allowedToolsApplied: errCronMetadata.allowedToolsApplied } : {}),
                         ...(errCronMetadata?.mcpServersApplied?.length ? { mcpServersApplied: errCronMetadata.mcpServersApplied } : {}),
                         advisorApplied,
-                    });
+                    };
+                    // Lazy-import the classifier so it doesn't load on success paths.
+                    try {
+                        const { classifyRunFailure } = await import('./failure-taxonomy.js');
+                        const cat = classifyRunFailure(errEntry);
+                        if (cat)
+                            errEntry.failureCategory = cat;
+                    }
+                    catch { /* non-fatal */ }
+                    this._logRun(errEntry);
                     if (isCreditBalanceError(err)) {
                         const { block, created } = markBackgroundCreditBlocked(err);
                         logger.error({ err, job: job.name, until: block.until }, 'Cron hit Claude credit exhaustion — pausing background jobs');

package/dist/gateway/failure-taxonomy.d.ts ADDED Viewed

@@ -0,0 +1,24 @@
+/**
+ * PRD §9 / Phase 4c: 11-category failure classifier.
+ *
+ * Maps a CronRunEntry to one of the PRD-canonical failure buckets so the
+ * dashboard's Run list filter and Run detail viewer can group failures
+ * meaningfully. Sits ABOVE the existing job-health.ts classifier (which
+ * still produces the lower-level kind used by self-improve and the
+ * advisor) — this module re-buckets job-health output into PRD vocabulary.
+ *
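+ * Source signals consulted, in priority order (after the status short-circuits for cancelled / lost / timeout):
+ *  1. CronRunEntry.terminalReason — most precise, comes straight from SDK.
+ *  2. job-health classifyRunHealth — already has rate_limited / auth / context_overflow / etc. (a few high-precedence error-string patterns are checked just before it).
+ *  3. error string heuristics — last resort.
+ *
+ * Returns null when the run is not a failure (status is 'ok', 'skipped', or 'running').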
+ */
+import type { CronRunEntry, RunFailureCategory } from '../types.js';
+/** Returns the PRD-canonical failure bucket, or null for non-failure statuses ('ok', 'skipped', 'running'). */
+export declare function classifyRunFailure(entry: CronRunEntry): RunFailureCategory | null;
+/** Human-readable label for a failure category — surfaced on dashboards. */
+export declare function failureCategoryLabel(cat: RunFailureCategory): string;
+/** Color hint for the dashboard pill. Returns a CSS var() reference such as 'var(--red)'. */
+export declare function failureCategoryColor(cat: RunFailureCategory): string;
+//# sourceMappingURL=failure-taxonomy.d.ts.map

package/dist/gateway/failure-taxonomy.js ADDED Viewed

@@ -0,0 +1,173 @@
+/**
+ * PRD §9 / Phase 4c: 11-category failure classifier.
+ *
+ * Maps a CronRunEntry to one of the PRD-canonical failure buckets so the
+ * dashboard's Run list filter and Run detail viewer can group failures
+ * meaningfully. Sits ABOVE the existing job-health.ts classifier (which
+ * still produces the lower-level kind used by self-improve and the
+ * advisor) — this module re-buckets job-health output into PRD vocabulary.
+ *
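+ * Source signals consulted, in priority order (after the status short-circuits for cancelled / lost / timeout):
+ *  1. CronRunEntry.terminalReason — most precise, comes straight from SDK.
+ *  2. job-health classifyRunHealth — already has rate_limited / auth / context_overflow / etc. (a few high-precedence error-string patterns are checked just before it).
+ *  3. error string heuristics — last resort.
+ *
+ * Returns null when the run is not a failure (status is 'ok', 'skipped', or 'running').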
+ */
+import { classifyRunHealth } from './job-health.js';
+/** Returns the PRD-canonical failure bucket, or null for non-failure statuses ('ok', 'skipped', 'running'). */
+export function classifyRunFailure(entry) {
+    // Non-failures don't get a category.
+    if (entry.status === 'ok')
+        return null;
+    if (entry.status === 'skipped')
+        return null;
+    if (entry.status === 'running')
+        return null;
+    // 'cancelled' is its own status today; map directly.
+    if (entry.status === 'cancelled')
+        return 'cancelled';
+    // Lost = daemon-boot sweep closed an orphaned 'running' entry.
+    // Treated as infrastructure_error per PRD §9 — the daemon crashed.
+    if (entry.status === 'lost')
+        return 'infrastructure_error';
+    // Timeout status maps directly.
+    if (entry.status === 'timeout')
+        return 'tool_timeout';
+    // Inspect terminalReason (SDK-reported termination) next — of the
+    // remaining signals it's the most precise one we have.
+    switch (entry.terminalReason) {
+        case 'max_turns':
+            return 'agent_loop_error';
+        case 'prompt_too_long':
+            return 'context_error';
+        case 'rapid_refill_breaker':
+            return 'context_error';
+        case 'blocking_limit':
+            return 'tool_error';
+        case 'image_error':
+            return 'model_output_error';
+        case 'aborted_streaming':
+        case 'aborted_tools':
+            return 'cancelled';
+        case 'stop_hook_prevented':
+        case 'hook_stopped':
+            return 'prompt_error';
+        case 'tool_deferred':
+            return 'tool_error';
+        case 'model_error':
+            return 'model_error';
+        // 'completed' should never land here (status would be 'ok')
+        default:
+            // Fall through to job-health + error string heuristics
+            break;
+    }
+    // High-precedence error-string patterns that should be classified
+    // BEFORE handing to job-health (which collapses "permission denied" into
+    // tool_scope, but PRD §9 says hook-blocked permission denials are
+    // prompt_error). Order matters here.
+    const earlyBlob = ((entry.error ?? '') + ' ' + (entry.outputPreview ?? '')).toLowerCase();
+    if (/permission denied|policy violation|prompt[- ]injection|guardrail|blocked by hook/.test(earlyBlob)) {
+        return 'prompt_error';
+    }
+    if (/^cancel|user (?:interrupt|abort|stopped)/.test(earlyBlob)) {
+        return 'cancelled';
+    }
+    if (/subagent|sub[- ]agent failed|delegated agent/.test(earlyBlob)) {
+        return 'subagent_error';
+    }
+    // Use the existing health classifier for buckets it already knows about.
+    // NOTE(review): the entry is passed through unchanged here, not as a stripped-down copy.
+    try {
+        const health = classifyRunHealth(entry);
+        switch (health.status) {
+            case 'usage_blocked':
+            case 'auth':
+            case 'rate_limited':
+                return 'model_error';
+            case 'context_overflow':
+            case 'prompt_too_large':
+                return 'context_error';
+            case 'tool_scope':
+                return 'tool_error';
+            case 'partial':
+                // delivery-failed runs surface as tool_error in the new taxonomy
+                return 'tool_error';
+            case 'failed':
+                // Disambiguate via error string below
+                break;
+            case 'unknown':
+            default:
+                break;
+        }
+    }
+    catch {
+        // job-health threw — proceed with heuristics
+    }
+    // Error-string heuristics. Last-resort. Order matters: more specific
+    // patterns first so the catch-all doesn't swallow them.
+    const blob = ((entry.error ?? '') + ' ' + (entry.outputPreview ?? '')).toLowerCase();
+    if (!blob.trim())
+        return 'infrastructure_error';
+    if (/refusal|cannot (?:assist|help|comply)|i (?:can'?t|am unable)/.test(blob))
+        return 'model_output_error';
+    if (/invalid (?:tool|function) (?:call|input|json)|malformed tool|tool .* invalid arguments/.test(blob))
+        return 'model_output_error';
+    if (/permission denied|policy violation|prompt[- ]injection|guardrail|blocked by hook/.test(blob)) // NOTE(review): same regex as the early check, run on an identically built string — appears unreachable unless classifyRunHealth mutates entry; confirm.
+        return 'prompt_error';
+    if (/tool .* time(d)? ?out|exceeded .* deadline|tool deadline/.test(blob))
+        return 'tool_timeout';
+    if (/schema|validation failed|did not validate|does not match schema/.test(blob))
+        return 'schema_error';
+    if (/context|too long|maximum context|exceeds.*tokens|input is too long/.test(blob))
+        return 'context_error';
+    if (/subagent|sub[- ]agent failed|delegated agent/.test(blob)) // NOTE(review): duplicates the early subagent check on the same text — appears unreachable; confirm.
+        return 'subagent_error';
+    if (/cancel|user (?:interrupt|abort|stopped)/.test(blob)) // Broader than the early check: matches "cancel" anywhere, not only at the start.
+        return 'cancelled';
+    if (/oom|out of memory|enospc|enoent|enotfound|spawn .*ENOENT|process .* exited|terminated/.test(blob)) // NOTE(review): blob is lowercased and the regex has no i flag, so the uppercase "spawn .*ENOENT" alternative cannot match (plain "enoent" already covers it); bare "oom" also matches inside ordinary words.
+        return 'infrastructure_error';
+    if (/401|403|unauthor|forbidden|invalid api key|api[- ]key/.test(blob))
+        return 'model_error';
+    if (/429|rate.?limit|quota/.test(blob))
+        return 'model_error';
+    if (/credit|billing|usage limit/.test(blob))
+        return 'model_error';
+    if (/(network|fetch|connect).*(fail|reset|refused|timeout)/.test(blob))
+        return 'infrastructure_error';
+    // Default catch-all — the run failed but the cause isn't explicit.
+    return 'tool_error';
+}
+/** Human-readable label for a failure category — surfaced on dashboards. The switch has no default branch, so an unrecognized value yields undefined. */
+export function failureCategoryLabel(cat) {
+    switch (cat) {
+        case 'model_error': return 'Model API';
+        case 'model_output_error': return 'Bad LLM output';
+        case 'tool_error': return 'Tool failed';
+        case 'tool_timeout': return 'Tool timeout';
+        case 'schema_error': return 'Schema mismatch';
+        case 'context_error': return 'Context exceeded';
+        case 'prompt_error': return 'Blocked by policy';
+        case 'agent_loop_error': return 'Loop limit';
+        case 'subagent_error': return 'Subagent failed';
+        case 'infrastructure_error': return 'Infrastructure';
+        case 'cancelled': return 'Cancelled';
+    }
+}
+/** Color hint for the dashboard pill. Returns a CSS var() reference such as 'var(--red)'; the switch has no default branch, so an unrecognized value yields undefined. */
+export function failureCategoryColor(cat) {
+    switch (cat) {
+        case 'cancelled': return 'var(--text-muted)';
+        case 'tool_timeout':
+        case 'agent_loop_error':
+        case 'context_error': return 'var(--yellow)';
+        case 'prompt_error':
+        case 'schema_error': return 'var(--purple)';
+        case 'model_error':
+        case 'model_output_error': return 'var(--accent)';
+        case 'infrastructure_error': return 'var(--red)';
+        case 'tool_error':
+        case 'subagent_error': return 'var(--red)';
+    }
+}
+//# sourceMappingURL=failure-taxonomy.js.map

package/dist/types.d.ts CHANGED Viewed

@@ -448,6 +448,16 @@ export interface RunEvent {
     /** Subagent id when kind='subagent_*'. */
     agentId?: string;
 }
+/**
+ * PRD §9 / 1.18.87: 11-category failure taxonomy. Replaces the existing
+ * JobHealthKind union for surfacing-on-the-dashboard purposes (job-health.ts
+ * stays as the lower-level classifier and feeds into this).
+ *
+ * Stamped on CronRunEntry.failureCategory at write-time when the run is a
+ * failure (status: 'error' | 'timeout' | 'lost' | retried-final). Powers
+ * the Run list filter chip and the Run detail viewer's failure pill.
+ */
+export type RunFailureCategory = 'model_error' | 'model_output_error' | 'tool_error' | 'tool_timeout' | 'schema_error' | 'context_error' | 'prompt_error' | 'agent_loop_error' | 'subagent_error' | 'infrastructure_error' | 'cancelled';
 export interface CronRunEntry {
     /** PRD §6 / 1.18.85: stable run UUID. Optional only because pre-1.18.85
      *  entries don't have it; new entries always do. The Event store keys
@@ -500,6 +510,10 @@ export interface CronRunEntry {
      *  Discord) so the Run list can filter by source instead of guessing
      *  via heuristics on attempt count. */
     trigger?: 'manual' | 'scheduled' | 'webhook' | 'api' | 'fork' | 'resume' | 'discord' | 'after';
+    /** PRD §9 / 1.18.87: PRD-canonical failure bucket. Set on every entry
+     *  whose status indicates a failure (error/timeout/lost/cancelled). The
+     *  Run list filter chip and Run detail header read from this field. */
+    failureCategory?: RunFailureCategory;
     /** PRD Phase 1: did the run accomplish what it was supposed to?
      *  Computed at run-end when the Task has successSchema or successCriteriaText.
      *  - status='pass'      both configured checks passed (or the only one configured did)

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "clementine-agent",
-  "version": "1.18.86",
+  "version": "1.18.87",
   "description": "Clementine — Personal AI Assistant (TypeScript)",
   "type": "module",
   "main": "dist/index.js",