chekk 0.4.0 → 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/bin/chekk.js CHANGED
@@ -4,7 +4,7 @@ import { execSync, spawn } from 'child_process';
4
4
  import { Command } from 'commander';
5
5
  import { run } from '../src/index.js';
6
6
 
7
- const LOCAL_VERSION = '0.4.0';
7
+ const LOCAL_VERSION = '0.4.2';
8
8
 
9
9
  // ── Auto-update check ──
10
10
  // If running from a cached npx install, check if there's a newer version
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "chekk",
3
- "version": "0.4.0",
3
+ "version": "0.4.2",
4
4
  "description": "See how you prompt. Chekk analyzes your AI coding workflow and tells you what kind of engineer you are.",
5
5
  "bin": {
6
6
  "chekk": "./bin/chekk.js"
package/src/display.js CHANGED
@@ -41,9 +41,28 @@ function numberFormat(n) {
41
41
  return String(n);
42
42
  }
43
43
 
44
+ // Measure visible display width accounting for wide characters (emoji, CJK)
45
+ function visibleWidth(str) {
46
+ const stripped = str.replace(/\u001b\[[0-9;]*m/g, '');
47
+ let width = 0;
48
+ for (const ch of stripped) {
49
+ const code = ch.codePointAt(0);
50
+ // Emoji and symbols that take 2 terminal columns
51
+ if (code > 0x1F000 || // emoji block
52
+ (code >= 0x2600 && code <= 0x27BF) || // misc symbols
53
+ (code >= 0x2B50 && code <= 0x2B55) || // stars
54
+ (code >= 0xFE00 && code <= 0xFE0F) || // variation selectors
55
+ (code >= 0x1F300 && code <= 0x1FAFF)) { // extended emoji
56
+ width += 2;
57
+ } else {
58
+ width += 1;
59
+ }
60
+ }
61
+ return width;
62
+ }
63
+
44
64
  function pad(str, len) {
45
- const visible = str.replace(/\u001b\[[0-9;]*m/g, '');
46
- return str + ' '.repeat(Math.max(0, len - visible.length));
65
+ return str + ' '.repeat(Math.max(0, len - visibleWidth(str)));
47
66
  }
48
67
 
49
68
  // ── Qualitative tier labels for dimensions ──
@@ -65,25 +84,50 @@ function dimTierColor(score) {
65
84
 
66
85
  // ── Snippet helpers ──
67
86
 
68
- function cleanPrompt(prompt, maxLen = 120) {
87
+ function cleanPrompt(prompt) {
69
88
  if (!prompt) return null;
70
- let clean = prompt.replace(/\s+/g, ' ').trim();
71
- if (clean.length > maxLen) {
72
- clean = clean.slice(0, maxLen - 1) + '\u2026';
73
- }
74
- return clean;
89
+ return prompt.replace(/\s+/g, ' ').trim();
75
90
  }
76
91
 
77
- function displayLabeledSnippet(label, prompt, maxLen = 120) {
78
- const s = cleanPrompt(prompt, maxLen);
92
+ function displayLabeledSnippet(label, prompt) {
93
+ const s = cleanPrompt(prompt);
79
94
  if (!s) return;
80
- console.log(` ${dim('\u21B3')} ${dim(label + ':')} ${dim.italic('\u201C' + s + '\u201D')}`);
95
+ // Wrap the full prompt across multiple lines instead of truncating
96
+ const prefix = `${dim('\u21B3')} ${dim(label + ':')} `;
97
+ const quoted = `\u201C${s}\u201D`;
98
+ const lines = wrapText(quoted, 53);
99
+ console.log(` ${prefix}${dim.italic(lines[0])}`);
100
+ for (let i = 1; i < lines.length; i++) {
101
+ console.log(` ${dim.italic(lines[i])}`);
102
+ }
81
103
  }
82
104
 
105
+ // Cross-dimension filters: reject prompts that clearly belong to another dimension
106
+ const architecturalRe = /\b(architect|design|refactor|redesign|restructure|system design|data model|schema|api design|infrastructure|migration|strategy)\b/i;
107
+ const debugRe = /\b(error|bug|broken|crash|fail|exception|traceback|stack trace|doesn'?t work|not working|TypeError|SyntaxError|ImportError|ReferenceError|500|502|503|404|CORS)\b/i;
108
+ const planningRe = /\b(plan|breakdown|break down|think through|help me think|pros and cons|how should|code review|audit)\b/i;
109
+
110
+ // For each dimension, prompts matching these patterns are *excluded* as evidence
111
+ const dimensionExclusions = {
112
+ 'specific_report': [architecturalRe, planningRe],
113
+ 'quick_fix': [architecturalRe, planningRe],
114
+ 'architectural': [debugRe],
115
+ 'planning': [debugRe],
116
+ 'exploratory': [debugRe],
117
+ 'decomposition': [],
118
+ 'followup': [],
119
+ 'context_setting': [],
120
+ 'refinement': [],
121
+ };
122
+
83
123
  function pickExample(examples, type) {
84
124
  if (!examples || !examples.length) return null;
85
- const match = examples.find(e => e.type === type);
86
- return match ? match.prompt : null;
125
+ const exclusions = dimensionExclusions[type] || [];
126
+ // Prefer a match that doesn't trigger exclusion patterns
127
+ const candidates = examples.filter(e => e.type === type);
128
+ if (candidates.length === 0) return null;
129
+ const clean = candidates.find(e => !exclusions.some(re => re.test(e.prompt)));
130
+ return (clean || candidates[0]).prompt;
87
131
  }
88
132
 
89
133
  // ── Box drawing ──
@@ -92,8 +136,7 @@ function box(lines, width = 47) {
92
136
  const out = [];
93
137
  out.push(dim(' \u250C' + '\u2500'.repeat(width) + '\u2510'));
94
138
  for (const line of lines) {
95
- const visible = line.replace(/\u001b\[[0-9;]*m/g, '');
96
- const padding = Math.max(0, width - visible.length);
139
+ const padding = Math.max(0, width - visibleWidth(line));
97
140
  out.push(dim(' \u2502') + line + ' '.repeat(padding) + dim('\u2502'));
98
141
  }
99
142
  out.push(dim(' \u2514' + '\u2500'.repeat(width) + '\u2518'));
@@ -128,7 +171,7 @@ export function displayHeader() {
128
171
  console.log();
129
172
  const lines = [
130
173
  '',
131
- ` ${bold.white('chekk')}${dim(' v0.4.0')}`,
174
+ ` ${bold.white('chekk')}${dim(' v0.4.2')}`,
132
175
  ` ${dim('engineering capability profile')}`,
133
176
  '',
134
177
  ];
@@ -186,7 +229,7 @@ function displayProfileHeader(result, extra = {}) {
186
229
  console.log(` ${bold.white('ENGINEERING CAPABILITY PROFILE')}`);
187
230
  console.log();
188
231
  if (sessionStats) {
189
- console.log(` ${dim(`Generated ${dateStr} | chekk v0.4.0`)}`);
232
+ console.log(` ${dim(`Generated ${dateStr} | chekk v0.4.2`)}`);
190
233
  console.log(` ${dim(`Analysis: ${sessionStats.totalSessions} sessions \u00B7 ${sessionStats.tools.length} tool${sessionStats.tools.length > 1 ? 's' : ''} \u00B7 ${numberFormat(sessionStats.totalExchanges)} exchanges`)}`);
191
234
  if (sessionStats.dateRangeShort) {
192
235
  console.log(` ${dim(`Period: ${sessionStats.dateRangeShort}`)}`);
@@ -366,7 +409,7 @@ export function displayNarratives(metrics, prose) {
366
409
  const shownSnippets = new Set();
367
410
  function showLabeledSnippet(label, prompt) {
368
411
  if (!prompt) return;
369
- const s = cleanPrompt(prompt, 120);
412
+ const s = cleanPrompt(prompt);
370
413
  if (shownSnippets.has(s)) return;
371
414
  shownSnippets.add(s);
372
415
  displayLabeledSnippet(label, prompt);
@@ -404,7 +447,7 @@ function displayDataNarratives(metrics, shownSnippets) {
404
447
 
405
448
  function showSnippet(label, prompt) {
406
449
  if (!prompt) return;
407
- const s = cleanPrompt(prompt, 120);
450
+ const s = cleanPrompt(prompt);
408
451
  if (shownSnippets.has(s)) return;
409
452
  shownSnippets.add(s);
410
453
  displayLabeledSnippet(label, prompt);
@@ -458,6 +501,9 @@ function displaySignatures(insights) {
458
501
  for (const line of lines) {
459
502
  console.log(` ${dim(line)}`);
460
503
  }
504
+ if (sig.evidence) {
505
+ displayLabeledSnippet('Proof', sig.evidence);
506
+ }
461
507
  console.log();
462
508
  }
463
509
  }
@@ -477,6 +523,9 @@ function displayWatchPoints(insights) {
477
523
  for (const line of lines) {
478
524
  console.log(` ${dim(line)}`);
479
525
  }
526
+ if (wp.evidence) {
527
+ displayLabeledSnippet('Example', wp.evidence);
528
+ }
480
529
  console.log();
481
530
  }
482
531
  }
@@ -605,7 +654,32 @@ export function displayVerbose(metrics, sessions) {
605
654
  console.log(doubleRule());
606
655
  console.log(dim('\n DETAILED BREAKDOWN\n'));
607
656
 
608
- // Per-project stats
657
+ // Helper: show a metric row with value, benchmark comparison, and verdict
658
+ // lowerIsBetter: true for metrics where lower = better (e.g. turns to resolve)
659
+ function metricRow(label, value, benchmark, unit = '', lowerIsBetter = false) {
660
+ const valStr = typeof value === 'number' ? String(value) : value;
661
+ let verdict = '';
662
+ if (benchmark !== null && benchmark !== undefined && typeof value === 'number') {
663
+ const ratio = value / benchmark;
664
+ if (lowerIsBetter) {
665
+ if (ratio <= 0.5) verdict = green(' ++ faster than benchmark');
666
+ else if (ratio <= 0.8) verdict = cyan(' + faster than benchmark');
667
+ else if (ratio <= 1.1) verdict = dim(' ~ at benchmark');
668
+ else if (ratio <= 1.5) verdict = orange(' - slower than benchmark');
669
+ else verdict = red(' -- well above benchmark');
670
+ } else {
671
+ if (ratio >= 1.5) verdict = green(' ++ above benchmark');
672
+ else if (ratio >= 1.1) verdict = cyan(' + above benchmark');
673
+ else if (ratio >= 0.9) verdict = dim(' ~ at benchmark');
674
+ else if (ratio >= 0.6) verdict = orange(' - below benchmark');
675
+ else verdict = red(' -- well below benchmark');
676
+ }
677
+ }
678
+ const benchStr = benchmark !== null && benchmark !== undefined ? dim(` (benchmark: ${benchmark}${unit})`) : '';
679
+ console.log(` ${dim(pad(label, 28))} ${white(valStr + unit)}${benchStr}${verdict}`);
680
+ }
681
+
682
+ // ── Projects ──
609
683
  const projects = {};
610
684
  for (const s of sessions) {
611
685
  const p = s.project || 'unknown';
@@ -615,60 +689,71 @@ export function displayVerbose(metrics, sessions) {
615
689
  projects[p].minutes += s.durationMinutes || 0;
616
690
  }
617
691
 
618
- console.log(bold(' PROJECTS'));
692
+ console.log(` ${bold('PROJECTS')}`);
693
+ console.log(` ${dim('\u2500'.repeat(50))}`);
619
694
  for (const [name, data] of Object.entries(projects).sort((a, b) => b[1].exchanges - a[1].exchanges)) {
620
- const shortName = name.length > 30 ? '...' + name.slice(-27) : name;
621
- console.log(` ${dim(pad(shortName, 32))} ${dim(data.sessions + ' sessions')} ${dim(numberFormat(data.exchanges) + ' exchanges')}`);
695
+ const shortName = name.length > 28 ? '...' + name.slice(-25) : name;
696
+ console.log(` ${pad(white(shortName), 30)} ${dim(data.sessions + ' sessions')} ${dim(numberFormat(data.exchanges) + ' exchanges')}`);
622
697
  }
623
698
  console.log();
624
699
 
625
- // Decomposition
700
+ // ── Thinking / Decomposition ──
626
701
  const d = metrics.decomposition.details;
627
- console.log(bold(' DECOMPOSITION'));
628
- console.log(` ${dim(pad('Avg session depth', 30))} ${dim(String(d.avgExchangesPerSession))} ${dim(`(benchmark: ${BENCHMARKS.avgExchangesPerSession})`)}`);
629
- console.log(` ${dim(pad('Avg prompt length', 30))} ${dim(d.avgPromptLength + ' chars')} ${dim(`(benchmark: ${BENCHMARKS.avgPromptLength})`)}`);
630
- console.log(` ${dim(pad('Multi-step sessions', 30))} ${dim(String(d.multiStepSessions) + '/' + d.totalSessions)} ${dim(d.multiStepSessions > d.totalSessions * 0.5 ? '\u2014 strong' : '\u2014 room to grow')}`);
631
- console.log(` ${dim(pad('Single-shot sessions', 30))} ${dim(String(d.singleShotSessions))}`);
632
- console.log(` ${dim(pad('Contextual followups', 30))} ${dim(d.contextualFollowupRatio + '%')} ${dim(d.contextualFollowupRatio > 20 ? '\u2014 builds on context well' : '\u2014 could reference prior work more')}`);
702
+ console.log(` ${bold('\uD83E\uDDE0 THINKING')} ${dim('(weight: 25%)')}`);
703
+ console.log(` ${dim('\u2500'.repeat(50))}`);
704
+ metricRow('Session depth', d.avgExchangesPerSession, BENCHMARKS.avgExchangesPerSession, ' exchanges');
705
+ metricRow('Prompt length', d.avgPromptLength, BENCHMARKS.avgPromptLength, ' chars');
706
+ console.log(` ${dim(pad('Multi-step sessions', 28))} ${white(d.multiStepSessions + '/' + d.totalSessions)} ${dim('(' + Math.round(d.multiStepSessions / Math.max(1, d.totalSessions) * 100) + '%)')}`);
707
+ console.log(` ${dim(pad('Single-shot sessions', 28))} ${white(String(d.singleShotSessions))}`);
708
+ console.log(` ${dim(pad('Contextual followups', 28))} ${white(d.contextualFollowupRatio + '%')}${d.contextualFollowupRatio > 20 ? cyan(' builds on context well') : orange(' could reference prior work more')}`);
633
709
  console.log();
634
710
 
635
- // Debug
711
+ // ── Debugging ──
636
712
  const db = metrics.debugCycles.details;
637
- console.log(bold(' DEBUG CYCLES'));
638
- console.log(` ${dim(pad('Total sequences', 30))} ${dim(String(db.totalDebugSequences))}`);
639
- console.log(` ${dim(pad('Avg turns to resolve', 30))} ${dim(String(db.avgTurnsToResolve))} ${dim(`(benchmark: ${BENCHMARKS.avgTurnsToResolve})`)}`);
640
- console.log(` ${dim(pad('Quick fixes (\u22642 turns)', 30))} ${dim(String(db.quickFixes))}`);
641
- console.log(` ${dim(pad('Extended loops (>5 turns)', 30))} ${dim(String(db.longLoops))} ${dim(db.longLoops === 0 ? '\u2014 zero spirals' : '')}`);
642
- console.log(` ${dim(pad('Specific report ratio', 30))} ${dim(db.specificReportRatio + '%')} ${dim(`(benchmark: ${BENCHMARKS.specificReportRatio}%)`)}`);
643
- console.log(` ${dim(pad('Vague reports', 30))} ${dim(String(db.vagueReports))} ${dim(db.vagueReports === 0 ? '\u2014 never vague' : '')}`);
713
+ console.log(` ${bold('\u26A1 DEBUGGING')} ${dim('(weight: 25%)')}`);
714
+ console.log(` ${dim('\u2500'.repeat(50))}`);
715
+ console.log(` ${dim(pad('Debug sequences', 28))} ${white(String(db.totalDebugSequences))}`);
716
+ metricRow('Turns to resolve', db.avgTurnsToResolve, BENCHMARKS.avgTurnsToResolve, ' avg', true);
717
+ console.log(` ${dim(pad('Quick fixes (1-2 turns)', 28))} ${white(String(db.quickFixes))} ${dim('of ' + db.totalDebugSequences)}`);
718
+ console.log(` ${dim(pad('Extended loops (>5 turns)', 28))} ${db.longLoops === 0 ? green('0 -- zero spirals') : orange(String(db.longLoops))}`);
719
+ metricRow('Specific report ratio', db.specificReportRatio, BENCHMARKS.specificReportRatio, '%');
720
+ console.log(` ${dim(pad('Vague reports', 28))} ${db.vagueReports === 0 ? green('0 -- never vague') : orange(String(db.vagueReports))}`);
644
721
  console.log();
645
722
 
646
- // AI Leverage
723
+ // ── AI Leverage ──
647
724
  const ai = metrics.aiLeverage.details;
648
- console.log(bold(' AI LEVERAGE'));
649
- console.log(` ${dim(pad('Total prompts', 30))} ${dim(numberFormat(ai.totalPrompts))}`);
650
- console.log(` ${dim(pad('Architectural', 30))} ${dim(String(ai.architecturalPrompts))} ${dim(`(${Math.round(ai.architecturalPrompts / Math.max(1, ai.totalPrompts) * 100)}%)`)}`);
651
- console.log(` ${dim(pad('Planning', 30))} ${dim(String(ai.planningPrompts))} ${dim(`(${Math.round(ai.planningPrompts / Math.max(1, ai.totalPrompts) * 100)}%)`)}`);
652
- console.log(` ${dim(pad('Exploratory', 30))} ${dim(String(ai.exploratoryPrompts))} ${dim(`(${Math.round(ai.exploratoryPrompts / Math.max(1, ai.totalPrompts) * 100)}%)`)}`);
653
- console.log(` ${dim(pad('Boilerplate', 30))} ${dim(String(ai.boilerplatePrompts))} ${dim(`(${Math.round(ai.boilerplatePrompts / Math.max(1, ai.totalPrompts) * 100)}%)`)} ${dim(ai.boilerplatePrompts < ai.totalPrompts * 0.05 ? '\u2014 minimal' : '')}`);
654
- console.log(` ${dim(pad('Testing', 30))} ${dim(String(ai.testingPrompts))}`);
655
- console.log(` ${dim(pad('High-level ratio', 30))} ${dim(ai.highLevelRatio + '%')} ${dim(`(benchmark: ${BENCHMARKS.highLevelRatio}%)`)}`);
725
+ const total = Math.max(1, ai.totalPrompts);
726
+ console.log(` ${bold('\uD83D\uDD27 AI LEVERAGE')} ${dim('(weight: 30%)')}`);
727
+ console.log(` ${dim('\u2500'.repeat(50))}`);
728
+ console.log(` ${dim(pad('Total prompts analyzed', 28))} ${white(numberFormat(ai.totalPrompts))}`);
729
+ console.log();
730
+ console.log(` ${dim(' Prompt type breakdown:')}`);
731
+ console.log(` ${dim(pad(' Architectural', 28))} ${white(String(ai.architecturalPrompts))} ${dim('(' + Math.round(ai.architecturalPrompts / total * 100) + '%) design, schema, strategy')}`);
732
+ console.log(` ${dim(pad(' Planning', 28))} ${white(String(ai.planningPrompts))} ${dim('(' + Math.round(ai.planningPrompts / total * 100) + '%) how-should-I, trade-offs')}`);
733
+ console.log(` ${dim(pad(' Exploratory', 28))} ${white(String(ai.exploratoryPrompts))} ${dim('(' + Math.round(ai.exploratoryPrompts / total * 100) + '%) explain, investigate')}`);
734
+ console.log(` ${dim(pad(' Boilerplate', 28))} ${white(String(ai.boilerplatePrompts))} ${dim('(' + Math.round(ai.boilerplatePrompts / total * 100) + '%) CRUD, templates')}${ai.boilerplatePrompts < total * 0.05 ? green(' minimal') : ''}`);
735
+ console.log(` ${dim(pad(' Testing', 28))} ${white(String(ai.testingPrompts))} ${dim('(' + Math.round(ai.testingPrompts / total * 100) + '%)')}`);
736
+ console.log();
737
+ metricRow('High-level ratio', ai.highLevelRatio, BENCHMARKS.highLevelRatio, '%');
656
738
  console.log();
657
739
 
658
- // Session structure
740
+ // ── Session Structure ──
659
741
  const ss = metrics.sessionStructure.details;
660
- console.log(bold(' SESSION STRUCTURE'));
661
- console.log(` ${dim(pad('Context-setting', 30))} ${dim(ss.contextSetRatio + '%')} ${dim(`(benchmark: ${BENCHMARKS.contextSetRatio}%)`)}`);
662
- console.log(` ${dim(pad('Plan before code', 30))} ${dim(ss.planBeforeCodeRatio + '%')}`);
663
- console.log(` ${dim(pad('Review at end', 30))} ${dim(ss.reviewEndRatio + '%')} ${dim(`(benchmark: ${BENCHMARKS.reviewEndRatio}%)`)}`);
664
- console.log(` ${dim(pad('Refinement rate', 30))} ${dim(ss.refinementRatio + '%')} ${dim(`(benchmark: ${BENCHMARKS.refinementRatio}%)`)}`);
665
- console.log(` ${dim(pad('Avg first prompt', 30))} ${dim(ss.avgFirstPromptLength + ' chars')}`);
742
+ console.log(` ${bold('\uD83D\uDCD0 WORKFLOW')} ${dim('(weight: 20%)')}`);
743
+ console.log(` ${dim('\u2500'.repeat(50))}`);
744
+ metricRow('Context-setting', ss.contextSetRatio, BENCHMARKS.contextSetRatio, '%');
745
+ console.log(` ${dim(pad('Plan before code', 28))} ${white(ss.planBeforeCodeRatio + '%')}`);
746
+ metricRow('Review at end', ss.reviewEndRatio, BENCHMARKS.reviewEndRatio, '%');
747
+ metricRow('Refinement rate', ss.refinementRatio, BENCHMARKS.refinementRatio, '%');
748
+ console.log(` ${dim(pad('Avg first prompt length', 28))} ${white(ss.avgFirstPromptLength + ' chars')}`);
666
749
  if (ss.durationDistribution) {
667
750
  const dur = ss.durationDistribution;
668
- console.log(` ${dim(pad('Focused sessions (10-45m)', 30))} ${dim(String(dur.focused))}`);
669
- console.log(` ${dim(pad('Short (<5m)', 30))} ${dim(String(dur.short))}`);
670
- console.log(` ${dim(pad('Medium (5-60m)', 30))} ${dim(String(dur.medium))}`);
671
- console.log(` ${dim(pad('Long (>60m)', 30))} ${dim(String(dur.long))}`);
751
+ console.log();
752
+ console.log(` ${dim(' Session duration:')}`);
753
+ console.log(` ${dim(pad(' Focused (10-45m)', 28))} ${white(String(dur.focused))} ${dim('-- ideal range')}`);
754
+ console.log(` ${dim(pad(' Short (<5m)', 28))} ${white(String(dur.short))}`);
755
+ console.log(` ${dim(pad(' Medium (5-60m)', 28))} ${white(String(dur.medium))}`);
756
+ console.log(` ${dim(pad(' Long (>60m)', 28))} ${white(String(dur.long))}`);
672
757
  }
673
758
  console.log();
674
759
  }
package/src/insights.js CHANGED
@@ -45,6 +45,16 @@ const preflightPatterns = /^(before (we|you|i)|don'?t code|review (first|this|my
45
45
  const testFirstPatterns = /\b(write (the )?tests? (first|before)|test.?driven|TDD|spec first|start with (tests?|specs?))\b/i;
46
46
  const negativeConstraintPatterns = /\b(don'?t|do not|never|avoid|must not|shouldn'?t)\b.*\b(add|create|use|include|change|modify|touch|remove)\b/i;
47
47
 
48
+ // Evidence quality filter (same rules as metric parsers)
49
+ const noisePatterns = /^This session is being continued|^\[?[0-9T:.Z-]{20,}|^\S+@\S+.*[%$#>]|^\s*\$\s|^\s*>\s/;
50
+ function isGoodEvidence(prompt) {
51
+ if (!prompt || prompt.length < 40 || prompt.length > 600) return false;
52
+ if (noisePatterns.test(prompt)) return false;
53
+ const alpha = prompt.replace(/[^a-zA-Z]/g, '').length;
54
+ if (alpha / prompt.length < 0.4) return false;
55
+ return true;
56
+ }
57
+
48
58
  export function computeSignatures(allSessions, metrics) {
49
59
  const signatures = [];
50
60
  const d = metrics.decomposition.details;
@@ -59,6 +69,12 @@ export function computeSignatures(allSessions, metrics) {
59
69
  let modificationCount = 0;
60
70
  let acceptCount = 0;
61
71
 
72
+ // Capture evidence prompts for each signature type
73
+ let bestPreflightPrompt = null;
74
+ let bestConstraintPrompt = null;
75
+ let bestTestFirstPrompt = null;
76
+ let bestModifyPrompt = null;
77
+
62
78
  for (const session of allSessions) {
63
79
  const { exchanges } = session;
64
80
  if (exchanges.length === 0) continue;
@@ -67,6 +83,9 @@ export function computeSignatures(allSessions, metrics) {
67
83
  const firstPrompt = exchanges[0].userPrompt || '';
68
84
  if (preflightPatterns.test(firstPrompt)) {
69
85
  preflightSessions++;
86
+ if (isGoodEvidence(firstPrompt) && (!bestPreflightPrompt || firstPrompt.length > bestPreflightPrompt.length)) {
87
+ bestPreflightPrompt = firstPrompt;
88
+ }
70
89
  }
71
90
 
72
91
  let hasTestFirst = false;
@@ -76,15 +95,24 @@ export function computeSignatures(allSessions, metrics) {
76
95
 
77
96
  if (constraintPatterns.test(prompt) && negativeConstraintPatterns.test(prompt)) {
78
97
  constraintPrompts++;
98
+ if (isGoodEvidence(prompt) && (!bestConstraintPrompt || prompt.length > bestConstraintPrompt.length)) {
99
+ bestConstraintPrompt = prompt;
100
+ }
79
101
  }
80
102
 
81
103
  if (testFirstPatterns.test(prompt)) {
82
104
  hasTestFirst = true;
105
+ if (isGoodEvidence(prompt) && (!bestTestFirstPrompt || prompt.length > bestTestFirstPrompt.length)) {
106
+ bestTestFirstPrompt = prompt;
107
+ }
83
108
  }
84
109
 
85
110
  // Track modification vs acceptance
86
111
  if (i > 0 && /\b(actually|wait|instead|change|no,?|not quite|modify|tweak)\b/i.test(prompt)) {
87
112
  modificationCount++;
113
+ if (isGoodEvidence(prompt) && (!bestModifyPrompt || prompt.length > bestModifyPrompt.length)) {
114
+ bestModifyPrompt = prompt;
115
+ }
88
116
  } else if (i > 0) {
89
117
  acceptCount++;
90
118
  }
@@ -100,6 +128,7 @@ export function computeSignatures(allSessions, metrics) {
100
128
  signatures.push({
101
129
  name: 'Pre-flight reviews',
102
130
  detail: `You ask AI to review your plan before coding in ${Math.round(preflightRatio * 100)}% of sessions. Only 8% of engineers do this consistently. This correlates with fewer debug cycles.`,
131
+ evidence: bestPreflightPrompt,
103
132
  });
104
133
  }
105
134
 
@@ -109,6 +138,7 @@ export function computeSignatures(allSessions, metrics) {
109
138
  signatures.push({
110
139
  name: 'Constraint-first prompting',
111
140
  detail: `You specify what NOT to do in ${Math.round(constraintRatio * 100)}% of prompts. This is a hallmark of senior architectural thinking that prevents scope creep.`,
141
+ evidence: bestConstraintPrompt,
112
142
  });
113
143
  }
114
144
 
@@ -118,22 +148,25 @@ export function computeSignatures(allSessions, metrics) {
118
148
  signatures.push({
119
149
  name: 'Test-driven AI usage',
120
150
  detail: `You request tests before implementation in ${Math.round(testFirstRatio * 100)}% of sessions. Engineers who do this ship fewer bugs post-merge.`,
151
+ evidence: bestTestFirstPrompt,
121
152
  });
122
153
  }
123
154
 
124
- // Deep session marathons
155
+ // Deep session marathons — evidence is metric-derived, no single prompt
125
156
  if (d.avgExchangesPerSession > BENCHMARKS.avgExchangesPerSession * 2) {
126
157
  signatures.push({
127
158
  name: 'Marathon sessions',
128
159
  detail: `Avg session depth of ${d.avgExchangesPerSession} exchanges is ${Math.round(d.avgExchangesPerSession / BENCHMARKS.avgExchangesPerSession)}x the benchmark (${BENCHMARKS.avgExchangesPerSession}). You sustain deep, focused work.`,
160
+ evidence: null,
129
161
  });
130
162
  }
131
163
 
132
- // Zero vague debugging
164
+ // Zero vague debugging — evidence is the absence of something
133
165
  if (db.vagueReports === 0 && db.totalDebugSequences > 5) {
134
166
  signatures.push({
135
167
  name: 'Precision debugging',
136
168
  detail: `Zero vague error reports across ${db.totalDebugSequences} debug sequences. Every bug report includes specific context. This is rare.`,
169
+ evidence: metrics.debugCycles.examples?.[0]?.prompt || null,
137
170
  });
138
171
  }
139
172
 
@@ -142,6 +175,7 @@ export function computeSignatures(allSessions, metrics) {
142
175
  signatures.push({
143
176
  name: 'Strategic AI usage',
144
177
  detail: `${ai.highLevelRatio}% of prompts are architectural or planning-level (benchmark: ${BENCHMARKS.highLevelRatio}%). You use AI as a thinking partner, not just a code generator.`,
178
+ evidence: metrics.aiLeverage.examples?.[0]?.prompt || null,
145
179
  });
146
180
  }
147
181
 
@@ -152,6 +186,7 @@ export function computeSignatures(allSessions, metrics) {
152
186
  signatures.push({
153
187
  name: 'Critical reviewer',
154
188
  detail: `You modify or redirect AI output in ${Math.round(modRatio * 100)}% of follow-up prompts. This indicates active evaluation rather than passive acceptance.`,
189
+ evidence: bestModifyPrompt,
155
190
  });
156
191
  }
157
192
 
@@ -177,6 +212,7 @@ export function computeWatchPoints(allSessions, metrics) {
177
212
  projectSessions[p].push(s);
178
213
  }
179
214
  let contextRestarts = 0;
215
+ let bestContextRestartPrompt = null;
180
216
  let multiSessionProjects = 0;
181
217
  for (const [, sessions] of Object.entries(projectSessions)) {
182
218
  if (sessions.length < 2) continue;
@@ -186,6 +222,9 @@ export function computeWatchPoints(allSessions, metrics) {
186
222
  // If first prompt doesn't reference previous work, it's a context restart
187
223
  if (firstPrompt.length > 50 && !/\b(continuing|following up|as discussed|last time|previously|where we left|earlier)\b/i.test(firstPrompt)) {
188
224
  contextRestarts++;
225
+ if (isGoodEvidence(firstPrompt) && (!bestContextRestartPrompt || firstPrompt.length > bestContextRestartPrompt.length)) {
226
+ bestContextRestartPrompt = firstPrompt;
227
+ }
189
228
  }
190
229
  }
191
230
  }
@@ -194,6 +233,7 @@ export function computeWatchPoints(allSessions, metrics) {
194
233
  watchPoints.push({
195
234
  name: 'Context amnesia',
196
235
  detail: `You restart context from scratch in ${Math.round(contextRestarts / totalFollowupSessions * 100)}% of follow-up sessions on the same project. Engineers who maintain context across sessions are more efficient.`,
236
+ evidence: bestContextRestartPrompt,
197
237
  });
198
238
  }
199
239
 
@@ -214,14 +254,26 @@ export function computeWatchPoints(allSessions, metrics) {
214
254
  watchPoints.push({
215
255
  name: 'Acceptance without review',
216
256
  detail: `You accept AI output without modification in ${Math.round((1 - modRatio) * 100)}% of cases. Top engineers modify or redirect 30%+ of initial suggestions.`,
257
+ evidence: null, // Anti-pattern is the absence of modification
217
258
  });
218
259
  }
219
260
 
220
261
  // Monologue prompting — excessively long first prompts
221
262
  if (d.avgPromptLength > 2000) {
263
+ // Find a representative long prompt
264
+ let bestLongPrompt = null;
265
+ for (const s of allSessions) {
266
+ for (const ex of s.exchanges) {
267
+ const p = ex.userPrompt || '';
268
+ if (p.length > 1500 && p.length < 3000 && isGoodEvidence(p)) {
269
+ if (!bestLongPrompt || p.length > bestLongPrompt.length) bestLongPrompt = p;
270
+ }
271
+ }
272
+ }
222
273
  watchPoints.push({
223
274
  name: 'Monologue prompting',
224
275
  detail: `Avg prompt length of ${d.avgPromptLength} chars is ${Math.round(d.avgPromptLength / BENCHMARKS.avgPromptLength)}x the benchmark. Breaking complex requests into 2-3 shorter prompts typically yields better AI output.`,
276
+ evidence: bestLongPrompt,
225
277
  });
226
278
  }
227
279
 
@@ -230,6 +282,7 @@ export function computeWatchPoints(allSessions, metrics) {
230
282
  watchPoints.push({
231
283
  name: 'Missing context',
232
284
  detail: `Only ${ss.contextSetRatio}% of sessions start with context-setting (benchmark: ${BENCHMARKS.contextSetRatio}%). Upfront context leads to better first responses and fewer corrections.`,
285
+ evidence: null,
233
286
  });
234
287
  }
235
288
 
@@ -238,6 +291,7 @@ export function computeWatchPoints(allSessions, metrics) {
238
291
  watchPoints.push({
239
292
  name: 'Debug spirals',
240
293
  detail: `${db.longLoops} extended debug loops (>5 turns) detected. When stuck, try providing more specific error context or breaking the problem differently.`,
294
+ evidence: null,
241
295
  });
242
296
  }
243
297
 
@@ -417,11 +471,11 @@ export function generateAssessment(result, metrics, signatures, watchPoints) {
417
471
  const weakest = dims[dims.length - 1];
418
472
 
419
473
  // Build assessment parts
420
- let assessment = `This engineer demonstrates ${dimQualitative(strongest.score)} ${strongest.label}`;
474
+ let assessment = `This engineer demonstrates ${dimQualitative(strongest.score).toLowerCase()} ${strongest.label}`;
421
475
 
422
476
  // Add signature mention if available
423
477
  if (signatures.length > 0) {
424
- assessment += ` with a distinctive pattern of ${signatures[0].name.toLowerCase()}`;
478
+ assessment += ` with a distinctive pattern of ${formatSignatureName(signatures[0].name)}`;
425
479
  }
426
480
  assessment += '.';
427
481
 
@@ -466,6 +520,14 @@ function dimQualitative(score) {
466
520
  return 'Early-stage';
467
521
  }
468
522
 
523
+ // Lowercase a signature name for prose while preserving acronyms like "AI", "TDD"
524
+ function formatSignatureName(name) {
525
+ return name
526
+ .toLowerCase()
527
+ .replace(/\bai\b/g, 'AI')
528
+ .replace(/\btdd\b/g, 'TDD');
529
+ }
530
+
469
531
  // ══════════════════════════════════════════════
470
532
  // CONFIDENCE — Data volume indicator
471
533
  // ══════════════════════════════════════════════
@@ -11,6 +11,16 @@
11
11
  * - Diversity of tool usage (not just "write code" but also explore, analyze, test)
12
12
  */
13
13
 
14
+ // ── Evidence quality filter ──
15
+ const noisePatterns = /^This session is being continued|^\[?[0-9T:.Z-]{20,}|^\S+@\S+.*[%$#>]|^\s*\$\s|^\s*>\s/;
16
+ function isGoodEvidence(prompt) {
17
+ if (!prompt || prompt.length < 40 || prompt.length > 600) return false;
18
+ if (noisePatterns.test(prompt)) return false;
19
+ const alpha = prompt.replace(/[^a-zA-Z]/g, '').length;
20
+ if (alpha / prompt.length < 0.4) return false;
21
+ return true;
22
+ }
23
+
14
24
  const architecturalPatterns = /\b(architect|design|refactor|redesign|restructure|system design|data model|schema|api design|interface|abstract|pattern|trade-?off|scalab|approach|strategy|migration|infrastructure)\b/i;
15
25
  const planningPatterns = /\b(plan|breakdown|break down|think through|help me think|what('?s| is) the best (way|approach)|how should (i|we)|pros and cons|options for|compare|evaluate|review my|code review|audit)\b/i;
16
26
  const exploratoryPatterns = /\b(explain|understand|how does|what does|why does|walk me through|investigate|diagnose|analyze|explore|deep dive|look into)\b/i;
@@ -53,15 +63,15 @@ export function computeAILeverage(sessions) {
53
63
  // Categorize prompt type
54
64
  if (architecturalPatterns.test(prompt)) {
55
65
  architecturalPrompts++;
56
- if (prompt.length > bestArchLen) { bestArchLen = prompt.length; bestArchPrompt = prompt; }
66
+ if (isGoodEvidence(prompt) && prompt.length > bestArchLen) { bestArchLen = prompt.length; bestArchPrompt = prompt; }
57
67
  }
58
68
  if (planningPatterns.test(prompt)) {
59
69
  planningPrompts++;
60
- if (prompt.length > bestPlanLen) { bestPlanLen = prompt.length; bestPlanPrompt = prompt; }
70
+ if (isGoodEvidence(prompt) && prompt.length > bestPlanLen) { bestPlanLen = prompt.length; bestPlanPrompt = prompt; }
61
71
  }
62
72
  if (exploratoryPatterns.test(prompt)) {
63
73
  exploratoryPrompts++;
64
- if (prompt.length > bestExploreLen) { bestExploreLen = prompt.length; bestExplorePrompt = prompt; }
74
+ if (isGoodEvidence(prompt) && prompt.length > bestExploreLen) { bestExploreLen = prompt.length; bestExplorePrompt = prompt; }
65
75
  }
66
76
  if (boilerplatePatterns.test(prompt)) boilerplatePrompts++;
67
77
  if (testingPatterns.test(prompt)) testingPrompts++;
@@ -10,6 +10,19 @@
10
10
  * - "it's still broken" vs targeted debug prompts
11
11
  */
12
12
 
13
+ // ── Evidence quality filter ──
14
+ // Prompts used as evidence should be human-written, readable, and illustrative.
15
+ // Reject system-generated context, raw log pastes, and extreme lengths.
16
+ const noisePatterns = /^This session is being continued|^\[?[0-9T:.Z-]{20,}|^\S+@\S+.*[%$#>]|^\s*\$\s|^\s*>\s/;
17
+ function isGoodEvidence(prompt) {
18
+ if (!prompt || prompt.length < 40 || prompt.length > 600) return false;
19
+ if (noisePatterns.test(prompt)) return false;
20
+ // Reject if >40% of content is non-alpha (log lines, stack traces, JSON blobs)
21
+ const alpha = prompt.replace(/[^a-zA-Z]/g, '').length;
22
+ if (alpha / prompt.length < 0.4) return false;
23
+ return true;
24
+ }
25
+
13
26
  const errorPatterns = /\b(error|bug|broken|crash|fail|exception|traceback|stack trace|doesn'?t work|not working|issue|problem|wrong)\b/i;
14
27
  const vaguePhrases = /^(it'?s? (?:still )?(?:not working|broken|wrong|failing))|^(fix it|try again|still (?:the same|broken|failing|not working))|^(same (?:error|issue|problem|thing))/i;
15
28
  const specificDebugPatterns = /\b(line \d+|TypeError|SyntaxError|ImportError|ReferenceError|ValueError|KeyError|AttributeError|NoneType|undefined is not|cannot read prop|stack trace|traceback|\.py:\d+|\.ts:\d+|\.js:\d+|status (?:code )?\d{3}|HTTP \d{3}|ENOENT|EACCES|CORS|404|500|502|503)\b/i;
@@ -57,8 +70,9 @@ export function computeDebugCycles(sessions) {
57
70
  }
58
71
  if (specificDebugPatterns.test(prompt) || prompt.length > 200) {
59
72
  specificReports++;
60
- // Track best specific report
61
- if (prompt.length > bestSpecificLen) {
73
+ // Track best specific report — require actual debug pattern match
74
+ // and readable evidence quality
75
+ if (specificDebugPatterns.test(prompt) && isGoodEvidence(prompt) && prompt.length > bestSpecificLen) {
62
76
  bestSpecificLen = prompt.length;
63
77
  bestSpecificReport = prompt;
64
78
  }
@@ -69,7 +83,7 @@ export function computeDebugCycles(sessions) {
69
83
  totalTurnsToResolve += debugTurnCount;
70
84
  if (debugTurnCount <= 2) {
71
85
  quickFixes++;
72
- if (!bestQuickFix) bestQuickFix = debugStartPrompt;
86
+ if (!bestQuickFix && isGoodEvidence(debugStartPrompt)) bestQuickFix = debugStartPrompt;
73
87
  }
74
88
  if (debugTurnCount > 5) longLoops++;
75
89
  inDebugMode = false;
@@ -11,6 +11,16 @@
11
11
  * - Follow-up prompts that reference or build on previous context
12
12
  */
13
13
 
14
// ── Evidence quality filter ──
// Reject prompts that would make poor display evidence: system-generated
// continuation banners, timestamp/shell/log pastes, and extreme lengths.
const noisePatterns = /^This session is being continued|^\[?[0-9T:.Z-]{20,}|^\S+@\S+.*[%$#>]|^\s*\$\s|^\s*>\s/;

/**
 * Decide whether a prompt is quotable as evidence in the report.
 * @param {string} prompt - Raw prompt text (may be null/empty).
 * @returns {boolean} true only for readable, human-written prompts.
 */
function isGoodEvidence(prompt) {
  if (!prompt) return false;
  const withinLength = prompt.length >= 40 && prompt.length <= 600;
  const looksLikeNoise = noisePatterns.test(prompt);
  // Share of alphabetic characters; log lines and JSON blobs score low.
  const alphaRatio = prompt.replace(/[^a-zA-Z]/g, '').length / prompt.length;
  return withinLength && !looksLikeNoise && alphaRatio >= 0.4;
}
23
+
14
24
  export function computeDecomposition(sessions) {
15
25
  if (sessions.length === 0) return { score: 50, details: {} };
16
26
 
@@ -51,7 +61,7 @@ export function computeDecomposition(sessions) {
51
61
  if (len < 100) shortPromptCount++;
52
62
 
53
63
  // Track decomposition examples (multi-sentence prompts showing task breakdown)
54
- if (len > 150 && len < 2000) {
64
+ if (isGoodEvidence(prompt)) {
55
65
  decompCandidates.push(prompt);
56
66
  }
57
67
 
@@ -60,7 +70,7 @@ export function computeDecomposition(sessions) {
60
70
  if (followupPatterns.test(prompt) || refinementPatterns.test(prompt)) {
61
71
  contextualFollowups++;
62
72
  // Capture best followup example
63
- if (!bestFollowupPrompt || prompt.length > bestFollowupPrompt.length) {
73
+ if (isGoodEvidence(prompt) && (!bestFollowupPrompt || prompt.length > bestFollowupPrompt.length)) {
64
74
  bestFollowupPrompt = prompt;
65
75
  }
66
76
  }
@@ -11,6 +11,16 @@
11
11
  * - Modification rate of AI output (shows critical review)
12
12
  */
13
13
 
14
// ── Evidence quality filter ──
// Screens out prompts unfit for display: auto-generated session
// continuations, timestamp/log/shell pastes, and extreme lengths.
const noisePatterns = /^This session is being continued|^\[?[0-9T:.Z-]{20,}|^\S+@\S+.*[%$#>]|^\s*\$\s|^\s*>\s/;

/**
 * Is this prompt readable, human-written evidence worth quoting?
 * @param {string} prompt - Raw prompt text (may be null/empty).
 * @returns {boolean}
 */
function isGoodEvidence(prompt) {
  if (!prompt) return false;
  if (prompt.length < 40) return false; // too short to illustrate anything
  if (prompt.length > 600) return false; // too long to display cleanly
  if (noisePatterns.test(prompt)) return false;
  // Require mostly letters; traces, JSON, and log lines are digit/punct heavy.
  const lettersOnly = prompt.replace(/[^a-zA-Z]/g, '');
  if (lettersOnly.length / prompt.length < 0.4) return false;
  return true;
}
23
+
14
24
  const contextSettingPatterns = /^(i('?m| am) (working on|building|trying to|looking at)|we need to|the goal is|here'?s (the|what)|context:|background:|i have a|there'?s a|i want to|let me explain)/i;
15
25
  const planningStartPatterns = /^(let'?s (plan|think|figure|start by)|first,? (let'?s|we should)|before we (start|begin|code)|the plan is|step 1|here'?s (my|the) plan)/i;
16
26
  const reviewPatterns = /\b(looks good|ship it|deploy|push it|commit|merge|let'?s go|lgtm|approved|test it|run (the )?tests|build it|does this look|review this|check this)\b/i;
@@ -52,7 +62,7 @@ export function computeSessionStructure(sessions) {
52
62
  if (contextSettingPatterns.test(firstPrompt) || firstPrompt.length > 200) {
53
63
  contextSetSessions++;
54
64
  // Track best context-setting prompt
55
- if (firstPrompt.length > bestContextLen) {
65
+ if (isGoodEvidence(firstPrompt) && firstPrompt.length > bestContextLen) {
56
66
  bestContextLen = firstPrompt.length;
57
67
  bestContextPrompt = firstPrompt;
58
68
  }
@@ -77,7 +87,7 @@ export function computeSessionStructure(sessions) {
77
87
  if (refinementPatterns.test(prompt)) {
78
88
  refinementCount++;
79
89
  // Track best refinement example
80
- if (!bestRefinementPrompt || prompt.length > bestRefinementPrompt.length) {
90
+ if (isGoodEvidence(prompt) && (!bestRefinementPrompt || prompt.length > bestRefinementPrompt.length)) {
81
91
  bestRefinementPrompt = prompt;
82
92
  }
83
93
  }