chekk 0.5.5 → 1.0.0
This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between these versions as they appear in their public registries.
- package/dist/index.d.ts +17 -0
- package/dist/index.js +448 -0
- package/package.json +18 -34
- package/bin/chekk.js +0 -62
- package/src/detect.js +0 -146
- package/src/display.js +0 -1153
- package/src/index.js +0 -301
- package/src/insights.js +0 -661
- package/src/metrics/ai-leverage.js +0 -186
- package/src/metrics/debug-cycles.js +0 -204
- package/src/metrics/decomposition.js +0 -158
- package/src/metrics/session-structure.js +0 -199
- package/src/metrics/token-efficiency.js +0 -258
- package/src/parsers/claude-code.js +0 -231
- package/src/parsers/codex.js +0 -188
- package/src/parsers/cursor.js +0 -281
- package/src/scorer.js +0 -228
- package/src/upload.js +0 -140
package/src/metrics/ai-leverage.js
@@ -1,186 +0,0 @@
-/**
- * AI Leverage
- *
- * Measures whether the engineer uses AI for high-level design and architecture
- * or mostly for boilerplate and simple tasks.
- *
- * Signals:
- * - Architectural/design prompts vs boilerplate/CRUD prompts
- * - Complexity of requested tasks
- * - Usage of AI for planning, design review, code review
- * - Diversity of tool usage (not just "write code" but also explore, analyze, test)
- */
-
-// ── Evidence quality filter ──
-const noisePatterns = /^This session is being continued|^\[?[0-9T:.Z-]{20,}|^\S+@\S+.*[%$#>]|^\s*\$\s|^\s*>\s/;
-function isGoodEvidence(prompt) {
-  if (!prompt || prompt.length < 40 || prompt.length > 600) return false;
-  if (noisePatterns.test(prompt)) return false;
-  const alpha = prompt.replace(/[^a-zA-Z]/g, '').length;
-  if (alpha / prompt.length < 0.4) return false;
-  return true;
-}
-
-const architecturalPatterns = /\b(architect|design|refactor|redesign|restructure|system design|data model|schema|api design|interface|abstract|pattern|trade-?off|scalab|approach|strategy|migration|infrastructure)\b/i;
-const planningPatterns = /\b(plan|breakdown|break down|think through|help me think|what('?s| is) the best (way|approach)|how should (i|we)|pros and cons|options for|compare|evaluate|review my|code review|audit)\b/i;
-const exploratoryPatterns = /\b(explain|understand|how does|what does|why does|walk me through|investigate|diagnose|analyze|explore|deep dive|look into)\b/i;
-const boilerplatePatterns = /\b(add a (button|field|column|route|endpoint|page|component)|create a (form|modal|table|list|card)|simple (function|class|component)|CRUD|boilerplate|scaffold|template|generate (a |the )?(basic|simple))\b/i;
-const testingPatterns = /\b(test|spec|unit test|integration test|e2e|coverage|assert|expect|mock|fixture)\b/i;
-
-const highLeverageTools = ['Task', 'WebSearch', 'WebFetch', 'Grep', 'Glob', 'Read'];
-const codingTools = ['Write', 'Edit', 'Bash', 'NotebookEdit'];
-
-export function computeAILeverage(sessions) {
-  if (sessions.length === 0) return { score: 50, details: {} };
-
-  let totalPrompts = 0;
-  let architecturalPrompts = 0;
-  let planningPrompts = 0;
-  let exploratoryPrompts = 0;
-  let boilerplatePrompts = 0;
-  let testingPrompts = 0;
-  let highLeverageToolUses = 0;
-  let codingToolUses = 0;
-  let totalToolUses = 0;
-
-  // Track prompt complexity via length and structure
-  let complexPrompts = 0; // > 200 chars with multiple sentences
-  let trivialPrompts = 0; // < 50 chars, simple commands
-
-  // Capture representative examples
-  let bestArchPrompt = null; // best architectural prompt
-  let bestPlanPrompt = null; // best planning prompt
-  let bestExplorePrompt = null; // best exploratory prompt
-  let bestArchLen = 0;
-  let bestPlanLen = 0;
-  let bestExploreLen = 0;
-
-  for (const session of sessions) {
-    for (const exchange of session.exchanges) {
-      const prompt = exchange.userPrompt || '';
-      totalPrompts++;
-
-      // Categorize prompt type
-      if (architecturalPatterns.test(prompt)) {
-        architecturalPrompts++;
-        if (isGoodEvidence(prompt) && prompt.length > bestArchLen) { bestArchLen = prompt.length; bestArchPrompt = prompt; }
-      }
-      if (planningPatterns.test(prompt)) {
-        planningPrompts++;
-        if (isGoodEvidence(prompt) && prompt.length > bestPlanLen) { bestPlanLen = prompt.length; bestPlanPrompt = prompt; }
-      }
-      if (exploratoryPatterns.test(prompt)) {
-        exploratoryPrompts++;
-        if (isGoodEvidence(prompt) && prompt.length > bestExploreLen) { bestExploreLen = prompt.length; bestExplorePrompt = prompt; }
-      }
-      if (boilerplatePatterns.test(prompt)) boilerplatePrompts++;
-      if (testingPatterns.test(prompt)) testingPrompts++;
-
-      // Complexity
-      const sentences = prompt.split(/[.!?]+/).filter(s => s.trim().length > 10);
-      if (prompt.length > 200 && sentences.length >= 2) {
-        complexPrompts++;
-      } else if (prompt.length < 50) {
-        trivialPrompts++;
-      }
-
-      // Tool usage from assistant responses
-      for (const tool of exchange.toolCalls) {
-        totalToolUses++;
-        const toolName = tool.tool || '';
-        if (highLeverageTools.some(t => toolName.includes(t))) {
-          highLeverageToolUses++;
-        }
-        if (codingTools.some(t => toolName.includes(t))) {
-          codingToolUses++;
-        }
-      }
-    }
-  }
-
-  // Score components
-
-  // High-level thinking ratio (architectural + planning + exploratory vs total)
-  const highLevelPrompts = architecturalPrompts + planningPrompts + exploratoryPrompts;
-  const highLevelRatio = totalPrompts > 0 ? highLevelPrompts / totalPrompts : 0;
-  const highLevelScore = Math.min(100, highLevelRatio * 250); // 40%+ = 100
-
-  // Boilerplate ratio (lower is better)
-  const boilerplateRatio = totalPrompts > 0 ? boilerplatePrompts / totalPrompts : 0;
-  const boilerplatePenalty = boilerplateRatio * 60;
-
-  // Complexity ratio
-  const complexRatio = totalPrompts > 0 ? complexPrompts / totalPrompts : 0;
-  const complexScore = Math.min(100, complexRatio * 300);
-
-  // Tool diversity - using AI for research/exploration not just coding
-  const researchToolRatio = totalToolUses > 0 ? highLeverageToolUses / totalToolUses : 0.5;
-  const toolDiversityScore = Math.min(100, researchToolRatio * 200);
-
-  // Testing awareness
-  const testingRatio = totalPrompts > 0 ? testingPrompts / totalPrompts : 0;
-  const testingBonus = Math.min(20, testingRatio * 200);
-
-  const score = Math.round(
-    highLevelScore * 0.35 +
-    (100 - boilerplatePenalty) * 0.2 +
-    complexScore * 0.2 +
-    toolDiversityScore * 0.15 +
-    testingBonus * 0.1 +
-    50 * 0 // baseline filler
-  );
-
-  // Build examples — pick the best one available
-  const examples = [];
-  if (bestArchPrompt) examples.push({ type: 'architectural', prompt: bestArchPrompt });
-  if (bestPlanPrompt) examples.push({ type: 'planning', prompt: bestPlanPrompt });
-  if (bestExplorePrompt) examples.push({ type: 'exploratory', prompt: bestExplorePrompt });
-
-  // ── Token cost evidence ──
-  // Compare cost of trivial prompts vs complex/architectural prompts
-  let trivialTokens = 0, trivialTokenCount = 0;
-  let complexTokensTotal = 0, complexTokensCount = 0;
-  let boilerplateTokens = 0, boilerplateTokenCount = 0;
-  let archTokens = 0, archTokenCount = 0;
-
-  for (const session of sessions) {
-    for (const exchange of session.exchanges) {
-      const prompt = exchange.userPrompt || '';
-      const t = exchange.tokenUsage;
-      const tokens = t ? (t.inputTokens + t.outputTokens + t.cacheReadTokens + t.cacheCreationTokens) : 0;
-      if (tokens === 0) continue;
-
-      if (prompt.length < 50) { trivialTokens += tokens; trivialTokenCount++; }
-      const sentences = prompt.split(/[.!?]+/).filter(s => s.trim().length > 10);
-      if (prompt.length > 200 && sentences.length >= 2) { complexTokensTotal += tokens; complexTokensCount++; }
-      if (boilerplatePatterns.test(prompt)) { boilerplateTokens += tokens; boilerplateTokenCount++; }
-      if (architecturalPatterns.test(prompt) || planningPatterns.test(prompt)) { archTokens += tokens; archTokenCount++; }
-    }
-  }
-
-  return {
-    score: Math.max(0, Math.min(100, score)),
-    details: {
-      totalPrompts,
-      architecturalPrompts,
-      planningPrompts,
-      exploratoryPrompts,
-      boilerplatePrompts,
-      testingPrompts,
-      highLevelRatio: Math.round(highLevelRatio * 100),
-      complexPromptRatio: Math.round(complexRatio * 100),
-      toolDiversity: {
-        total: totalToolUses,
-        research: highLeverageToolUses,
-        coding: codingToolUses,
-      },
-      tokenEvidence: {
-        avgTokensTrivialPrompt: trivialTokenCount > 0 ? Math.round(trivialTokens / trivialTokenCount) : null,
-        avgTokensComplexPrompt: complexTokensCount > 0 ? Math.round(complexTokensTotal / complexTokensCount) : null,
-        avgTokensBoilerplate: boilerplateTokenCount > 0 ? Math.round(boilerplateTokens / boilerplateTokenCount) : null,
-        avgTokensArchitectural: archTokenCount > 0 ? Math.round(archTokens / archTokenCount) : null,
-      },
-    },
-    examples,
-  };
-}
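The session shape this metric consumes can be read directly off the loops above: each session holds exchanges, and each exchange carries userPrompt, toolCalls (objects with a tool name), and a tokenUsage breakdown. The sketch below is a hypothetical usage example against the unpacked 0.5.5 source; the sample session and token counts are invented for illustration, not taken from the package.

// Hypothetical sample input; the field names mirror what computeAILeverage() reads above.
import { computeAILeverage } from './src/metrics/ai-leverage.js';

const sessions = [
  {
    exchanges: [
      {
        // Architectural/planning prompt; Read and Grep count toward the "research" tool bucket
        userPrompt: 'Help me design the data model for session storage and think through the trade-offs.',
        toolCalls: [{ tool: 'Read' }, { tool: 'Grep' }],
        tokenUsage: { inputTokens: 1200, outputTokens: 800, cacheReadTokens: 0, cacheCreationTokens: 0 },
      },
      {
        // Trivial boilerplate prompt with a coding tool call
        userPrompt: 'add a button',
        toolCalls: [{ tool: 'Edit' }],
        tokenUsage: { inputTokens: 300, outputTokens: 150, cacheReadTokens: 0, cacheCreationTokens: 0 },
      },
    ],
  },
];

const { score, details, examples } = computeAILeverage(sessions);
console.log(score, details.highLevelRatio, details.toolDiversity, examples.length);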
package/src/metrics/debug-cycles.js
@@ -1,204 +0,0 @@
-/**
- * Debug Cycle Efficiency
- *
- * Measures how effectively the engineer resolves issues with AI assistance.
- *
- * Signals:
- * - Error/fix loops: user reports error → assistant tries fix → user reports same error
- * - Number of turns to resolution
- * - Quality of error context provided (stack traces, specific error messages)
- * - "it's still broken" vs targeted debug prompts
- */
-
-// ── Evidence quality filter ──
-// Prompts used as evidence should be human-written, readable, and illustrative.
-// Reject system-generated context, raw log pastes, and extreme lengths.
-const noisePatterns = /^This session is being continued|^\[?[0-9T:.Z-]{20,}|^\S+@\S+.*[%$#>]|^\s*\$\s|^\s*>\s/;
-function isGoodEvidence(prompt) {
-  if (!prompt || prompt.length < 40 || prompt.length > 600) return false;
-  if (noisePatterns.test(prompt)) return false;
-  // Reject if >40% of content is non-alpha (log lines, stack traces, JSON blobs)
-  const alpha = prompt.replace(/[^a-zA-Z]/g, '').length;
-  if (alpha / prompt.length < 0.4) return false;
-  return true;
-}
-
-const errorPatterns = /\b(error|bug|broken|crash|fail|exception|traceback|stack trace|doesn'?t work|not working|issue|problem|wrong)\b/i;
-const vaguePhrases = /^(it'?s? (?:still )?(?:not working|broken|wrong|failing))|^(fix it|try again|still (?:the same|broken|failing|not working))|^(same (?:error|issue|problem|thing))/i;
-const specificDebugPatterns = /\b(line \d+|TypeError|SyntaxError|ImportError|ReferenceError|ValueError|KeyError|AttributeError|NoneType|undefined is not|cannot read prop|stack trace|traceback|\.py:\d+|\.ts:\d+|\.js:\d+|status (?:code )?\d{3}|HTTP \d{3}|ENOENT|EACCES|CORS|404|500|502|503)\b/i;
-const resolutionPatterns = /\b(works|working|fixed|solved|resolved|perfect|great|thanks|nice|awesome|that did it|looks good|ship it)\b/i;
-
-export function computeDebugCycles(sessions) {
-  if (sessions.length === 0) return { score: 50, details: {} };
-
-  let totalDebugSequences = 0;
-  let totalTurnsToResolve = 0;
-  let vagueReports = 0;
-  let specificReports = 0;
-  let unresolvedSequences = 0;
-  let quickFixes = 0; // resolved in 1-2 turns
-  let longLoops = 0; // > 5 turns to resolve
-
-  // Capture representative examples
-  let bestSpecificReport = null; // best specific error report
-  let bestQuickFix = null; // prompt that led to quick resolution
-  let bestSpecificLen = 0;
-
-  for (const session of sessions) {
-    const { exchanges } = session;
-    let inDebugMode = false;
-    let debugTurnCount = 0;
-    let debugStartPrompt = null;
-
-    for (let i = 0; i < exchanges.length; i++) {
-      const prompt = exchanges[i].userPrompt || '';
-
-      if (errorPatterns.test(prompt)) {
-        if (!inDebugMode) {
-          // Starting a new debug sequence
-          inDebugMode = true;
-          debugTurnCount = 1;
-          debugStartPrompt = prompt;
-          totalDebugSequences++;
-        } else {
-          debugTurnCount++;
-        }
-
-        // Check quality of error report
-        if (vaguePhrases.test(prompt)) {
-          vagueReports++;
-        }
-        if (specificDebugPatterns.test(prompt) || prompt.length > 200) {
-          specificReports++;
-          // Track best specific report — require actual debug pattern match
-          // and readable evidence quality
-          if (specificDebugPatterns.test(prompt) && isGoodEvidence(prompt) && prompt.length > bestSpecificLen) {
-            bestSpecificLen = prompt.length;
-            bestSpecificReport = prompt;
-          }
-        }
-      } else if (inDebugMode) {
-        // Check if this exchange resolves the debug
-        if (resolutionPatterns.test(prompt)) {
-          totalTurnsToResolve += debugTurnCount;
-          if (debugTurnCount <= 2) {
-            quickFixes++;
-            if (!bestQuickFix && isGoodEvidence(debugStartPrompt)) bestQuickFix = debugStartPrompt;
-          }
-          if (debugTurnCount > 5) longLoops++;
-          inDebugMode = false;
-          debugTurnCount = 0;
-          debugStartPrompt = null;
-        } else {
-          // Moved on without explicit resolution
-          totalTurnsToResolve += debugTurnCount;
-          unresolvedSequences++;
-          inDebugMode = false;
-          debugTurnCount = 0;
-          debugStartPrompt = null;
-        }
-      }
-    }
-
-    // Handle session ending mid-debug
-    if (inDebugMode) {
-      totalTurnsToResolve += debugTurnCount;
-      if (debugTurnCount > 5) longLoops++;
-    }
-  }
-
-  const avgTurnsToResolve = totalDebugSequences > 0
-    ? totalTurnsToResolve / totalDebugSequences
-    : 0;
-
-  const totalReports = vagueReports + specificReports;
-  const specificRatio = totalReports > 0 ? specificReports / totalReports : 0.5;
-
-  // Score components
-  // Fewer turns to resolve = better
-  const turnsScore = avgTurnsToResolve === 0 ? 70 :
-    avgTurnsToResolve <= 2 ? 95 :
-    avgTurnsToResolve <= 3 ? 85 :
-    avgTurnsToResolve <= 5 ? 65 :
-    avgTurnsToResolve <= 8 ? 40 : 20;
-
-  // More specific reports = better
-  const specificScore = specificRatio * 100;
-
-  // Quick fix ratio
-  const quickFixRatio = totalDebugSequences > 0 ? quickFixes / totalDebugSequences : 0.5;
-  const quickFixScore = quickFixRatio * 100;
-
-  // Long loop penalty
-  const longLoopRatio = totalDebugSequences > 0 ? longLoops / totalDebugSequences : 0;
-  const longLoopPenalty = longLoopRatio * 50;
-
-  const score = Math.round(
-    turnsScore * 0.35 +
-    specificScore * 0.25 +
-    quickFixScore * 0.25 +
-    (100 - longLoopPenalty) * 0.15
-  );
-
-  // Build examples array
-  const examples = [];
-  if (bestSpecificReport) examples.push({ type: 'specific_report', prompt: bestSpecificReport });
-  if (bestQuickFix) examples.push({ type: 'quick_fix', prompt: bestQuickFix });
-
-  // ── Token cost evidence ──
-  // Compare cost of vague vs specific debug exchanges
-  let vagueTokens = 0, vagueTokenCount = 0;
-  let specificTokens = 0, specificTokenCount = 0;
-  let longLoopTokens = 0, longLoopTokenCount = 0;
-  let quickFixTokens = 0, quickFixTokenCount = 0;
-
-  for (const session of sessions) {
-    const { exchanges } = session;
-    let debugExchanges = [];
-    let inDebug = false;
-
-    for (let i = 0; i < exchanges.length; i++) {
-      const prompt = exchanges[i].userPrompt || '';
-      const t = exchanges[i].tokenUsage;
-      const tokens = t ? (t.inputTokens + t.outputTokens + t.cacheReadTokens + t.cacheCreationTokens) : 0;
-
-      if (errorPatterns.test(prompt) && tokens > 0) {
-        if (!inDebug) { inDebug = true; debugExchanges = []; }
-        debugExchanges.push({ prompt, tokens });
-
-        if (vaguePhrases.test(prompt)) { vagueTokens += tokens; vagueTokenCount++; }
-        if (specificDebugPatterns.test(prompt) || prompt.length > 200) { specificTokens += tokens; specificTokenCount++; }
-      } else if (inDebug) {
-        if (debugExchanges.length <= 2) {
-          const total = debugExchanges.reduce((s, e) => s + e.tokens, 0);
-          quickFixTokens += total; quickFixTokenCount++;
-        } else if (debugExchanges.length > 5) {
-          const total = debugExchanges.reduce((s, e) => s + e.tokens, 0);
-          longLoopTokens += total; longLoopTokenCount++;
-        }
-        inDebug = false;
-        debugExchanges = [];
-      }
-    }
-  }
-
-  return {
-    score: Math.max(0, Math.min(100, score)),
-    details: {
-      totalDebugSequences,
-      avgTurnsToResolve: Math.round(avgTurnsToResolve * 10) / 10,
-      quickFixes,
-      longLoops,
-      specificReportRatio: Math.round(specificRatio * 100),
-      vagueReports,
-      specificReports,
-      tokenEvidence: {
-        avgTokensVagueDebug: vagueTokenCount > 0 ? Math.round(vagueTokens / vagueTokenCount) : null,
-        avgTokensSpecificDebug: specificTokenCount > 0 ? Math.round(specificTokens / specificTokenCount) : null,
-        avgTokensQuickFix: quickFixTokenCount > 0 ? Math.round(quickFixTokens / quickFixTokenCount) : null,
-        avgTokensLongLoop: longLoopTokenCount > 0 ? Math.round(longLoopTokens / longLoopTokenCount) : null,
-      },
-    },
-    examples,
-  };
-}
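computeDebugCycles() walks each session as a small state machine: a prompt matching errorPatterns opens or extends a debug sequence, and the next non-error prompt either resolves it (resolutionPatterns) or closes it as unresolved. Below is a hypothetical two-turn example with invented data that the code above would count as one debug sequence, one specific report, and one quick fix.

// Hypothetical sample input: a specific error report resolved on the next turn.
import { computeDebugCycles } from './src/metrics/debug-cycles.js';

const sessions = [
  {
    exchanges: [
      {
        // Matches errorPatterns ("stack trace") and specificDebugPatterns ("TypeError", ".js:42")
        userPrompt: 'Getting a TypeError at utils.js:42 when the config file is missing, stack trace attached below.',
        tokenUsage: { inputTokens: 900, outputTokens: 400, cacheReadTokens: 0, cacheCreationTokens: 0 },
      },
      {
        // Matches resolutionPatterns ("fixed"), so the sequence resolves after one turn, a quick fix
        userPrompt: 'That fixed it, thanks!',
        tokenUsage: { inputTokens: 200, outputTokens: 100, cacheReadTokens: 0, cacheCreationTokens: 0 },
      },
    ],
  },
];

const { score, details } = computeDebugCycles(sessions);
console.log(score, details.totalDebugSequences, details.quickFixes, details.avgTurnsToResolve);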
package/src/metrics/decomposition.js
@@ -1,158 +0,0 @@
-/**
- * Decomposition Quality
- *
- * Measures whether the engineer breaks complex tasks into subtasks
- * or dumps everything in single mega-prompts.
- *
- * Signals:
- * - Multi-step sessions (multiple exchanges building on each other) → higher
- * - Single mega-prompt sessions → lower
- * - Prompt length distribution (long single prompts = less decomposition)
- * - Follow-up prompts that reference or build on previous context
- */
-
-// ── Evidence quality filter ──
-const noisePatterns = /^This session is being continued|^\[?[0-9T:.Z-]{20,}|^\S+@\S+.*[%$#>]|^\s*\$\s|^\s*>\s/;
-function isGoodEvidence(prompt) {
-  if (!prompt || prompt.length < 40 || prompt.length > 600) return false;
-  if (noisePatterns.test(prompt)) return false;
-  const alpha = prompt.replace(/[^a-zA-Z]/g, '').length;
-  if (alpha / prompt.length < 0.4) return false;
-  return true;
-}
-
-export function computeDecomposition(sessions) {
-  if (sessions.length === 0) return { score: 50, details: {} };
-
-  let totalExchanges = 0;
-  let multiStepSessions = 0;
-  let singleShotSessions = 0;
-  let avgPromptLength = 0;
-  let promptCount = 0;
-  let longPromptCount = 0; // > 500 chars
-  let shortPromptCount = 0; // < 100 chars
-  let contextualFollowups = 0; // prompts that start with "now", "next", "also", "then", reference prev context
-
-  const followupPatterns = /^(now |next |then |also |and |ok |okay |great |good |perfect |after that|building on|following up|continuing)/i;
-  const refinementPatterns = /^(actually |wait |hmm |instead |change |modify |update |tweak |adjust |fix |but )/i;
-
-  // Capture representative prompt examples
-  // Keep top 3 candidates and pick the 2nd-longest to avoid overlap with other metrics
-  const decompCandidates = [];
-  let bestFollowupPrompt = null;
-
-  for (const session of sessions) {
-    const { exchanges } = session;
-    totalExchanges += exchanges.length;
-
-    if (exchanges.length >= 4) {
-      multiStepSessions++;
-    } else if (exchanges.length === 1) {
-      singleShotSessions++;
-    }
-
-    for (let i = 0; i < exchanges.length; i++) {
-      const prompt = exchanges[i].userPrompt || '';
-      const len = prompt.length;
-      promptCount++;
-      avgPromptLength += len;
-
-      if (len > 500) longPromptCount++;
-      if (len < 100) shortPromptCount++;
-
-      // Track decomposition examples (multi-sentence prompts showing task breakdown)
-      if (isGoodEvidence(prompt)) {
-        decompCandidates.push(prompt);
-      }
-
-      // Check for contextual followups (not the first prompt in a session)
-      if (i > 0) {
-        if (followupPatterns.test(prompt) || refinementPatterns.test(prompt)) {
-          contextualFollowups++;
-          // Capture best followup example
-          if (isGoodEvidence(prompt) && (!bestFollowupPrompt || prompt.length > bestFollowupPrompt.length)) {
-            bestFollowupPrompt = prompt;
-          }
-        }
-      }
-    }
-  }
-
-  avgPromptLength = promptCount > 0 ? avgPromptLength / promptCount : 0;
-
-  // Score components (each 0-100)
-  const multiStepRatio = sessions.length > 0 ? multiStepSessions / sessions.length : 0;
-  const multiStepScore = Math.min(100, multiStepRatio * 150); // bonus for > 66%
-
-  const singleShotRatio = sessions.length > 0 ? singleShotSessions / sessions.length : 0;
-  const singleShotPenalty = singleShotRatio * 40; // up to -40
-
-  // Moderate prompt length is good (not too mega, not too terse)
-  const lengthScore = avgPromptLength > 50 && avgPromptLength < 400 ? 80 :
-    avgPromptLength >= 400 && avgPromptLength < 800 ? 60 :
-    avgPromptLength >= 800 ? 30 : 50;
-
-  // Contextual followups show iterative thinking
-  const followupRatio = promptCount > 0 ? contextualFollowups / promptCount : 0;
-  const followupScore = Math.min(100, followupRatio * 300);
-
-  const avgExchangesPerSession = sessions.length > 0 ? totalExchanges / sessions.length : 0;
-  const depthScore = Math.min(100, avgExchangesPerSession * 8); // 12+ exchanges = 100
-
-  const score = Math.round(
-    multiStepScore * 0.25 +
-    (100 - singleShotPenalty) * 0.15 +
-    lengthScore * 0.2 +
-    followupScore * 0.2 +
-    depthScore * 0.2
-  );
-
-  // Build examples array — pick a mid-length prompt to avoid overlap with other metrics
-  const examples = [];
-  if (decompCandidates.length > 0) {
-    decompCandidates.sort((a, b) => b.length - a.length);
-    // Pick ~median length to avoid the longest (which will also match ai-leverage)
-    const pickIdx = Math.min(Math.floor(decompCandidates.length / 3), decompCandidates.length - 1);
-    examples.push({ type: 'decomposition', prompt: decompCandidates[pickIdx] });
-  }
-  if (bestFollowupPrompt) examples.push({ type: 'followup', prompt: bestFollowupPrompt });
-
-  // ── Token cost evidence ──
-  // Compare token cost of single-shot sessions vs multi-step sessions
-  // to prove decomposition saves tokens
-  let singleShotTokens = 0, singleShotCount = 0;
-  let multiStepTokens = 0, multiStepCount = 0;
-  for (const session of sessions) {
-    const t = session.tokenUsage;
-    if (!t || (t.inputTokens + t.outputTokens + t.cacheReadTokens + t.cacheCreationTokens) === 0) continue;
-    const total = t.inputTokens + t.outputTokens + t.cacheReadTokens + t.cacheCreationTokens;
-    const perExchange = session.exchangeCount > 0 ? total / session.exchangeCount : total;
-    if (session.exchangeCount === 1) {
-      singleShotTokens += perExchange;
-      singleShotCount++;
-    } else if (session.exchangeCount >= 4) {
-      multiStepTokens += perExchange;
-      multiStepCount++;
-    }
-  }
-  const avgTokensSingleShot = singleShotCount > 0 ? Math.round(singleShotTokens / singleShotCount) : null;
-  const avgTokensMultiStep = multiStepCount > 0 ? Math.round(multiStepTokens / multiStepCount) : null;
-
-  return {
-    score: Math.max(0, Math.min(100, score)),
-    details: {
-      totalSessions: sessions.length,
-      multiStepSessions,
-      singleShotSessions,
-      avgExchangesPerSession: Math.round(avgExchangesPerSession * 10) / 10,
-      avgPromptLength: Math.round(avgPromptLength),
-      longPromptRatio: promptCount > 0 ? Math.round(longPromptCount / promptCount * 100) : 0,
-      contextualFollowupRatio: promptCount > 0 ? Math.round(followupRatio * 100) : 0,
-      tokenEvidence: {
-        avgTokensPerExchangeSingleShot: avgTokensSingleShot,
-        avgTokensPerExchangeMultiStep: avgTokensMultiStep,
-      },
-    },
-    examples,
-  };
}
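computeDecomposition() scores at the session level rather than the prompt level, and its token-evidence pass reads the session-level exchangeCount and tokenUsage fields in addition to the exchanges array. A hypothetical pair of sessions, one decomposed and one single-shot, with invented numbers:

// Hypothetical sample input: one multi-step session (>= 4 exchanges) and one single-shot session.
import { computeDecomposition } from './src/metrics/decomposition.js';

const multiStep = {
  exchanges: [
    { userPrompt: 'First, sketch the parser interface for the session logs.' },
    { userPrompt: 'Now add handling for malformed JSON lines.' }, // counted as a contextual follow-up
    { userPrompt: 'Next, write unit tests for the edge cases we discussed.' },
    { userPrompt: 'Also update the README usage section.' },
  ],
  exchangeCount: 4,
  tokenUsage: { inputTokens: 4000, outputTokens: 2000, cacheReadTokens: 0, cacheCreationTokens: 0 },
};

const singleShot = {
  exchanges: [
    { userPrompt: 'Build the whole CLI, the parsers, the scoring, and the upload step in one go.' },
  ],
  exchangeCount: 1,
  tokenUsage: { inputTokens: 9000, outputTokens: 5000, cacheReadTokens: 0, cacheCreationTokens: 0 },
};

const { score, details } = computeDecomposition([multiStep, singleShot]);
// details.tokenEvidence compares average tokens per exchange for the two session shapes
console.log(score, details.multiStepSessions, details.singleShotSessions, details.tokenEvidence);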