chekk 0.5.5 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,186 +0,0 @@
- /**
-  * AI Leverage
-  *
-  * Measures whether the engineer uses AI for high-level design and architecture
-  * or mostly for boilerplate and simple tasks.
-  *
-  * Signals:
-  * - Architectural/design prompts vs boilerplate/CRUD prompts
-  * - Complexity of requested tasks
-  * - Usage of AI for planning, design review, code review
-  * - Diversity of tool usage (not just "write code" but also explore, analyze, test)
-  */
-
- // ── Evidence quality filter ──
- const noisePatterns = /^This session is being continued|^\[?[0-9T:.Z-]{20,}|^\S+@\S+.*[%$#>]|^\s*\$\s|^\s*>\s/;
- function isGoodEvidence(prompt) {
-   if (!prompt || prompt.length < 40 || prompt.length > 600) return false;
-   if (noisePatterns.test(prompt)) return false;
-   const alpha = prompt.replace(/[^a-zA-Z]/g, '').length;
-   if (alpha / prompt.length < 0.4) return false;
-   return true;
- }
-
- const architecturalPatterns = /\b(architect|design|refactor|redesign|restructure|system design|data model|schema|api design|interface|abstract|pattern|trade-?off|scalab|approach|strategy|migration|infrastructure)\b/i;
- const planningPatterns = /\b(plan|breakdown|break down|think through|help me think|what('?s| is) the best (way|approach)|how should (i|we)|pros and cons|options for|compare|evaluate|review my|code review|audit)\b/i;
- const exploratoryPatterns = /\b(explain|understand|how does|what does|why does|walk me through|investigate|diagnose|analyze|explore|deep dive|look into)\b/i;
- const boilerplatePatterns = /\b(add a (button|field|column|route|endpoint|page|component)|create a (form|modal|table|list|card)|simple (function|class|component)|CRUD|boilerplate|scaffold|template|generate (a |the )?(basic|simple))\b/i;
- const testingPatterns = /\b(test|spec|unit test|integration test|e2e|coverage|assert|expect|mock|fixture)\b/i;
-
- const highLeverageTools = ['Task', 'WebSearch', 'WebFetch', 'Grep', 'Glob', 'Read'];
- const codingTools = ['Write', 'Edit', 'Bash', 'NotebookEdit'];
-
- export function computeAILeverage(sessions) {
-   if (sessions.length === 0) return { score: 50, details: {} };
-
-   let totalPrompts = 0;
-   let architecturalPrompts = 0;
-   let planningPrompts = 0;
-   let exploratoryPrompts = 0;
-   let boilerplatePrompts = 0;
-   let testingPrompts = 0;
-   let highLeverageToolUses = 0;
-   let codingToolUses = 0;
-   let totalToolUses = 0;
-
-   // Track prompt complexity via length and structure
-   let complexPrompts = 0; // > 200 chars with multiple sentences
-   let trivialPrompts = 0; // < 50 chars, simple commands
-
-   // Capture representative examples
-   let bestArchPrompt = null; // best architectural prompt
-   let bestPlanPrompt = null; // best planning prompt
-   let bestExplorePrompt = null; // best exploratory prompt
-   let bestArchLen = 0;
-   let bestPlanLen = 0;
-   let bestExploreLen = 0;
-
-   for (const session of sessions) {
-     for (const exchange of session.exchanges) {
-       const prompt = exchange.userPrompt || '';
-       totalPrompts++;
-
-       // Categorize prompt type
-       if (architecturalPatterns.test(prompt)) {
-         architecturalPrompts++;
-         if (isGoodEvidence(prompt) && prompt.length > bestArchLen) { bestArchLen = prompt.length; bestArchPrompt = prompt; }
-       }
-       if (planningPatterns.test(prompt)) {
-         planningPrompts++;
-         if (isGoodEvidence(prompt) && prompt.length > bestPlanLen) { bestPlanLen = prompt.length; bestPlanPrompt = prompt; }
-       }
-       if (exploratoryPatterns.test(prompt)) {
-         exploratoryPrompts++;
-         if (isGoodEvidence(prompt) && prompt.length > bestExploreLen) { bestExploreLen = prompt.length; bestExplorePrompt = prompt; }
-       }
-       if (boilerplatePatterns.test(prompt)) boilerplatePrompts++;
-       if (testingPatterns.test(prompt)) testingPrompts++;
-
-       // Complexity
-       const sentences = prompt.split(/[.!?]+/).filter(s => s.trim().length > 10);
-       if (prompt.length > 200 && sentences.length >= 2) {
-         complexPrompts++;
-       } else if (prompt.length < 50) {
-         trivialPrompts++;
-       }
-
-       // Tool usage from assistant responses
-       for (const tool of exchange.toolCalls) {
-         totalToolUses++;
-         const toolName = tool.tool || '';
-         if (highLeverageTools.some(t => toolName.includes(t))) {
-           highLeverageToolUses++;
-         }
-         if (codingTools.some(t => toolName.includes(t))) {
-           codingToolUses++;
-         }
-       }
-     }
-   }
-
-   // Score components
-
-   // High-level thinking ratio (architectural + planning + exploratory vs total)
-   const highLevelPrompts = architecturalPrompts + planningPrompts + exploratoryPrompts;
-   const highLevelRatio = totalPrompts > 0 ? highLevelPrompts / totalPrompts : 0;
-   const highLevelScore = Math.min(100, highLevelRatio * 250); // 40%+ = 100
-
-   // Boilerplate ratio (lower is better)
-   const boilerplateRatio = totalPrompts > 0 ? boilerplatePrompts / totalPrompts : 0;
-   const boilerplatePenalty = boilerplateRatio * 60;
-
-   // Complexity ratio
-   const complexRatio = totalPrompts > 0 ? complexPrompts / totalPrompts : 0;
-   const complexScore = Math.min(100, complexRatio * 300);
-
-   // Tool diversity - using AI for research/exploration not just coding
-   const researchToolRatio = totalToolUses > 0 ? highLeverageToolUses / totalToolUses : 0.5;
-   const toolDiversityScore = Math.min(100, researchToolRatio * 200);
-
-   // Testing awareness
-   const testingRatio = totalPrompts > 0 ? testingPrompts / totalPrompts : 0;
-   const testingBonus = Math.min(20, testingRatio * 200);
-
-   const score = Math.round(
-     highLevelScore * 0.35 +
-     (100 - boilerplatePenalty) * 0.2 +
-     complexScore * 0.2 +
-     toolDiversityScore * 0.15 +
-     testingBonus * 0.1
-   );
-
-   // Build examples — pick the best one available
-   const examples = [];
-   if (bestArchPrompt) examples.push({ type: 'architectural', prompt: bestArchPrompt });
-   if (bestPlanPrompt) examples.push({ type: 'planning', prompt: bestPlanPrompt });
-   if (bestExplorePrompt) examples.push({ type: 'exploratory', prompt: bestExplorePrompt });
-
-   // ── Token cost evidence ──
-   // Compare cost of trivial prompts vs complex/architectural prompts
-   let trivialTokens = 0, trivialTokenCount = 0;
-   let complexTokensTotal = 0, complexTokensCount = 0;
-   let boilerplateTokens = 0, boilerplateTokenCount = 0;
-   let archTokens = 0, archTokenCount = 0;
-
-   for (const session of sessions) {
-     for (const exchange of session.exchanges) {
-       const prompt = exchange.userPrompt || '';
-       const t = exchange.tokenUsage;
-       const tokens = t ? (t.inputTokens + t.outputTokens + t.cacheReadTokens + t.cacheCreationTokens) : 0;
-       if (tokens === 0) continue;
-
-       if (prompt.length < 50) { trivialTokens += tokens; trivialTokenCount++; }
-       const sentences = prompt.split(/[.!?]+/).filter(s => s.trim().length > 10);
-       if (prompt.length > 200 && sentences.length >= 2) { complexTokensTotal += tokens; complexTokensCount++; }
-       if (boilerplatePatterns.test(prompt)) { boilerplateTokens += tokens; boilerplateTokenCount++; }
-       if (architecturalPatterns.test(prompt) || planningPatterns.test(prompt)) { archTokens += tokens; archTokenCount++; }
-     }
-   }
-
-   return {
-     score: Math.max(0, Math.min(100, score)),
-     details: {
-       totalPrompts,
-       architecturalPrompts,
-       planningPrompts,
-       exploratoryPrompts,
-       boilerplatePrompts,
-       testingPrompts,
-       highLevelRatio: Math.round(highLevelRatio * 100),
-       complexPromptRatio: Math.round(complexRatio * 100),
-       toolDiversity: {
-         total: totalToolUses,
-         research: highLeverageToolUses,
-         coding: codingToolUses,
-       },
-       tokenEvidence: {
-         avgTokensTrivialPrompt: trivialTokenCount > 0 ? Math.round(trivialTokens / trivialTokenCount) : null,
-         avgTokensComplexPrompt: complexTokensCount > 0 ? Math.round(complexTokensTotal / complexTokensCount) : null,
-         avgTokensBoilerplate: boilerplateTokenCount > 0 ? Math.round(boilerplateTokens / boilerplateTokenCount) : null,
-         avgTokensArchitectural: archTokenCount > 0 ? Math.round(archTokens / archTokenCount) : null,
-       },
-     },
-     examples,
-   };
- }
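
Note: the removed module reads a specific session shape that can be inferred from the code above: each session carries an exchanges array whose entries hold a userPrompt string, a toolCalls list of { tool } objects, and an optional tokenUsage breakdown (inputTokens, outputTokens, cacheReadTokens, cacheCreationTokens). The following is a minimal usage sketch built on that inference; the import path and the sample values are assumptions, not part of the published package.

// Sketch only: illustrates the input shape computeAILeverage expects.
// The './metrics/ai-leverage.js' path is assumed; the file name is not shown in this diff.
import { computeAILeverage } from './metrics/ai-leverage.js';

const sessions = [
  {
    exchanges: [
      {
        userPrompt: 'Help me think through the data model for multi-tenant billing. What are the trade-offs of one schema per tenant?',
        toolCalls: [{ tool: 'Read' }, { tool: 'Grep' }],
        tokenUsage: { inputTokens: 1200, outputTokens: 800, cacheReadTokens: 0, cacheCreationTokens: 0 },
      },
      {
        userPrompt: 'add a button to the settings page',
        toolCalls: [{ tool: 'Edit' }],
        tokenUsage: { inputTokens: 300, outputTokens: 150, cacheReadTokens: 0, cacheCreationTokens: 0 },
      },
    ],
  },
];

const { score, details, examples } = computeAILeverage(sessions);
console.log(score, details.highLevelRatio, examples);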
@@ -1,204 +0,0 @@
- /**
-  * Debug Cycle Efficiency
-  *
-  * Measures how effectively the engineer resolves issues with AI assistance.
-  *
-  * Signals:
-  * - Error/fix loops: user reports error → assistant tries fix → user reports same error
-  * - Number of turns to resolution
-  * - Quality of error context provided (stack traces, specific error messages)
-  * - "it's still broken" vs targeted debug prompts
-  */
-
- // ── Evidence quality filter ──
- // Prompts used as evidence should be human-written, readable, and illustrative.
- // Reject system-generated context, raw log pastes, and extreme lengths.
- const noisePatterns = /^This session is being continued|^\[?[0-9T:.Z-]{20,}|^\S+@\S+.*[%$#>]|^\s*\$\s|^\s*>\s/;
- function isGoodEvidence(prompt) {
-   if (!prompt || prompt.length < 40 || prompt.length > 600) return false;
-   if (noisePatterns.test(prompt)) return false;
-   // Reject if less than 40% of the content is alphabetic (log lines, stack traces, JSON blobs)
-   const alpha = prompt.replace(/[^a-zA-Z]/g, '').length;
-   if (alpha / prompt.length < 0.4) return false;
-   return true;
- }
-
- const errorPatterns = /\b(error|bug|broken|crash|fail|exception|traceback|stack trace|doesn'?t work|not working|issue|problem|wrong)\b/i;
- const vaguePhrases = /^(it'?s? (?:still )?(?:not working|broken|wrong|failing))|^(fix it|try again|still (?:the same|broken|failing|not working))|^(same (?:error|issue|problem|thing))/i;
- const specificDebugPatterns = /\b(line \d+|TypeError|SyntaxError|ImportError|ReferenceError|ValueError|KeyError|AttributeError|NoneType|undefined is not|cannot read prop|stack trace|traceback|\.py:\d+|\.ts:\d+|\.js:\d+|status (?:code )?\d{3}|HTTP \d{3}|ENOENT|EACCES|CORS|404|500|502|503)\b/i;
- const resolutionPatterns = /\b(works|working|fixed|solved|resolved|perfect|great|thanks|nice|awesome|that did it|looks good|ship it)\b/i;
-
- export function computeDebugCycles(sessions) {
-   if (sessions.length === 0) return { score: 50, details: {} };
-
-   let totalDebugSequences = 0;
-   let totalTurnsToResolve = 0;
-   let vagueReports = 0;
-   let specificReports = 0;
-   let unresolvedSequences = 0;
-   let quickFixes = 0; // resolved in 1-2 turns
-   let longLoops = 0; // > 5 turns to resolve
-
-   // Capture representative examples
-   let bestSpecificReport = null; // best specific error report
-   let bestQuickFix = null; // prompt that led to quick resolution
-   let bestSpecificLen = 0;
-
-   for (const session of sessions) {
-     const { exchanges } = session;
-     let inDebugMode = false;
-     let debugTurnCount = 0;
-     let debugStartPrompt = null;
-
-     for (let i = 0; i < exchanges.length; i++) {
-       const prompt = exchanges[i].userPrompt || '';
-
-       if (errorPatterns.test(prompt)) {
-         if (!inDebugMode) {
-           // Starting a new debug sequence
-           inDebugMode = true;
-           debugTurnCount = 1;
-           debugStartPrompt = prompt;
-           totalDebugSequences++;
-         } else {
-           debugTurnCount++;
-         }
-
-         // Check quality of error report
-         if (vaguePhrases.test(prompt)) {
-           vagueReports++;
-         }
-         if (specificDebugPatterns.test(prompt) || prompt.length > 200) {
-           specificReports++;
-           // Track best specific report — require actual debug pattern match
-           // and readable evidence quality
-           if (specificDebugPatterns.test(prompt) && isGoodEvidence(prompt) && prompt.length > bestSpecificLen) {
-             bestSpecificLen = prompt.length;
-             bestSpecificReport = prompt;
-           }
-         }
-       } else if (inDebugMode) {
-         // Check if this exchange resolves the debug
-         if (resolutionPatterns.test(prompt)) {
-           totalTurnsToResolve += debugTurnCount;
-           if (debugTurnCount <= 2) {
-             quickFixes++;
-             if (!bestQuickFix && isGoodEvidence(debugStartPrompt)) bestQuickFix = debugStartPrompt;
-           }
-           if (debugTurnCount > 5) longLoops++;
-           inDebugMode = false;
-           debugTurnCount = 0;
-           debugStartPrompt = null;
-         } else {
-           // Moved on without explicit resolution
-           totalTurnsToResolve += debugTurnCount;
-           unresolvedSequences++;
-           inDebugMode = false;
-           debugTurnCount = 0;
-           debugStartPrompt = null;
-         }
-       }
-     }
-
-     // Handle session ending mid-debug
-     if (inDebugMode) {
-       totalTurnsToResolve += debugTurnCount;
-       if (debugTurnCount > 5) longLoops++;
-     }
-   }
-
-   const avgTurnsToResolve = totalDebugSequences > 0
-     ? totalTurnsToResolve / totalDebugSequences
-     : 0;
-
-   const totalReports = vagueReports + specificReports;
-   const specificRatio = totalReports > 0 ? specificReports / totalReports : 0.5;
-
-   // Score components
-   // Fewer turns to resolve = better
-   const turnsScore = avgTurnsToResolve === 0 ? 70 :
-     avgTurnsToResolve <= 2 ? 95 :
-     avgTurnsToResolve <= 3 ? 85 :
-     avgTurnsToResolve <= 5 ? 65 :
-     avgTurnsToResolve <= 8 ? 40 : 20;
-
-   // More specific reports = better
-   const specificScore = specificRatio * 100;
-
-   // Quick fix ratio
-   const quickFixRatio = totalDebugSequences > 0 ? quickFixes / totalDebugSequences : 0.5;
-   const quickFixScore = quickFixRatio * 100;
-
-   // Long loop penalty
-   const longLoopRatio = totalDebugSequences > 0 ? longLoops / totalDebugSequences : 0;
-   const longLoopPenalty = longLoopRatio * 50;
-
-   const score = Math.round(
-     turnsScore * 0.35 +
-     specificScore * 0.25 +
-     quickFixScore * 0.25 +
-     (100 - longLoopPenalty) * 0.15
-   );
-
-   // Build examples array
-   const examples = [];
-   if (bestSpecificReport) examples.push({ type: 'specific_report', prompt: bestSpecificReport });
-   if (bestQuickFix) examples.push({ type: 'quick_fix', prompt: bestQuickFix });
-
-   // ── Token cost evidence ──
-   // Compare cost of vague vs specific debug exchanges
-   let vagueTokens = 0, vagueTokenCount = 0;
-   let specificTokens = 0, specificTokenCount = 0;
-   let longLoopTokens = 0, longLoopTokenCount = 0;
-   let quickFixTokens = 0, quickFixTokenCount = 0;
-
-   for (const session of sessions) {
-     const { exchanges } = session;
-     let debugExchanges = [];
-     let inDebug = false;
-
-     for (let i = 0; i < exchanges.length; i++) {
-       const prompt = exchanges[i].userPrompt || '';
-       const t = exchanges[i].tokenUsage;
-       const tokens = t ? (t.inputTokens + t.outputTokens + t.cacheReadTokens + t.cacheCreationTokens) : 0;
-
-       if (errorPatterns.test(prompt) && tokens > 0) {
-         if (!inDebug) { inDebug = true; debugExchanges = []; }
-         debugExchanges.push({ prompt, tokens });
-
-         if (vaguePhrases.test(prompt)) { vagueTokens += tokens; vagueTokenCount++; }
-         if (specificDebugPatterns.test(prompt) || prompt.length > 200) { specificTokens += tokens; specificTokenCount++; }
-       } else if (inDebug) {
-         if (debugExchanges.length <= 2) {
-           const total = debugExchanges.reduce((s, e) => s + e.tokens, 0);
-           quickFixTokens += total; quickFixTokenCount++;
-         } else if (debugExchanges.length > 5) {
-           const total = debugExchanges.reduce((s, e) => s + e.tokens, 0);
-           longLoopTokens += total; longLoopTokenCount++;
-         }
-         inDebug = false;
-         debugExchanges = [];
-       }
-     }
-   }
-
-   return {
-     score: Math.max(0, Math.min(100, score)),
-     details: {
-       totalDebugSequences,
-       avgTurnsToResolve: Math.round(avgTurnsToResolve * 10) / 10,
-       quickFixes,
-       longLoops,
-       specificReportRatio: Math.round(specificRatio * 100),
-       vagueReports,
-       specificReports,
-       tokenEvidence: {
-         avgTokensVagueDebug: vagueTokenCount > 0 ? Math.round(vagueTokens / vagueTokenCount) : null,
-         avgTokensSpecificDebug: specificTokenCount > 0 ? Math.round(specificTokens / specificTokenCount) : null,
-         avgTokensQuickFix: quickFixTokenCount > 0 ? Math.round(quickFixTokens / quickFixTokenCount) : null,
-         avgTokensLongLoop: longLoopTokenCount > 0 ? Math.round(longLoopTokens / longLoopTokenCount) : null,
-       },
-     },
-     examples,
-   };
- }
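
A minimal sketch of how computeDebugCycles might be driven, inferred from the code above. Only userPrompt is required per exchange; tokenUsage is optional and feeds the token-cost evidence. The import path and sample prompts are assumptions, not part of the published package.

// Sketch only: one debug sequence opened by a specific error report and closed by a
// resolution prompt on the next turn, so it counts as a quick fix.
import { computeDebugCycles } from './metrics/debug-cycles.js'; // path assumed

const sessions = [
  {
    exchanges: [
      { userPrompt: "Getting an error: TypeError: cannot read prop 'id' of undefined at routes/user.ts:42 when POSTing /users" },
      { userPrompt: 'that fixed it, works now, thanks' },
    ],
  },
];

const { score, details } = computeDebugCycles(sessions);
// details.totalDebugSequences === 1 and details.quickFixes === 1 for this input
console.log(score, details.avgTurnsToResolve);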
@@ -1,158 +0,0 @@
- /**
-  * Decomposition Quality
-  *
-  * Measures whether the engineer breaks complex tasks into subtasks
-  * or dumps everything in single mega-prompts.
-  *
-  * Signals:
-  * - Multi-step sessions (multiple exchanges building on each other) → higher
-  * - Single mega-prompt sessions → lower
-  * - Prompt length distribution (long single prompts = less decomposition)
-  * - Follow-up prompts that reference or build on previous context
-  */
-
- // ── Evidence quality filter ──
- const noisePatterns = /^This session is being continued|^\[?[0-9T:.Z-]{20,}|^\S+@\S+.*[%$#>]|^\s*\$\s|^\s*>\s/;
- function isGoodEvidence(prompt) {
-   if (!prompt || prompt.length < 40 || prompt.length > 600) return false;
-   if (noisePatterns.test(prompt)) return false;
-   const alpha = prompt.replace(/[^a-zA-Z]/g, '').length;
-   if (alpha / prompt.length < 0.4) return false;
-   return true;
- }
-
- export function computeDecomposition(sessions) {
-   if (sessions.length === 0) return { score: 50, details: {} };
-
-   let totalExchanges = 0;
-   let multiStepSessions = 0;
-   let singleShotSessions = 0;
-   let avgPromptLength = 0;
-   let promptCount = 0;
-   let longPromptCount = 0; // > 500 chars
-   let shortPromptCount = 0; // < 100 chars
-   let contextualFollowups = 0; // prompts that start with "now", "next", "also", "then", or otherwise build on previous context
-
-   const followupPatterns = /^(now |next |then |also |and |ok |okay |great |good |perfect |after that|building on|following up|continuing)/i;
-   const refinementPatterns = /^(actually |wait |hmm |instead |change |modify |update |tweak |adjust |fix |but )/i;
-
-   // Capture representative prompt examples.
-   // Collect every good-evidence prompt; a mid-length one is picked later so the
-   // example does not overlap with the longest prompts surfaced by other metrics.
-   const decompCandidates = [];
-   let bestFollowupPrompt = null;
-
-   for (const session of sessions) {
-     const { exchanges } = session;
-     totalExchanges += exchanges.length;
-
-     if (exchanges.length >= 4) {
-       multiStepSessions++;
-     } else if (exchanges.length === 1) {
-       singleShotSessions++;
-     }
-
-     for (let i = 0; i < exchanges.length; i++) {
-       const prompt = exchanges[i].userPrompt || '';
-       const len = prompt.length;
-       promptCount++;
-       avgPromptLength += len;
-
-       if (len > 500) longPromptCount++;
-       if (len < 100) shortPromptCount++;
-
-       // Track decomposition examples (multi-sentence prompts showing task breakdown)
-       if (isGoodEvidence(prompt)) {
-         decompCandidates.push(prompt);
-       }
-
-       // Check for contextual followups (not the first prompt in a session)
-       if (i > 0) {
-         if (followupPatterns.test(prompt) || refinementPatterns.test(prompt)) {
-           contextualFollowups++;
-           // Capture best followup example
-           if (isGoodEvidence(prompt) && (!bestFollowupPrompt || prompt.length > bestFollowupPrompt.length)) {
-             bestFollowupPrompt = prompt;
-           }
-         }
-       }
-     }
-   }
-
-   avgPromptLength = promptCount > 0 ? avgPromptLength / promptCount : 0;
-
-   // Score components (each 0-100)
-   const multiStepRatio = sessions.length > 0 ? multiStepSessions / sessions.length : 0;
-   const multiStepScore = Math.min(100, multiStepRatio * 150); // caps at 100 once ~67% of sessions are multi-step
-
-   const singleShotRatio = sessions.length > 0 ? singleShotSessions / sessions.length : 0;
-   const singleShotPenalty = singleShotRatio * 40; // up to -40
-
-   // Moderate prompt length is good (not too mega, not too terse)
-   const lengthScore = avgPromptLength > 50 && avgPromptLength < 400 ? 80 :
-     avgPromptLength >= 400 && avgPromptLength < 800 ? 60 :
-     avgPromptLength >= 800 ? 30 : 50;
-
-   // Contextual followups show iterative thinking
-   const followupRatio = promptCount > 0 ? contextualFollowups / promptCount : 0;
-   const followupScore = Math.min(100, followupRatio * 300);
-
-   const avgExchangesPerSession = sessions.length > 0 ? totalExchanges / sessions.length : 0;
-   const depthScore = Math.min(100, avgExchangesPerSession * 8); // ~12.5+ exchanges = 100
-
-   const score = Math.round(
-     multiStepScore * 0.25 +
-     (100 - singleShotPenalty) * 0.15 +
-     lengthScore * 0.2 +
-     followupScore * 0.2 +
-     depthScore * 0.2
-   );
-
-   // Build examples array — pick a mid-length prompt to avoid overlap with other metrics
-   const examples = [];
-   if (decompCandidates.length > 0) {
-     decompCandidates.sort((a, b) => b.length - a.length);
-     // Pick roughly a third of the way down the length-sorted list, skipping the
-     // longest prompts (which the ai-leverage metric also tends to surface)
-     const pickIdx = Math.min(Math.floor(decompCandidates.length / 3), decompCandidates.length - 1);
-     examples.push({ type: 'decomposition', prompt: decompCandidates[pickIdx] });
-   }
-   if (bestFollowupPrompt) examples.push({ type: 'followup', prompt: bestFollowupPrompt });
-
-   // ── Token cost evidence ──
-   // Compare per-exchange token cost of single-shot sessions vs multi-step sessions
-   // to show whether decomposition saves tokens
-   let singleShotTokens = 0, singleShotCount = 0;
-   let multiStepTokens = 0, multiStepCount = 0;
-   for (const session of sessions) {
-     const t = session.tokenUsage;
-     if (!t || (t.inputTokens + t.outputTokens + t.cacheReadTokens + t.cacheCreationTokens) === 0) continue;
-     const total = t.inputTokens + t.outputTokens + t.cacheReadTokens + t.cacheCreationTokens;
-     const perExchange = session.exchangeCount > 0 ? total / session.exchangeCount : total;
-     if (session.exchangeCount === 1) {
-       singleShotTokens += perExchange;
-       singleShotCount++;
-     } else if (session.exchangeCount >= 4) {
-       multiStepTokens += perExchange;
-       multiStepCount++;
-     }
-   }
-   const avgTokensSingleShot = singleShotCount > 0 ? Math.round(singleShotTokens / singleShotCount) : null;
-   const avgTokensMultiStep = multiStepCount > 0 ? Math.round(multiStepTokens / multiStepCount) : null;
-
-   return {
-     score: Math.max(0, Math.min(100, score)),
-     details: {
-       totalSessions: sessions.length,
-       multiStepSessions,
-       singleShotSessions,
-       avgExchangesPerSession: Math.round(avgExchangesPerSession * 10) / 10,
-       avgPromptLength: Math.round(avgPromptLength),
-       longPromptRatio: promptCount > 0 ? Math.round(longPromptCount / promptCount * 100) : 0,
-       contextualFollowupRatio: promptCount > 0 ? Math.round(followupRatio * 100) : 0,
-       tokenEvidence: {
-         avgTokensPerExchangeSingleShot: avgTokensSingleShot,
-         avgTokensPerExchangeMultiStep: avgTokensMultiStep,
-       },
-     },
-     examples,
-   };
- }
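
Unlike the other two metrics, computeDecomposition also reads session-level fields (exchangeCount and a session-wide tokenUsage) for its token evidence. A minimal sketch under those assumptions; the import path and sample data are illustrative only.

// Sketch only: a four-exchange session that counts as multi-step, with follow-up
// prompts opening with "now", "then", "also" so they register as contextual followups.
import { computeDecomposition } from './metrics/decomposition.js'; // path assumed

const sessions = [
  {
    exchangeCount: 4,
    tokenUsage: { inputTokens: 4000, outputTokens: 2500, cacheReadTokens: 0, cacheCreationTokens: 0 },
    exchanges: [
      { userPrompt: 'First, sketch the migration plan for moving sessions to Postgres. Just the steps, no code yet.' },
      { userPrompt: 'Now write the schema for step 1 and explain the indexes you chose.' },
      { userPrompt: 'Then generate the data backfill script with batching.' },
      { userPrompt: 'Also add a dry-run flag so we can verify counts before writing.' },
    ],
  },
];

const { score, details } = computeDecomposition(sessions);
// details.multiStepSessions === 1; details.avgExchangesPerSession === 4 for this input
console.log(score, details.contextualFollowupRatio);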