@axplusb/kepler 2.0.0 → 2.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/pulse/app/api/benchmark/route.ts +113 -0
- package/pulse/app/api/benchmarks/route.ts +195 -0
- package/pulse/app/benchmarks/page.tsx +224 -0
- package/pulse/components/layout/bottom-nav.tsx +2 -1
- package/pulse/components/layout/sidebar.tsx +2 -1
- package/src/core/risk-tier.mjs +8 -2
- package/src/core/stream-client.mjs +24 -1
- package/src/core/tool-executor.mjs +9 -2
- package/src/onboarding/preflight.mjs +51 -33
- package/src/terminal/repl.mjs +156 -48
- package/src/terminal/tool-display.mjs +29 -26
- package/src/tools/project-overview.mjs +109 -16
- package/src/ui/tool-card.mjs +27 -9
package/package.json
CHANGED
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
import { NextResponse } from 'next/server'
|
|
2
|
+
import { readFileSync } from 'fs'
|
|
3
|
+
import { join } from 'path'
|
|
4
|
+
|
|
5
|
+
// Next.js route segment config: `force-dynamic` opts this route out of static
// caching so the handler is evaluated on every request.
export const dynamic = 'force-dynamic'
|
|
6
|
+
|
|
7
|
+
/**
 * One evaluated benchmark instance as stored in a run's `harness-results.json`.
 * Property names mirror the JSON keys, hence the snake_case.
 */
interface BenchmarkResult {
  instance_id: string
  /** Repository the instance belongs to; used as the grouping key for per-repo counts. */
  repo: string
  base_commit: string
  test_patch: string
  /** Whether the instance counts as resolved; this flag drives the pass-rate statistics. */
  resolved: boolean
  test_result: { result: Array<string>; exit_code: number }
  metadata: {
    agent_class: string
    model_name: string
    max_iterations: number
    eval_history: { timestamp: string; action: string; observation: string }[]
    submission: string
    instance_id: string
    predict_output: string
    model_patch: string
    test_result: { result: Array<string>; exit_code: number }
  }
}
|
|
36
|
+
|
|
37
|
+
/** Top-level shape expected when a run's `harness-results.json` is parsed. */
interface BenchmarkData {
  /** Every evaluated instance of the run, unpaginated. */
  results: Array<BenchmarkResult>
}
|
|
40
|
+
|
|
41
|
+
/**
 * GET /api/benchmark — aggregate statistics plus one page of raw results for a
 * single benchmark run.
 *
 * Query parameters:
 * - `run`    — directory name of the run (default `swebench-v4-flash-300`).
 * - `limit`  — page size (default 50; malformed or negative values fall back to the default).
 * - `offset` — index of the first result (default 0; malformed or negative values fall back to 0).
 *
 * Responses:
 * - 200 `{ run, stats, pagination, results }`
 * - 400 when `run` is not a plain directory name
 * - 404 when the run's results file cannot be read
 * - 500 for anything else (e.g. a results file that is not valid JSON)
 */
export async function GET(request: Request) {
  // `run` becomes a path segment below, so it must be a single plain directory
  // name: no separators and no leading dot (which also rules out `.` and `..`).
  // Without this check a caller could escape the runs directory.
  const RUN_NAME_RE = /^[A-Za-z0-9][A-Za-z0-9._-]*$/

  // Reads a non-negative integer query parameter, using `fallback` when the
  // value is absent, not a number, or negative.
  const readCount = (raw: string | null, fallback: number): number => {
    const parsed = Number.parseInt(raw ?? '', 10)
    return Number.isNaN(parsed) || parsed < 0 ? fallback : parsed
  }

  try {
    const { searchParams } = new URL(request.url)
    const run = searchParams.get('run') || 'swebench-v4-flash-300'
    const limit = readCount(searchParams.get('limit'), 50)
    const offset = readCount(searchParams.get('offset'), 0)

    if (!RUN_NAME_RE.test(run)) {
      return NextResponse.json(
        { error: 'Invalid benchmark run name' },
        { status: 400 }
      )
    }

    // Results live next to the app directory: <cwd>/../benchmark/results/runs/<run>/.
    // NOTE(review): the /api/benchmarks route resolves the same data WITHOUT the
    // `..` segment — confirm which base directory is correct.
    const resultsPath = join(
      process.cwd(),
      '..',
      'benchmark',
      'results',
      'runs',
      run,
      'harness-results.json'
    )

    let fileContent: string
    try {
      fileContent = readFileSync(resultsPath, 'utf-8')
    } catch {
      return NextResponse.json(
        { error: `Benchmark run "${run}" not found` },
        { status: 404 }
      )
    }

    // A file that exists but is not valid JSON is a server-side problem, so a
    // parse failure propagates to the outer catch (logged, 500) instead of
    // being reported as "not found".
    const parsed: unknown = JSON.parse(fileContent)
    const rawResults = (parsed as Partial<BenchmarkData> | null)?.results
    const results: BenchmarkResult[] = Array.isArray(rawResults) ? rawResults : []

    // Calculate statistics over the whole run, not just the requested page.
    const totalTests = results.length
    const resolvedTests = results.filter((r) => r.resolved).length
    const passRate = totalTests > 0 ? (resolvedTests / totalTests) * 100 : 0

    // Group by repo: total instances and resolved instances per repository.
    const byRepo = new Map<string, number>()
    const byRepoResolved = new Map<string, number>()
    for (const result of results) {
      const repo = result.repo || 'unknown'
      byRepo.set(repo, (byRepo.get(repo) || 0) + 1)
      if (result.resolved) {
        byRepoResolved.set(repo, (byRepoResolved.get(repo) || 0) + 1)
      }
    }

    // Paginate results
    const paginatedResults = results.slice(offset, offset + limit)

    return NextResponse.json({
      run,
      stats: {
        totalTests,
        resolvedTests,
        // Rounded to two decimals but kept numeric for the client.
        passRate: parseFloat(passRate.toFixed(2)),
        byRepo: Object.fromEntries(byRepo),
        byRepoResolved: Object.fromEntries(byRepoResolved),
      },
      pagination: {
        limit,
        offset,
        total: totalTests,
      },
      results: paginatedResults,
    })
  } catch (error) {
    console.error('Benchmark API error:', error)
    return NextResponse.json(
      { error: 'Failed to load benchmark data' },
      { status: 500 }
    )
  }
}
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
import { NextResponse } from 'next/server'
|
|
2
|
+
import fs from 'fs'
|
|
3
|
+
import path from 'path'
|
|
4
|
+
|
|
5
|
+
// Next.js route segment config: `force-dynamic` opts this route out of static
// caching so the handler is evaluated on every request.
export const dynamic = 'force-dynamic'
|
|
6
|
+
|
|
7
|
+
/**
 * One benchmark run record as read from `harness-results.json`.
 * Property names mirror the JSON keys, hence the snake_case.
 */
interface BenchmarkResult {
  instance_id: string
  /** Grouping key for the per-repository statistics and the `repo` query filter. */
  repo: string
  /** Grouping key for the per-model statistics and the `model` query filter. */
  model: string
  timestamp: string
  /** Execution metrics; cost, tokens and duration are summed into the aggregate stats. */
  kepler: {
    /** A run is counted as passed when this equals `'success'`. */
    status: string
    exit_code: number
    duration_seconds: number
    tokens_used: number
    cost: number
    tool_calls: number
    sub_agents: Array<string>
  }
  patch_lines: number
  model_patch: string
  /** Top-level status; tallied into `by_status` and matched by the `status` query filter. */
  status: string
}
|
|
25
|
+
|
|
26
|
+
/** Aggregate figures computed by `calculateStats` and returned in the summary response. */
interface BenchmarkStats {
  total_runs: number
  passed: number
  failed: number
  error: number
  /** Percentage in the range 0–100. */
  success_rate: number
  avg_duration: number
  total_cost: number
  total_tokens: number
  avg_tokens_per_run: number
  /** Number of runs per top-level status string. */
  by_status: Record<string, number>
  /** Per-repository tallies; `success_rate` is a percentage. */
  by_repo: Record<
    string,
    { count: number; passed: number; success_rate: number }
  >
  /** Per-model tallies; `success_rate` is a percentage. */
  by_model: Record<
    string,
    { count: number; passed: number; success_rate: number }
  >
}
|
|
40
|
+
|
|
41
|
+
/**
 * Loads the result records of the `swebench-v4-flash-300` run from disk.
 *
 * Never throws: a missing file, unreadable/malformed JSON (logged), or a file
 * whose `results` member is not an array all yield an empty list. Entries that
 * are not objects are dropped so callers can safely read fields off every item.
 *
 * @returns the run's result records, or `[]` when none can be loaded.
 */
async function loadBenchmarkResults(): Promise<BenchmarkResult[]> {
  // NOTE(review): the /api/benchmark route resolves its data under
  // `process.cwd()/..`, while this one uses `process.cwd()` directly —
  // confirm which base directory is correct.
  const resultsPath = path.join(
    process.cwd(),
    'benchmark/results/runs/swebench-v4-flash-300/harness-results.json'
  )

  try {
    if (!fs.existsSync(resultsPath)) {
      return []
    }

    const parsed: unknown = JSON.parse(fs.readFileSync(resultsPath, 'utf-8'))
    const results = (parsed as { results?: unknown } | null)?.results

    // Guard against a well-formed file with an unexpected shape: the caller
    // immediately calls `.filter` on the return value.
    if (!Array.isArray(results)) {
      return []
    }

    return results.filter(
      (entry): entry is BenchmarkResult => entry !== null && typeof entry === 'object'
    )
  } catch (error) {
    console.error('Error loading benchmark results:', error)
    return []
  }
}
|
|
59
|
+
|
|
60
|
+
/**
 * Aggregates a list of benchmark records into summary statistics.
 *
 * - `passed` counts records whose `kepler.status` is `'success'`.
 * - `failed` / `error` are read from the tally of the top-level `status` field.
 *   NOTE(review): pass and fail counts therefore come from two different
 *   fields — confirm that this is intended.
 * - Rates are percentages (0–100); averages are taken over all records.
 *
 * Tallies are accumulated in `Map`s rather than plain objects so that keys
 * coming from the data file (e.g. a repo named `__proto__` or `constructor`)
 * can never collide with inherited `Object.prototype` members.
 *
 * @param results records to summarise; an empty list yields all-zero stats.
 * @returns the aggregate statistics.
 */
function calculateStats(results: BenchmarkResult[]): BenchmarkStats {
  type Tally = { count: number; passed: number }
  type RatedTally = Tally & { success_rate: number }

  // Returns the tally stored under `key`, creating a zeroed one on first use.
  const tallyFor = (table: Map<string, Tally>, key: string): Tally => {
    let entry = table.get(key)
    if (!entry) {
      entry = { count: 0, passed: 0 }
      table.set(key, entry)
    }
    return entry
  }

  // Converts a tally table into a plain record with a success percentage per key.
  const withRates = (table: Map<string, Tally>): Record<string, RatedTally> =>
    Object.fromEntries(
      Array.from(table, ([key, tally]): [string, RatedTally] => [
        key,
        {
          ...tally,
          success_rate: tally.count > 0 ? (tally.passed / tally.count) * 100 : 0,
        },
      ])
    )

  const statusCounts = new Map<string, number>()
  const repoTallies = new Map<string, Tally>()
  const modelTallies = new Map<string, Tally>()

  let total_cost = 0
  let total_tokens = 0
  let total_duration = 0
  let passed = 0

  for (const result of results) {
    // String() mirrors the property-key coercion a plain object would apply,
    // so a missing field is still bucketed under "undefined".
    const statusKey = String(result.status)
    statusCounts.set(statusKey, (statusCounts.get(statusKey) ?? 0) + 1)

    const repoTally = tallyFor(repoTallies, String(result.repo))
    const modelTally = tallyFor(modelTallies, String(result.model))
    repoTally.count++
    modelTally.count++

    // Records without execution metrics still count as runs but contribute
    // nothing to cost, tokens, duration or the pass count.
    if (!result.kepler) continue

    total_cost += result.kepler.cost || 0
    total_tokens += result.kepler.tokens_used || 0
    total_duration += result.kepler.duration_seconds || 0

    if (result.kepler.status === 'success') {
      passed++
      repoTally.passed++
      modelTally.passed++
    }
  }

  const total_runs = results.length
  // Avoids dividing by zero when there are no records.
  const perRun = (sum: number): number => (total_runs > 0 ? sum / total_runs : 0)

  return {
    total_runs,
    passed,
    failed: statusCounts.get('failed') ?? 0,
    error: statusCounts.get('error') ?? 0,
    success_rate: perRun(passed) * 100,
    avg_duration: perRun(total_duration),
    total_cost,
    total_tokens,
    avg_tokens_per_run: perRun(total_tokens),
    by_status: Object.fromEntries(statusCounts),
    by_repo: withRates(repoTallies),
    by_model: withRates(modelTallies),
  }
}
|
|
155
|
+
|
|
156
|
+
/**
 * GET /api/benchmarks — benchmark results, optionally filtered.
 *
 * Query parameters:
 * - `format` — `detailed` returns the matching records; anything else returns
 *   the summary statistics (default `summary`).
 * - `repo`, `model`, `status` — exact-match filters; empty or absent values
 *   disable the corresponding filter.
 */
export async function GET(request: Request) {
  const query = new URL(request.url).searchParams
  const format = query.get('format') || 'summary'
  const repo = query.get('repo')
  const model = query.get('model')
  const status = query.get('status')

  const everything = await loadBenchmarkResults()

  // A record is kept when it satisfies every filter that was actually supplied.
  const filtered = everything.filter(
    (record) =>
      (!repo || record.repo === repo) &&
      (!model || record.model === model) &&
      (!status || record.status === status)
  )

  if (format === 'detailed') {
    return NextResponse.json({ results: filtered, count: filtered.length })
  }

  // Default: summary format, echoing the filters that were applied.
  return NextResponse.json({
    stats: calculateStats(filtered),
    filters: {
      repo: repo || null,
      model: model || null,
      status: status || null,
    },
  })
}
|
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
'use client'
|
|
2
|
+
|
|
3
|
+
import { useEffect, useState } from 'react'
|
|
4
|
+
import { Card, CardContent, CardDescription, CardHeader, CardTitle } from '@/components/ui/card'
|
|
5
|
+
import { Badge } from '@/components/ui/badge'
|
|
6
|
+
|
|
7
|
+
/** Shape of the `stats` object this page reads from the `/api/benchmarks` response. */
interface BenchmarkStats {
  total_runs: number
  passed: number
  failed: number
  error: number
  /** Rendered with a `%` suffix. */
  success_rate: number
  /** Rendered with an `s` suffix. */
  avg_duration: number
  /** Rendered with a `$` prefix. */
  total_cost: number
  total_tokens: number
  avg_tokens_per_run: number
  /** Run count per status string. */
  by_status: Record<string, number>
  by_repo: Record<
    string,
    { count: number; passed: number; success_rate: number }
  >
  by_model: Record<
    string,
    { count: number; passed: number; success_rate: number }
  >
}
|
|
21
|
+
|
|
22
|
+
/** JSON body this page expects back from `/api/benchmarks`. */
interface BenchmarkResponse {
  stats: BenchmarkStats
  /** Filter values reported alongside the stats; `null` where no filter is set. */
  filters: { repo: string | null; model: string | null; status: string | null }
}
|
|
30
|
+
|
|
31
|
+
/**
 * Benchmarks dashboard page.
 *
 * Fetches the summary from `/api/benchmarks` once on mount and renders four
 * headline metric cards, a status breakdown, and per-repository / per-model
 * success tables (each sorted by run count, largest first). While loading, on
 * failure, or when no data came back, a centered one-line message is shown.
 */
export default function BenchmarksPage() {
  const [data, setData] = useState<BenchmarkResponse | null>(null)
  const [loading, setLoading] = useState(true)
  const [error, setError] = useState<string | null>(null)

  // Runs once after the first render (empty dependency list).
  useEffect(() => {
    void (async () => {
      try {
        const response = await fetch('/api/benchmarks')
        if (!response.ok) {
          throw new Error('Failed to fetch benchmarks')
        }
        setData(await response.json())
      } catch (cause) {
        setError(cause instanceof Error ? cause.message : 'Unknown error')
      } finally {
        setLoading(false)
      }
    })()
  }, [])

  // Placeholder states, checked in priority order: loading, then error, then no data.
  if (loading || error || !data) {
    return (
      <div className="flex items-center justify-center min-h-screen">
        {loading ? (
          <p className="text-muted-foreground">Loading benchmarks...</p>
        ) : error ? (
          <p className="text-destructive">Error: {error}</p>
        ) : (
          <p className="text-muted-foreground">No benchmark data available</p>
        )}
      </div>
    )
  }

  const { stats } = data

  // Headline cards: title, large value, and an optional small caption.
  const metricCards = [
    { title: 'Total Runs', value: <>{stats.total_runs}</>, caption: null },
    {
      title: 'Success Rate',
      value: <>{stats.success_rate.toFixed(1)}%</>,
      caption: <>{stats.passed} passed, {stats.failed} failed</>,
    },
    {
      title: 'Total Cost',
      value: <>${stats.total_cost.toFixed(2)}</>,
      caption: <>{stats.avg_tokens_per_run.toFixed(0)} tokens/run</>,
    },
    {
      title: 'Avg Duration',
      value: <>{stats.avg_duration.toFixed(1)}s</>,
      caption: <>{(stats.total_tokens / 1000).toFixed(1)}K tokens total</>,
    },
  ]

  // Badge colour for a status: success → default, failed → destructive, else secondary.
  const badgeVariant = (status: string): 'default' | 'destructive' | 'secondary' => {
    if (status === 'success') return 'default'
    if (status === 'failed') return 'destructive'
    return 'secondary'
  }

  // Shared layout for the per-repository and per-model tables.
  const rateCard = (
    title: string,
    description: string,
    table: BenchmarkStats['by_repo']
  ) => (
    <Card>
      <CardHeader>
        <CardTitle>{title}</CardTitle>
        <CardDescription>{description}</CardDescription>
      </CardHeader>
      <CardContent>
        <div className="space-y-4">
          {Object.entries(table)
            .sort(([, left], [, right]) => right.count - left.count)
            .map(([name, entry]) => (
              <div key={name} className="flex items-center justify-between border-b pb-3 last:border-0">
                <div>
                  <p className="font-medium text-sm">{name}</p>
                  <p className="text-xs text-muted-foreground">
                    {entry.count} runs, {entry.passed} passed
                  </p>
                </div>
                <div className="text-right">
                  <p className="font-bold text-sm">{entry.success_rate.toFixed(1)}%</p>
                </div>
              </div>
            ))}
        </div>
      </CardContent>
    </Card>
  )

  return (
    <div className="space-y-6 p-6">
      <div>
        <h1 className="text-3xl font-bold tracking-tight">Benchmarks</h1>
        <p className="text-muted-foreground mt-2">SWE-Bench v4 Flash 300 Results</p>
      </div>

      {/* Key Metrics */}
      <div className="grid gap-4 md:grid-cols-2 lg:grid-cols-4">
        {metricCards.map(({ title, value, caption }) => (
          <Card key={title}>
            <CardHeader className="pb-2">
              <CardTitle className="text-sm font-medium">{title}</CardTitle>
            </CardHeader>
            <CardContent>
              <div className="text-2xl font-bold">{value}</div>
              {caption && <p className="text-xs text-muted-foreground mt-1">{caption}</p>}
            </CardContent>
          </Card>
        ))}
      </div>

      {/* Status Breakdown */}
      <Card>
        <CardHeader>
          <CardTitle>Status Breakdown</CardTitle>
          <CardDescription>Distribution of run statuses</CardDescription>
        </CardHeader>
        <CardContent>
          <div className="space-y-3">
            {Object.entries(stats.by_status).map(([status, count]) => (
              <div key={status} className="flex items-center justify-between">
                <div className="flex items-center gap-2">
                  <Badge variant={badgeVariant(status)}>{status}</Badge>
                  <span className="text-sm text-muted-foreground">{count} runs</span>
                </div>
                <span className="text-sm font-medium">
                  {((count / stats.total_runs) * 100).toFixed(1)}%
                </span>
              </div>
            ))}
          </div>
        </CardContent>
      </Card>

      {/* By Repository */}
      {rateCard(
        'Performance by Repository',
        'Success rate and run count per repository',
        stats.by_repo
      )}

      {/* By Model */}
      {rateCard(
        'Performance by Model',
        'Success rate and run count per model',
        stats.by_model
      )}
    </div>
  )
}
|
|
@@ -4,7 +4,7 @@ import Link from 'next/link'
|
|
|
4
4
|
import { usePathname } from 'next/navigation'
|
|
5
5
|
import {
|
|
6
6
|
LayoutDashboard, MessageSquare, DollarSign,
|
|
7
|
-
FolderOpen, Activity, Moon, Sun,
|
|
7
|
+
FolderOpen, Activity, Moon, Sun, Zap,
|
|
8
8
|
} from 'lucide-react'
|
|
9
9
|
import { useTheme } from '@/components/theme-provider'
|
|
10
10
|
import { cn } from '@/lib/utils'
|
|
@@ -15,6 +15,7 @@ const NAV = [
|
|
|
15
15
|
{ href: '/costs', label: 'Costs', icon: DollarSign },
|
|
16
16
|
{ href: '/projects', label: 'Projects', icon: FolderOpen },
|
|
17
17
|
{ href: '/activity', label: 'Activity', icon: Activity },
|
|
18
|
+
{ href: '/benchmarks', label: 'Benchmarks', icon: Zap },
|
|
18
19
|
]
|
|
19
20
|
|
|
20
21
|
export function BottomNav() {
|
|
@@ -5,7 +5,7 @@ import { usePathname } from 'next/navigation'
|
|
|
5
5
|
import {
|
|
6
6
|
LayoutDashboard, FolderOpen, MessageSquare, DollarSign,
|
|
7
7
|
Wrench, Activity, History, CheckSquare, FileText,
|
|
8
|
-
Brain, Settings, Download, HelpCircle, Moon, Sun, PanelLeftClose, PanelLeft,
|
|
8
|
+
Brain, Settings, Download, HelpCircle, Moon, Sun, PanelLeftClose, PanelLeft, Zap,
|
|
9
9
|
} from 'lucide-react'
|
|
10
10
|
import { useTheme } from '@/components/theme-provider'
|
|
11
11
|
import { useSidebar } from '@/components/layout/sidebar-context'
|
|
@@ -24,6 +24,7 @@ const NAV = [
|
|
|
24
24
|
{ href: '/todos', label: 'Todos', icon: CheckSquare },
|
|
25
25
|
{ href: '/plans', label: 'Plans', icon: FileText },
|
|
26
26
|
{ href: '/memory', label: 'Memory', icon: Brain },
|
|
27
|
+
{ href: '/benchmarks', label: 'Benchmarks', icon: Zap },
|
|
27
28
|
{ href: '/settings', label: 'Settings', icon: Settings },
|
|
28
29
|
{ href: '/help', label: 'Help', icon: HelpCircle },
|
|
29
30
|
{ href: '/export', label: 'Export', icon: Download },
|
package/src/core/risk-tier.mjs
CHANGED
|
@@ -73,8 +73,14 @@ const NETWORK_TOOLS = new Set([
|
|
|
73
73
|
// ── Shell sub-classifier ────────────────────────────────────────────────
|
|
74
74
|
|
|
75
75
|
const SHELL_SAFE_RE = [
|
|
76
|
-
// Inspection / read-only
|
|
77
|
-
|
|
76
|
+
// Inspection / read-only + harmless shell navigation built-ins.
|
|
77
|
+
// `cd` / `pushd` / `popd` only change the process working directory; if
|
|
78
|
+
// chained with something dangerous, the multi-segment classifier still
|
|
79
|
+
// catches the danger (`cd /x && rm -rf .` → SHELL_DANGEROUS).
|
|
80
|
+
/^\s*(cd|pushd|popd|ls|cat|head|tail|less|more|wc|file|stat|tree|find|grep|rg|ag|fd|echo|printf|pwd|whoami|date|which|type|env|printenv|uname|hostname|id|df|du|uptime|free|top|ps|lsof)\b/i,
|
|
81
|
+
// mkdir -p / touch are creation primitives but harmless in scope.
|
|
82
|
+
/^\s*mkdir\s+-p\b/i,
|
|
83
|
+
/^\s*touch\s/i,
|
|
78
84
|
/^\s*git\s+(status|log|diff|show|branch|tag|remote|stash\s+list|blame|shortlog|describe|rev-parse|ls-files|ls-tree|config\s+--get)\b/i,
|
|
79
85
|
// Test-only invocations
|
|
80
86
|
/^\s*(npm|pnpm|yarn)\s+(test|run\s+test|run\s+lint|list|ls|view|info|outdated)\b/i,
|
|
@@ -93,14 +93,23 @@ export class TarangStreamClient {
|
|
|
93
93
|
};
|
|
94
94
|
if (this.token) headers['Authorization'] = `Bearer ${this.token}`;
|
|
95
95
|
|
|
96
|
+
// Abort controller so cancel() can break out of a stalled reader
|
|
97
|
+
// instead of waiting for the next SSE event to notice _cancelled.
|
|
98
|
+
this._abort = new AbortController();
|
|
99
|
+
|
|
96
100
|
let response;
|
|
97
101
|
try {
|
|
98
102
|
response = await fetch(url, {
|
|
99
103
|
method: 'POST',
|
|
100
104
|
headers,
|
|
101
105
|
body: JSON.stringify(body),
|
|
106
|
+
signal: this._abort.signal,
|
|
102
107
|
});
|
|
103
108
|
} catch (err) {
|
|
109
|
+
if (err.name === 'AbortError') {
|
|
110
|
+
yield { type: EVENT_TYPES.STATUS, data: { message: 'Cancelled by user.' } };
|
|
111
|
+
return;
|
|
112
|
+
}
|
|
104
113
|
yield { type: EVENT_TYPES.ERROR, data: { message: `Network error: ${err.message}. Check your connection or use --local mode.`, fatal: true } };
|
|
105
114
|
return;
|
|
106
115
|
}
|
|
@@ -175,7 +184,15 @@ export class TarangStreamClient {
|
|
|
175
184
|
|
|
176
185
|
try {
|
|
177
186
|
while (true) {
|
|
178
|
-
|
|
187
|
+
let read;
|
|
188
|
+
try {
|
|
189
|
+
read = await reader.read();
|
|
190
|
+
} catch (err) {
|
|
191
|
+
// Aborted via cancel() — treat as a clean end-of-stream.
|
|
192
|
+
if (err && (err.name === 'AbortError' || this._cancelled)) break;
|
|
193
|
+
throw err;
|
|
194
|
+
}
|
|
195
|
+
const { done, value } = read;
|
|
179
196
|
if (done) break;
|
|
180
197
|
|
|
181
198
|
buffer += decoder.decode(value, { stream: true });
|
|
@@ -335,6 +352,7 @@ export class TarangStreamClient {
|
|
|
335
352
|
/** Cancel the current stream. */
|
|
336
353
|
async cancel() {
|
|
337
354
|
this._cancelled = true;
|
|
355
|
+
// Best-effort backend POST — the stream may already be torn down.
|
|
338
356
|
if (this.currentTaskId) {
|
|
339
357
|
try {
|
|
340
358
|
await fetch(`${this.baseUrl}/api/cancel/${this.currentTaskId}`, {
|
|
@@ -343,6 +361,11 @@ export class TarangStreamClient {
|
|
|
343
361
|
});
|
|
344
362
|
} catch { /* best effort */ }
|
|
345
363
|
}
|
|
364
|
+
// Force the in-flight SSE reader to abort so the REPL returns to the
|
|
365
|
+
// prompt immediately instead of waiting on a parked reader.read().
|
|
366
|
+
if (this._abort) {
|
|
367
|
+
try { this._abort.abort(); } catch {}
|
|
368
|
+
}
|
|
346
369
|
}
|
|
347
370
|
|
|
348
371
|
/** Pause the current stream. */
|
|
@@ -91,6 +91,12 @@ export function createToolExecutor({
|
|
|
91
91
|
'.rs': (file) => `rustfmt --check "${file}" 2>&1`,
|
|
92
92
|
};
|
|
93
93
|
|
|
94
|
+
// tsc --pretty and eslint emit ANSI codes (including background-red
|
|
95
|
+
// highlights) which bleed when our renderer slices the first 80 chars.
|
|
96
|
+
// Strip color codes so the stored lint string is always plain text.
|
|
97
|
+
// Matches an ANSI CSI sequence: ESC, `[`, any run of digits/semicolons, then one letter.
const ANSI_RE = /\x1b\[[0-9;]*[a-zA-Z]/g;

/** Coerces `s` to a string (falsy input becomes '') and removes every ANSI_RE match. */
function stripAnsi(s) {
  const text = s ? String(s) : '';
  return text.replace(ANSI_RE, '');
}
|
|
99
|
+
|
|
94
100
|
function autoLint(filePath) {
|
|
95
101
|
const ext = path.extname(filePath);
|
|
96
102
|
const cmdFn = LINT_COMMANDS[ext];
|
|
@@ -102,13 +108,14 @@ export function createToolExecutor({
|
|
|
102
108
|
timeout: 15_000,
|
|
103
109
|
cwd: process.cwd(),
|
|
104
110
|
stdio: ['pipe', 'pipe', 'pipe'],
|
|
111
|
+
env: { ...process.env, FORCE_COLOR: '0', NO_COLOR: '1', TERM: 'dumb' },
|
|
105
112
|
});
|
|
106
|
-
const trimmed = output.trim();
|
|
113
|
+
const trimmed = stripAnsi(output).trim();
|
|
107
114
|
if (!trimmed) return null;
|
|
108
115
|
return trimmed;
|
|
109
116
|
} catch (err) {
|
|
110
117
|
// Non-zero exit means lint errors found
|
|
111
|
-
const output = (err.stderr || err.stdout || '').trim();
|
|
118
|
+
const output = stripAnsi(err.stderr || err.stdout || '').trim();
|
|
112
119
|
if (!output) return null;
|
|
113
120
|
return output;
|
|
114
121
|
}
|