npm - @axplusb/kepler - Versions diffs - 2.0.0 → 2.0.3 - Mend

@axplusb/kepler 2.0.0 → 2.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

package/package.json +1 -1
package/pulse/app/api/benchmark/route.ts +113 -0
package/pulse/app/api/benchmarks/route.ts +195 -0
package/pulse/app/benchmarks/page.tsx +224 -0
package/pulse/components/layout/bottom-nav.tsx +2 -1
package/pulse/components/layout/sidebar.tsx +2 -1
package/src/core/risk-tier.mjs +8 -2
package/src/core/stream-client.mjs +24 -1
package/src/core/tool-executor.mjs +9 -2
package/src/onboarding/preflight.mjs +51 -33
package/src/terminal/repl.mjs +156 -48
package/src/terminal/tool-display.mjs +29 -26
package/src/tools/project-overview.mjs +109 -16
package/src/ui/tool-card.mjs +27 -9

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@axplusb/kepler",
-  "version": "2.0.0",
+  "version": "2.0.3",
   "description": "Kepler — AI coding agent with operating brief, preflight planning, and sub-agents. SWE-bench Lite evaluated.",
   "type": "module",
   "bin": {

package/pulse/app/api/benchmark/route.ts ADDED Viewed

@@ -0,0 +1,113 @@
+import { NextResponse } from 'next/server'
+import { readFileSync } from 'fs'
+import { join } from 'path'
+export const dynamic = 'force-dynamic'
+interface BenchmarkResult { // One element of `results` in a run's harness-results.json; this route only reads `repo` and `resolved`
+  instance_id: string
+  repo: string // grouping key for the per-repo counts; a falsy value is counted under 'unknown'
+  base_commit: string
+  test_patch: string
+  resolved: boolean // truthy entries count toward resolvedTests and passRate
+  test_result: {
+    result: string[]
+    exit_code: number
+  }
+  metadata: { // NOTE(review): never read by this route — shape is taken on trust from the harness output; confirm
+    agent_class: string
+    model_name: string
+    max_iterations: number
+    eval_history: Array<{
+      timestamp: string
+      action: string
+      observation: string
+    }>
+    submission: string
+    instance_id: string
+    predict_output: string
+    model_patch: string
+    test_result: {
+      result: string[]
+      exit_code: number
+    }
+  }
+}
+interface BenchmarkData { // Top-level shape GET assumes for the parsed file (JSON.parse result is assigned without validation)
+  results: BenchmarkResult[] // GET substitutes [] when this is missing or falsy
+}
+/** A run name must be a single directory name: no path separators and no leading dot. */
+const RUN_NAME_RE = /^[A-Za-z0-9][A-Za-z0-9._-]*$/
+/**
+ * Parses a non-negative integer query parameter.
+ * Returns `fallback` when the value is absent, empty, not numeric, or negative.
+ */
+function parseNonNegativeInt(raw: string | null, fallback: number): number {
+  const parsed = Number.parseInt(raw ?? '', 10)
+  return Number.isNaN(parsed) || parsed < 0 ? fallback : parsed
+}
+/**
+ * GET handler: returns aggregate statistics plus one page of results for a
+ * benchmark run read from `../benchmark/results/runs/<run>/harness-results.json`
+ * (relative to `process.cwd()`).
+ *
+ * Query parameters:
+ * - `run`    — run directory name (default `swebench-v4-flash-300`)
+ * - `limit`  — page size (default 50)
+ * - `offset` — index of the first result returned (default 0)
+ *
+ * Responses:
+ * - 200 `{ run, stats, pagination, results }`
+ * - 400 when `run` is not a plain directory name
+ * - 404 when the run's results file cannot be read or parsed
+ * - 500 on any other failure
+ */
+export async function GET(request: Request) {
+  try {
+    const { searchParams } = new URL(request.url)
+    const run = searchParams.get('run') || 'swebench-v4-flash-300'
+    // `run` is untrusted and becomes a path segment below. Without this check
+    // a value such as `../../..` would point the read outside the runs directory.
+    if (!RUN_NAME_RE.test(run)) {
+      return NextResponse.json(
+        { error: 'Invalid benchmark run name' },
+        { status: 400 }
+      )
+    }
+    // Malformed or negative values fall back to the defaults instead of
+    // reaching slice() as NaN / negative indices.
+    const limit = parseNonNegativeInt(searchParams.get('limit'), 50)
+    const offset = parseNonNegativeInt(searchParams.get('offset'), 0)
+    // Load benchmark results from file
+    const resultsPath = join(
+      process.cwd(),
+      '..',
+      'benchmark',
+      'results',
+      'runs',
+      run,
+      'harness-results.json'
+    )
+    let data: BenchmarkData
+    try {
+      const fileContent = readFileSync(resultsPath, 'utf-8')
+      data = JSON.parse(fileContent)
+    } catch {
+      // Missing, unreadable and unparsable files are all reported as "not found".
+      return NextResponse.json(
+        { error: `Benchmark run "${run}" not found` },
+        { status: 404 }
+      )
+    }
+    // Calculate statistics. The file content is unvalidated JSON, so tolerate
+    // a non-object payload or a non-array `results` rather than throwing.
+    const results = Array.isArray(data?.results) ? data.results : []
+    const totalTests = results.length
+    const resolvedTests = results.filter((r) => r.resolved).length
+    const passRate = totalTests > 0 ? (resolvedTests / totalTests) * 100 : 0
+    // Group by repo
+    const byRepo = new Map<string, number>()
+    const byRepoResolved = new Map<string, number>()
+    for (const result of results) {
+      const repo = result.repo || 'unknown'
+      byRepo.set(repo, (byRepo.get(repo) || 0) + 1)
+      if (result.resolved) {
+        byRepoResolved.set(repo, (byRepoResolved.get(repo) || 0) + 1)
+      }
+    }
+    // Paginate results
+    const paginatedResults = results.slice(offset, offset + limit)
+    return NextResponse.json({
+      run,
+      stats: {
+        totalTests,
+        resolvedTests,
+        // Rounded to two decimals but kept numeric.
+        passRate: parseFloat(passRate.toFixed(2)),
+        byRepo: Object.fromEntries(byRepo),
+        byRepoResolved: Object.fromEntries(byRepoResolved),
+      },
+      pagination: {
+        limit,
+        offset,
+        total: totalTests,
+      },
+      results: paginatedResults,
+    })
+  } catch (error) {
+    console.error('Benchmark API error:', error)
+    return NextResponse.json(
+      { error: 'Failed to load benchmark data' },
+      { status: 500 }
+    )
+  }
+}

package/pulse/app/api/benchmarks/route.ts ADDED Viewed

@@ -0,0 +1,195 @@
+import { NextResponse } from 'next/server'
+import fs from 'fs'
+import path from 'path'
+export const dynamic = 'force-dynamic'
+interface BenchmarkResult { // One element of `results` in harness-results.json as consumed by this route
+  instance_id: string
+  repo: string // matched by the ?repo= filter; key of stats.by_repo
+  model: string // matched by the ?model= filter; key of stats.by_model
+  timestamp: string
+  kepler: { // per-run metrics; calculateStats skips the aggregation below when this is falsy
+    status: string // the value 'success' is what counts a run as passed
+    exit_code: number
+    duration_seconds: number // summed to compute avg_duration
+    tokens_used: number // summed into total_tokens
+    cost: number // summed into total_cost
+    tool_calls: number
+    sub_agents: string[]
+  }
+  patch_lines: number
+  model_patch: string
+  status: string // matched by the ?status= filter; key of stats.by_status (separate from kepler.status)
+}
+interface BenchmarkStats { // Summary object built by calculateStats
+  total_runs: number // number of results passed to calculateStats
+  passed: number // runs whose kepler.status === 'success'
+  failed: number // by_status['failed'], 0 when absent
+  error: number // by_status['error'], 0 when absent
+  success_rate: number // percentage: passed / total_runs * 100 (0 for an empty input)
+  avg_duration: number // sum of kepler.duration_seconds divided by total_runs
+  total_cost: number
+  total_tokens: number
+  avg_tokens_per_run: number // total_tokens / total_runs
+  by_status: Record<string, number> // run count per top-level `status` value
+  by_repo: Record<string, { count: number; passed: number; success_rate: number }> // success_rate is a percentage
+  by_model: Record<string, { count: number; passed: number; success_rate: number }> // success_rate is a percentage
+}
+/**
+ * Reads the `results` array of the `swebench-v4-flash-300` run from
+ * `benchmark/results/runs/.../harness-results.json` under `process.cwd()`.
+ *
+ * Never rejects: resolves to `[]` when the file does not exist, when the
+ * parsed JSON has no truthy `results`, or when reading/parsing throws (the
+ * error is logged).
+ *
+ * NOTE(review): the sibling `/api/benchmark` route resolves the same
+ * directory from the parent of `process.cwd()` — confirm which base is right.
+ */
+async function loadBenchmarkResults(): Promise<BenchmarkResult[]> {
+  try {
+    const harnessFile = path.join(
+      process.cwd(),
+      'benchmark/results/runs/swebench-v4-flash-300/harness-results.json'
+    )
+    if (fs.existsSync(harnessFile)) {
+      const parsed = JSON.parse(fs.readFileSync(harnessFile, 'utf-8'))
+      return parsed.results || []
+    }
+    return []
+  } catch (error) {
+    console.error('Error loading benchmark results:', error)
+    return []
+  }
+}
+/**
+ * Aggregates a list of benchmark results into summary statistics.
+ *
+ * - `passed` counts runs whose `kepler.status` is `'success'`; `failed` and
+ *   `error` are taken from the top-level `status` tallies.
+ * - All `success_rate` values are percentages.
+ * - An empty input yields an all-zero summary with empty breakdowns.
+ *
+ * @param results results to summarise (already filtered by the caller)
+ * @returns totals, averages and per-status / per-repo / per-model breakdowns
+ */
+function calculateStats(results: BenchmarkResult[]): BenchmarkStats {
+  const runCount = results.length
+  if (runCount === 0) {
+    return {
+      total_runs: 0,
+      passed: 0,
+      failed: 0,
+      error: 0,
+      success_rate: 0,
+      avg_duration: 0,
+      total_cost: 0,
+      total_tokens: 0,
+      avg_tokens_per_run: 0,
+      by_status: {},
+      by_repo: {},
+      by_model: {},
+    }
+  }
+  type Tally = { count: number; passed: number }
+  type RatedTally = Tally & { success_rate: number }
+  const statusCounts: Record<string, number> = {}
+  const repoTallies: Record<string, Tally> = {}
+  const modelTallies: Record<string, Tally> = {}
+  let costSum = 0
+  let tokenSum = 0
+  let durationSum = 0
+  let successCount = 0
+  for (const entry of results) {
+    statusCounts[entry.status] = (statusCounts[entry.status] || 0) + 1
+    if (!repoTallies[entry.repo]) {
+      repoTallies[entry.repo] = { count: 0, passed: 0 }
+    }
+    repoTallies[entry.repo].count++
+    if (!modelTallies[entry.model]) {
+      modelTallies[entry.model] = { count: 0, passed: 0 }
+    }
+    modelTallies[entry.model].count++
+    // Entries without a `kepler` section are counted but contribute no metrics.
+    const metrics = entry.kepler
+    if (!metrics) {
+      continue
+    }
+    costSum += metrics.cost || 0
+    tokenSum += metrics.tokens_used || 0
+    durationSum += metrics.duration_seconds || 0
+    if (metrics.status === 'success') {
+      successCount++
+      repoTallies[entry.repo].passed++
+      modelTallies[entry.model].passed++
+    }
+  }
+  // Attach a percentage success rate to every tally in a breakdown.
+  const withRates = (tallies: Record<string, Tally>): Record<string, RatedTally> => {
+    const rated: Record<string, RatedTally> = {}
+    for (const [key, tally] of Object.entries(tallies)) {
+      rated[key] = {
+        ...tally,
+        success_rate: tally.count > 0 ? (tally.passed / tally.count) * 100 : 0,
+      }
+    }
+    return rated
+  }
+  return {
+    total_runs: runCount,
+    passed: successCount,
+    failed: statusCounts['failed'] || 0,
+    error: statusCounts['error'] || 0,
+    success_rate: (successCount / runCount) * 100,
+    avg_duration: durationSum / runCount,
+    total_cost: costSum,
+    total_tokens: tokenSum,
+    avg_tokens_per_run: tokenSum / runCount,
+    by_status: statusCounts,
+    by_repo: withRates(repoTallies),
+    by_model: withRates(modelTallies),
+  }
+}
+/**
+ * GET handler: serves the loaded benchmark results, optionally narrowed by
+ * exact-match `repo`, `model` and `status` query parameters.
+ *
+ * - `format=detailed` responds with `{ results, count }` (the matching rows).
+ * - Any other `format` (default `summary`) responds with `{ stats, filters }`,
+ *   where `filters` echoes the applied filters with `null` for unset ones.
+ */
+export async function GET(request: Request) {
+  const query = new URL(request.url).searchParams
+  const format = query.get('format') || 'summary'
+  const repo = query.get('repo')
+  const model = query.get('model')
+  const status = query.get('status')
+  // An absent or empty filter matches every result.
+  const matching = (await loadBenchmarkResults()).filter(
+    (r) =>
+      (!repo || r.repo === repo) &&
+      (!model || r.model === model) &&
+      (!status || r.status === status)
+  )
+  if (format === 'detailed') {
+    return NextResponse.json({
+      results: matching,
+      count: matching.length,
+    })
+  }
+  // Default: summary format
+  return NextResponse.json({
+    stats: calculateStats(matching),
+    filters: {
+      repo: repo || null,
+      model: model || null,
+      status: status || null,
+    },
+  })
+}

package/pulse/app/benchmarks/page.tsx ADDED Viewed

@@ -0,0 +1,224 @@
+'use client'
+import { useEffect, useState } from 'react'
+import { Card, CardContent, CardDescription, CardHeader, CardTitle } from '@/components/ui/card'
+import { Badge } from '@/components/ui/badge'
+interface BenchmarkStats { // Same field set as BenchmarkStats in app/api/benchmarks/route.ts, which produces this payload
+  total_runs: number // also the denominator for the status-breakdown percentages
+  passed: number
+  failed: number
+  error: number
+  success_rate: number // rendered with a trailing '%'
+  avg_duration: number // rendered with a trailing 's'
+  total_cost: number // rendered with a leading '$'
+  total_tokens: number // rendered divided by 1000 with a 'K' suffix
+  avg_tokens_per_run: number
+  by_status: Record<string, number> // one badge row per key
+  by_repo: Record<string, { count: number; passed: number; success_rate: number }> // listed by descending count
+  by_model: Record<string, { count: number; passed: number; success_rate: number }> // listed by descending count
+}
+interface BenchmarkResponse { // JSON body this page expects from GET /api/benchmarks (response is not validated)
+  stats: BenchmarkStats
+  filters: { // present in the payload but not read by this page
+    repo: string | null
+    model: string | null
+    status: string | null
+  }
+}
+type GroupStats = { count: number; passed: number; success_rate: number }
+/** Small headline card: a title, a large value and an optional caption below it. */
+function MetricCard({ title, value, detail }: { title: string; value: string | number; detail?: string }) {
+  return (
+    <Card>
+      <CardHeader className="pb-2">
+        <CardTitle className="text-sm font-medium">{title}</CardTitle>
+      </CardHeader>
+      <CardContent>
+        <div className="text-2xl font-bold">{value}</div>
+        {detail !== undefined && (
+          <p className="text-xs text-muted-foreground mt-1">{detail}</p>
+        )}
+      </CardContent>
+    </Card>
+  )
+}
+/** Card with one row per group (repository or model), largest run count first. */
+function GroupBreakdownCard({
+  title,
+  description,
+  groups,
+}: {
+  title: string
+  description: string
+  groups: Record<string, GroupStats>
+}) {
+  return (
+    <Card>
+      <CardHeader>
+        <CardTitle>{title}</CardTitle>
+        <CardDescription>{description}</CardDescription>
+      </CardHeader>
+      <CardContent>
+        <div className="space-y-4">
+          {Object.entries(groups)
+            .sort((a, b) => b[1].count - a[1].count)
+            .map(([name, group]) => (
+              <div key={name} className="flex items-center justify-between border-b pb-3 last:border-0">
+                <div>
+                  <p className="font-medium text-sm">{name}</p>
+                  <p className="text-xs text-muted-foreground">
+                    {group.count} runs, {group.passed} passed
+                  </p>
+                </div>
+                <div className="text-right">
+                  <p className="font-bold text-sm">{group.success_rate.toFixed(1)}%</p>
+                </div>
+              </div>
+            ))}
+        </div>
+      </CardContent>
+    </Card>
+  )
+}
+/**
+ * Benchmarks dashboard page.
+ *
+ * Fetches the summary from `/api/benchmarks` once on mount and renders
+ * headline metrics, a per-status breakdown, and per-repository / per-model
+ * success rates. Shows a loading, error or empty placeholder until data is
+ * available.
+ */
+export default function BenchmarksPage() {
+  const [data, setData] = useState<BenchmarkResponse | null>(null)
+  const [loading, setLoading] = useState(true)
+  const [error, setError] = useState<string | null>(null)
+  useEffect(() => {
+    // Abort the request on unmount (or StrictMode re-mount) so a stale
+    // response can never update state of an effect that was torn down.
+    const controller = new AbortController()
+    const fetchBenchmarks = async () => {
+      try {
+        const response = await fetch('/api/benchmarks', { signal: controller.signal })
+        if (!response.ok) {
+          throw new Error('Failed to fetch benchmarks')
+        }
+        const json: BenchmarkResponse = await response.json()
+        if (!controller.signal.aborted) {
+          setData(json)
+        }
+      } catch (err) {
+        // An abort rejects the fetch; that is not an error worth showing.
+        if (!controller.signal.aborted) {
+          setError(err instanceof Error ? err.message : 'Unknown error')
+        }
+      } finally {
+        if (!controller.signal.aborted) {
+          setLoading(false)
+        }
+      }
+    }
+    fetchBenchmarks()
+    return () => controller.abort()
+  }, [])
+  if (loading) {
+    return (
+      <div className="flex items-center justify-center min-h-screen">
+        <p className="text-muted-foreground">Loading benchmarks...</p>
+      </div>
+    )
+  }
+  if (error) {
+    return (
+      <div className="flex items-center justify-center min-h-screen">
+        <p className="text-destructive">Error: {error}</p>
+      </div>
+    )
+  }
+  if (!data) {
+    return (
+      <div className="flex items-center justify-center min-h-screen">
+        <p className="text-muted-foreground">No benchmark data available</p>
+      </div>
+    )
+  }
+  const stats = data.stats
+  return (
+    <div className="space-y-6 p-6">
+      <div>
+        <h1 className="text-3xl font-bold tracking-tight">Benchmarks</h1>
+        <p className="text-muted-foreground mt-2">SWE-Bench v4 Flash 300 Results</p>
+      </div>
+      {/* Key Metrics */}
+      <div className="grid gap-4 md:grid-cols-2 lg:grid-cols-4">
+        <MetricCard title="Total Runs" value={stats.total_runs} />
+        <MetricCard
+          title="Success Rate"
+          value={`${stats.success_rate.toFixed(1)}%`}
+          detail={`${stats.passed} passed, ${stats.failed} failed`}
+        />
+        <MetricCard
+          title="Total Cost"
+          value={`$${stats.total_cost.toFixed(2)}`}
+          detail={`${stats.avg_tokens_per_run.toFixed(0)} tokens/run`}
+        />
+        <MetricCard
+          title="Avg Duration"
+          value={`${stats.avg_duration.toFixed(1)}s`}
+          detail={`${(stats.total_tokens / 1000).toFixed(1)}K tokens total`}
+        />
+      </div>
+      {/* Status Breakdown */}
+      <Card>
+        <CardHeader>
+          <CardTitle>Status Breakdown</CardTitle>
+          <CardDescription>Distribution of run statuses</CardDescription>
+        </CardHeader>
+        <CardContent>
+          <div className="space-y-3">
+            {Object.entries(stats.by_status).map(([status, count]) => (
+              <div key={status} className="flex items-center justify-between">
+                <div className="flex items-center gap-2">
+                  <Badge
+                    variant={
+                      status === 'success'
+                        ? 'default'
+                        : status === 'failed'
+                          ? 'destructive'
+                          : 'secondary'
+                    }
+                  >
+                    {status}
+                  </Badge>
+                  <span className="text-sm text-muted-foreground">{count} runs</span>
+                </div>
+                <span className="text-sm font-medium">
+                  {((count / stats.total_runs) * 100).toFixed(1)}%
+                </span>
+              </div>
+            ))}
+          </div>
+        </CardContent>
+      </Card>
+      {/* By Repository */}
+      <GroupBreakdownCard
+        title="Performance by Repository"
+        description="Success rate and run count per repository"
+        groups={stats.by_repo}
+      />
+      {/* By Model */}
+      <GroupBreakdownCard
+        title="Performance by Model"
+        description="Success rate and run count per model"
+        groups={stats.by_model}
+      />
+    </div>
+  )
+}

package/pulse/components/layout/bottom-nav.tsx CHANGED Viewed

@@ -4,7 +4,7 @@ import Link from 'next/link'
 import { usePathname } from 'next/navigation'
 import {
   LayoutDashboard, MessageSquare, DollarSign,
-  FolderOpen, Activity, Moon, Sun,
+  FolderOpen, Activity, Moon, Sun, Zap,
 } from 'lucide-react'
 import { useTheme } from '@/components/theme-provider'
 import { cn } from '@/lib/utils'
@@ -15,6 +15,7 @@ const NAV = [
   { href: '/costs',    label: 'Costs',     icon: DollarSign      },
   { href: '/projects', label: 'Projects',  icon: FolderOpen      },
   { href: '/activity', label: 'Activity',  icon: Activity        },
+  { href: '/benchmarks', label: 'Benchmarks', icon: Zap          },
 ]
 export function BottomNav() {

package/pulse/components/layout/sidebar.tsx CHANGED Viewed

@@ -5,7 +5,7 @@ import { usePathname } from 'next/navigation'
 import {
   LayoutDashboard, FolderOpen, MessageSquare, DollarSign,
   Wrench, Activity, History, CheckSquare, FileText,
-  Brain, Settings, Download, HelpCircle, Moon, Sun, PanelLeftClose, PanelLeft,
+  Brain, Settings, Download, HelpCircle, Moon, Sun, PanelLeftClose, PanelLeft, Zap,
 } from 'lucide-react'
 import { useTheme } from '@/components/theme-provider'
 import { useSidebar } from '@/components/layout/sidebar-context'
@@ -24,6 +24,7 @@ const NAV = [
   { href: '/todos',    label: 'Todos',     icon: CheckSquare     },
   { href: '/plans',    label: 'Plans',     icon: FileText        },
   { href: '/memory',   label: 'Memory',    icon: Brain           },
+  { href: '/benchmarks', label: 'Benchmarks', icon: Zap          },
   { href: '/settings', label: 'Settings',  icon: Settings        },
   { href: '/help',     label: 'Help',      icon: HelpCircle      },
   { href: '/export',   label: 'Export',    icon: Download        },

package/src/core/risk-tier.mjs CHANGED Viewed

@@ -73,8 +73,14 @@ const NETWORK_TOOLS = new Set([
 // ── Shell sub-classifier ────────────────────────────────────────────────
 const SHELL_SAFE_RE = [
-  // Inspection / read-only
-  /^\s*(ls|cat|head|tail|less|more|wc|file|stat|tree|find|grep|rg|ag|fd|echo|printf|pwd|whoami|date|which|type|env|printenv|uname|hostname|id|df|du|uptime|free|top|ps|lsof)\b/i,
+  // Inspection / read-only + harmless shell navigation built-ins.
+  // `cd` / `pushd` / `popd` only change the process working directory; if
+  // chained with something dangerous, the multi-segment classifier still
+  // catches the danger (`cd /x && rm -rf .` → SHELL_DANGEROUS).
+  /^\s*(cd|pushd|popd|ls|cat|head|tail|less|more|wc|file|stat|tree|find|grep|rg|ag|fd|echo|printf|pwd|whoami|date|which|type|env|printenv|uname|hostname|id|df|du|uptime|free|top|ps|lsof)\b/i,
+  // mkdir -p / touch are creation primitives but harmless in scope.
+  /^\s*mkdir\s+-p\b/i,
+  /^\s*touch\s/i,
   /^\s*git\s+(status|log|diff|show|branch|tag|remote|stash\s+list|blame|shortlog|describe|rev-parse|ls-files|ls-tree|config\s+--get)\b/i,
   // Test-only invocations
   /^\s*(npm|pnpm|yarn)\s+(test|run\s+test|run\s+lint|list|ls|view|info|outdated)\b/i,

package/src/core/stream-client.mjs CHANGED Viewed

@@ -93,14 +93,23 @@ export class TarangStreamClient {
         };
         if (this.token) headers['Authorization'] = `Bearer ${this.token}`;
+        // Abort controller so cancel() can break out of a stalled reader
+        // instead of waiting for the next SSE event to notice _cancelled.
+        this._abort = new AbortController();
         let response;
         try {
             response = await fetch(url, {
                 method: 'POST',
                 headers,
                 body: JSON.stringify(body),
+                signal: this._abort.signal,
             });
         } catch (err) {
+            if (err.name === 'AbortError') {
+                yield { type: EVENT_TYPES.STATUS, data: { message: 'Cancelled by user.' } };
+                return;
+            }
             yield { type: EVENT_TYPES.ERROR, data: { message: `Network error: ${err.message}. Check your connection or use --local mode.`, fatal: true } };
             return;
         }
@@ -175,7 +184,15 @@ export class TarangStreamClient {
         try {
             while (true) {
-                const { done, value } = await reader.read();
+                let read;
+                try {
+                    read = await reader.read();
+                } catch (err) {
+                    // Aborted via cancel() — treat as a clean end-of-stream.
+                    if (err && (err.name === 'AbortError' || this._cancelled)) break;
+                    throw err;
+                }
+                const { done, value } = read;
                 if (done) break;
                 buffer += decoder.decode(value, { stream: true });
@@ -335,6 +352,7 @@ export class TarangStreamClient {
     /** Cancel the current stream. */
     async cancel() {
         this._cancelled = true;
+        // Best-effort backend POST — the stream may already be torn down.
         if (this.currentTaskId) {
             try {
                 await fetch(`${this.baseUrl}/api/cancel/${this.currentTaskId}`, {
@@ -343,6 +361,11 @@ export class TarangStreamClient {
                 });
             } catch { /* best effort */ }
         }
+        // Force the in-flight SSE reader to abort so the REPL returns to the
+        // prompt immediately instead of waiting on a parked reader.read().
+        if (this._abort) {
+            try { this._abort.abort(); } catch {}
+        }
     }
     /** Pause the current stream. */

package/src/core/tool-executor.mjs CHANGED Viewed

@@ -91,6 +91,12 @@ export function createToolExecutor({
         '.rs':  (file) => `rustfmt --check "${file}" 2>&1`,
     };
+    // tsc --pretty and eslint emit ANSI codes (including background-red
+    // highlights) which bleed when our renderer slices the first 80 chars.
+    // Strip color codes so the stored lint string is always plain text.
+    const ANSI_RE = /\x1b\[[0-9;]*[a-zA-Z]/g;
+    function stripAnsi(s) { return String(s || '').replace(ANSI_RE, ''); }
     function autoLint(filePath) {
         const ext = path.extname(filePath);
         const cmdFn = LINT_COMMANDS[ext];
@@ -102,13 +108,14 @@ export function createToolExecutor({
                 timeout: 15_000,
                 cwd: process.cwd(),
                 stdio: ['pipe', 'pipe', 'pipe'],
+                env: { ...process.env, FORCE_COLOR: '0', NO_COLOR: '1', TERM: 'dumb' },
             });
-            const trimmed = output.trim();
+            const trimmed = stripAnsi(output).trim();
             if (!trimmed) return null;
             return trimmed;
         } catch (err) {
             // Non-zero exit means lint errors found
-            const output = (err.stderr || err.stdout || '').trim();
+            const output = stripAnsi(err.stderr || err.stdout || '').trim();
             if (!output) return null;
             return output;
         }