npm - gsd-trae - Versions diffs - 1.0.0 → 1.0.2 - Mend

gsd-trae 1.0.0 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (763) hide show

package/refs/vbenchmark/packages/cli/src/commands/eval.ts DELETED Viewed

@@ -1,197 +0,0 @@
-import { Command } from 'commander';
-import { loadAllTasks } from '../loader.js';
-import { createAgent } from '../agents/index.js';
-import { DockerRuntime } from '../runtime/docker.js';
-import { Evaluator } from '../evaluator.js';
-import _ora from 'ora';
-import chalk from 'chalk';
-import { table } from 'table';
-import * as fs from 'fs/promises';
-import * as path from 'path';
-interface AgentResult { // One agent's outcome on one task; collected into `results` and written to the output JSON.
-  agent: string; // Agent name taken from the --agents list (trimmed).
-  taskId: string; // `id` of the task that was run.
-  scores: { // Copied field by field from the evaluator's result; zeroed (security not passed) when the run threw.
-    functional: number;
-    quality: number;
-    security: { passed: boolean; issues: string[] };
-    cost: number;
-    speed: number;
-    final: number; // Value used for the per-agent average and for ranking the leaderboard.
-  };
-  metrics: {
-    duration: number; // Wall-clock seconds measured around the runtime's execute() call.
-    tokens: number; // Taken from result.metrics.totalTokens.
-    inputTokens: number;
-    outputTokens: number;
-    cost: number; // Taken from result.metrics.cost; printed with a `$` prefix in the summaries.
-  };
-}
-/** Limits handed to every per-task DockerRuntime (the same values this command has always used). */
-const EVAL_RUNTIME_LIMITS = { timeout: 300000, tokenLimit: 100000 };
-/** Element type of the task list returned by the loader. */
-type LoadedTask = Awaited<ReturnType<typeof loadAllTasks>>[number];
-/**
- * `eval` sub-command: runs every selected task against every selected agent,
- * prints a per-agent average and a leaderboard, and writes all results to a JSON file.
- *
- * Options:
- *  - `--agents`   comma-separated agent names; blanks and duplicates are ignored.
- *  - `--category` keep only tasks of this category.
- *  - `--limit`    keep at most n tasks per category (tasks end up grouped by category).
- *  - `--skip`     drop the first n selected tasks (for resuming a run).
- *  - `--parallel` number of tasks run concurrently per batch; anything below 1 falls back to 1.
- *  - `--output`   path of the JSON report.
- *
- * A task that throws is recorded with all-zero scores instead of aborting the run.
- */
-export const evalCommand = new Command('eval')
-  .description('Run full benchmark evaluation across multiple agents')
-  .option('-a, --agents <agents>', 'Comma-separated agents (default: all)', 'claude,glm,minimax')
-  .option('-c, --category <category>', 'Only run tasks in category')
-  .option('-n, --limit <n>', 'Limit number of tasks per category')
-  .option('-o, --output <file>', 'Output results to JSON file', 'results.json')
-  .option('--parallel <n>', 'Run n tasks in parallel', '1')
-  .option('--skip <n>', 'Skip first n tasks (for resuming)')
-  .action(async (options) => {
-    // Dropping blanks and duplicates keeps the leaderboard from counting an agent twice.
-    const agents: string[] = [
-      ...new Set<string>(
-        String(options.agents)
-          .split(',')
-          .map((a: string) => a.trim())
-          .filter((a: string) => a.length > 0)
-      )
-    ];
-    const tasks = await loadAllTasks();
-    let filtered = tasks;
-    if (options.category) {
-      filtered = filtered.filter(t => t.category === options.category);
-    }
-    if (options.limit) {
-      const limit = parseCountOption(options.limit, '--limit');
-      const byCategory = new Map<string, typeof tasks>();
-      for (const t of filtered) {
-        const bucket = byCategory.get(t.category) ?? [];
-        if (bucket.length < limit) {
-          bucket.push(t);
-          byCategory.set(t.category, bucket);
-        }
-      }
-      filtered = [...byCategory.values()].flat();
-    }
-    // Skip first N tasks if specified (for resuming runs)
-    if (options.skip) {
-      const skipCount = parseCountOption(options.skip, '--skip');
-      filtered = filtered.slice(skipCount);
-      console.log(chalk.yellow(`Skipping first ${skipCount} tasks\n`));
-    }
-    console.log(chalk.bold.cyan('\n🚀 VibeCodingBench Evaluation\n'));
-    console.log(`  Tasks: ${filtered.length}`);
-    console.log(`  Agents: ${agents.join(', ')}`);
-    console.log('');
-    const results: AgentResult[] = [];
-    const evaluator = new Evaluator();
-    // A zero, negative or non-numeric value would stall or never advance the batch loop below.
-    const parallelism = Math.max(1, Number.parseInt(options.parallel, 10) || 1);
-    for (const agentName of agents) {
-      console.log(chalk.bold(`\n📊 Evaluating ${agentName.toUpperCase()} (parallel: ${parallelism})\n`));
-      const agent = createAgent(agentName);
-      const agentResults: AgentResult[] = [];
-      // Process tasks in batches for parallel execution
-      for (let i = 0; i < filtered.length; i += parallelism) {
-        const batch = filtered.slice(i, i + parallelism);
-        const batchResults = await Promise.all(
-          batch.map(task => runTask(task, agentName, agent, evaluator))
-        );
-        agentResults.push(...batchResults);
-        results.push(...batchResults);
-      }
-      // avg() yields 0 for an empty list, so an empty selection prints 0.0% rather than NaN%.
-      const avgScore = avg(agentResults.map(r => r.scores.final));
-      const avgCost = avg(agentResults.map(r => r.metrics.cost));
-      console.log(chalk.gray(`\n  ${agentName} Average: ${avgScore.toFixed(1)}% | Avg Cost: $${avgCost.toFixed(4)}`));
-    }
-    // Summary table
-    console.log(chalk.bold.cyan('\n\n📈 LEADERBOARD\n'));
-    const leaderboard = agents
-      .map(agentName => {
-        const agentResults = results.filter(r => r.agent === agentName);
-        const passed = agentResults.filter(r => r.scores.functional >= 80).length;
-        return {
-          agent: agentName,
-          functional: avg(agentResults.map(r => r.scores.functional)),
-          quality: avg(agentResults.map(r => r.scores.quality)),
-          final: avg(agentResults.map(r => r.scores.final)),
-          cost: agentResults.reduce((acc, r) => acc + r.metrics.cost, 0),
-          passRate: agentResults.length > 0 ? (passed / agentResults.length) * 100 : 0
-        };
-      })
-      .sort((a, b) => b.final - a.final);
-    const tableData = [
-      ['Rank', 'Agent', 'Pass Rate', 'Functional', 'Quality', 'Final', 'Total Cost'].map(h => chalk.bold(h)),
-      ...leaderboard.map((r, i) => [
-        `#${i + 1}`,
-        r.agent.toUpperCase(),
-        `${r.passRate.toFixed(0)}%`,
-        `${r.functional.toFixed(1)}%`,
-        `${r.quality.toFixed(1)}%`,
-        chalk.bold(`${r.final.toFixed(1)}%`),
-        `$${r.cost.toFixed(2)}`
-      ])
-    ];
-    console.log(table(tableData));
-    // Save results
-    const outputPath = path.resolve(options.output);
-    await fs.writeFile(outputPath, JSON.stringify({
-      timestamp: new Date().toISOString(),
-      tasks: filtered.length,
-      results,
-      leaderboard
-    }, null, 2));
-    console.log(chalk.gray(`\nResults saved to: ${outputPath}`));
-  });
-/**
- * Runs one task with one agent in its own runtime and scores the outcome.
- * Never rejects: a failure is logged and reported as an all-zero result.
- * The workspace is released on success and on failure alike.
- */
-async function runTask(
-  task: LoadedTask,
-  agentName: string,
-  agent: ReturnType<typeof createAgent>,
-  evaluator: Evaluator
-): Promise<AgentResult> {
-  const taskRuntime = new DockerRuntime({ ...EVAL_RUNTIME_LIMITS });
-  console.log(`- [${agentName}] ${task.id}`);
-  let workspaceId: Awaited<ReturnType<DockerRuntime['createWorkspace']>> | undefined;
-  try {
-    workspaceId = await taskRuntime.createWorkspace(task);
-    const startTime = Date.now();
-    const result = await taskRuntime.execute({ task, agent, workspaceId });
-    const duration = (Date.now() - startTime) / 1000;
-    const scores = await evaluator.evaluate(task, result);
-    const agentResult: AgentResult = {
-      agent: agentName,
-      taskId: task.id,
-      scores: {
-        functional: scores.functional,
-        quality: scores.quality,
-        security: scores.security,
-        cost: scores.cost,
-        speed: scores.speed,
-        final: scores.final
-      },
-      metrics: {
-        duration,
-        tokens: result.metrics.totalTokens,
-        inputTokens: result.metrics.inputTokens,
-        outputTokens: result.metrics.outputTokens,
-        cost: result.metrics.cost
-      }
-    };
-    console.log(`✔ [${agentName}] ${task.id}: ${scores.final.toFixed(1)}%`);
-    return agentResult;
-  } catch (error) {
-    console.log(`✖ [${agentName}] ${task.id}: ERROR - ${error}`);
-    const failed: AgentResult = {
-      agent: agentName,
-      taskId: task.id,
-      scores: { functional: 0, quality: 0, security: { passed: false, issues: [] }, cost: 0, speed: 0, final: 0 },
-      metrics: { duration: 0, tokens: 0, inputTokens: 0, outputTokens: 0, cost: 0 }
-    };
-    return failed;
-  } finally {
-    // Only release what was actually created; a cleanup failure must not discard the task's result.
-    if (workspaceId !== undefined) {
-      try {
-        await taskRuntime.cleanup(workspaceId);
-      } catch (cleanupError) {
-        console.log(chalk.yellow(`⚠ [${agentName}] ${task.id}: cleanup failed - ${cleanupError}`));
-      }
-    }
-  }
-}
-/**
- * Parses a non-negative integer CLI option.
- * Prints an error and exits with code 1 when the value is not a usable count,
- * instead of letting NaN or a negative number silently change the task selection.
- */
-function parseCountOption(raw: string, flag: string): number {
-  const value = Number.parseInt(raw, 10);
-  if (Number.isNaN(value) || value < 0) {
-    console.error(chalk.red(`Invalid value for ${flag}: ${raw} (expected a non-negative integer)`));
-    process.exit(1);
-  }
-  return value;
-}
-/** Arithmetic mean of `nums`; an empty list yields 0 rather than NaN. */
-function avg(nums: number[]): number {
-  return nums.length ? nums.reduce((a, b) => a + b, 0) / nums.length : 0;
-}

package/refs/vbenchmark/packages/cli/src/commands/list.ts DELETED Viewed

@@ -1,63 +0,0 @@
-import { Command } from 'commander';
-import { loadAllTasks } from '../loader.js';
-import { table } from 'table';
-import chalk from 'chalk';
-/**
- * `list` sub-command: prints the available benchmark tasks, optionally narrowed
- * by category and/or difficulty, either as raw JSON or as one table per category.
- */
-export const listCommand = new Command('list')
-  .description('List all available benchmark tasks')
-  .option('-c, --category <category>', 'Filter by category')
-  .option('-d, --difficulty <level>', 'Filter by difficulty (easy, medium, hard)')
-  .option('--json', 'Output as JSON')
-  .action(async (options) => {
-    // An unset filter matches every task.
-    const matches = (await loadAllTasks()).filter(task =>
-      (!options.category || task.category === options.category) &&
-      (!options.difficulty || task.difficulty === options.difficulty)
-    );
-    if (options.json) {
-      console.log(JSON.stringify(matches, null, 2));
-      return;
-    }
-    if (matches.length === 0) {
-      console.log(chalk.yellow('No tasks found matching criteria.'));
-      return;
-    }
-    // Group in first-seen category order; tasks keep their loader order inside each group.
-    const grouped = new Map<string, typeof matches>();
-    for (const task of matches) {
-      const group = grouped.get(task.category);
-      if (group) {
-        group.push(task);
-      } else {
-        grouped.set(task.category, [task]);
-      }
-    }
-    const header = [chalk.gray('ID'), chalk.gray('Name'), chalk.gray('Difficulty'), chalk.gray('Stack')];
-    for (const [category, group] of grouped) {
-      console.log(chalk.bold.cyan(`\n${category.toUpperCase()}`));
-      const rows = group.map(task => [
-        task.id,
-        task.name,
-        colorDifficulty(task.difficulty),
-        task.stack || '-'
-      ]);
-      console.log(table([header, ...rows], {
-        border: { bodyLeft: '  ' }
-      }));
-    }
-    console.log(chalk.gray(`\nTotal: ${matches.length} tasks`));
-  });
-/** Colors a difficulty label: easy is green, medium is yellow, hard is red; anything else is returned unchanged. */
-function colorDifficulty(d: string): string {
-  if (d === 'easy') return chalk.green(d);
-  if (d === 'medium') return chalk.yellow(d);
-  if (d === 'hard') return chalk.red(d);
-  return d;
-}

package/refs/vbenchmark/packages/cli/src/commands/run.ts DELETED Viewed

@@ -1,147 +0,0 @@
-import { Command } from 'commander';
-import { loadTask } from '../loader.js';
-import { createAgent } from '../agents/index.js';
-import { DockerRuntime } from '../runtime/docker.js';
-import { Evaluator } from '../evaluator.js';
-import { createReporter } from '../reporter.js';
-import ora from 'ora';
-import chalk from 'chalk';
-/**
- * `run` sub-command: executes a single benchmark task with one agent, evaluates the
- * result and prints metrics and scores. With `--live`, progress is also echoed to the
- * console and forwarded to the live reporter.
- *
- * Any failure is reported through the spinner (and the reporter when live), the
- * workspace is released if one was created, and the process exits with code 1.
- */
-export const runCommand = new Command('run')
-  .description('Run a benchmark task with an AI agent')
-  .argument('<task-id>', 'Task ID to run (e.g., saas-core/auth/supabase-oauth)')
-  .requiredOption('-a, --agent <agent>', 'Agent to use (claude, glm, minimax, openai)')
-  .option('-t, --timeout <seconds>', 'Timeout in seconds', '300')
-  .option('--token-limit <tokens>', 'Max tokens', '100000')
-  .option('--live', 'Stream output in real-time and report to leaderboard service')
-  .option('--live-url <url>', 'Leaderboard API URL for live reporting', 'http://localhost:3001/api/live')
-  .option('--record', 'Record session for replay')
-  .option('--no-docker', 'Run without Docker isolation (for debugging)')
-  .action(async (taskId, options) => {
-    const spinner = ora('Loading task...').start();
-    // Initialize live reporter if enabled
-    const reporter = createReporter({
-      apiUrl: options.liveUrl,
-      enabled: options.live,
-    });
-    // Tracked outside the try block so the failure path can still release the workspace.
-    let runtime: DockerRuntime | undefined;
-    let workspaceId: Awaited<ReturnType<DockerRuntime['createWorkspace']>> | undefined;
-    try {
-      // Reject unusable limits up front instead of handing NaN to the runtime.
-      const timeoutSeconds = Number.parseInt(options.timeout, 10);
-      const tokenLimit = Number.parseInt(options.tokenLimit, 10);
-      if (Number.isNaN(timeoutSeconds) || timeoutSeconds < 0) {
-        throw new Error(`Invalid --timeout value: ${options.timeout}`);
-      }
-      if (Number.isNaN(tokenLimit) || tokenLimit < 0) {
-        throw new Error(`Invalid --token-limit value: ${options.tokenLimit}`);
-      }
-      const task = await loadTask(taskId);
-      if (!task) {
-        spinner.fail(`Task not found: ${taskId}`);
-        process.exit(1);
-      }
-      spinner.succeed(`Loaded task: ${task.name}`);
-      spinner.start(`Initializing ${options.agent} agent...`);
-      const agent = createAgent(options.agent);
-      spinner.succeed(`Agent ready: ${options.agent}`);
-      // Start live reporting
-      if (options.live) {
-        const runId = await reporter.start(options.agent, taskId);
-        if (runId) {
-          console.log(chalk.blue(`Live reporting: ${options.liveUrl.replace('/api/live', '')}/live/${runId}`));
-        }
-        await reporter.setStatus('initializing', 0, 'Setting up environment');
-      }
-      const activeRuntime = new DockerRuntime({
-        timeout: timeoutSeconds * 1000,
-        tokenLimit,
-        useDocker: options.docker !== false
-      });
-      runtime = activeRuntime;
-      spinner.start('Setting up environment...');
-      const createdWorkspace = await activeRuntime.createWorkspace(task);
-      workspaceId = createdWorkspace;
-      spinner.succeed('Environment ready');
-      if (options.live) {
-        await reporter.setStatus('running', 10, 'Agent executing task');
-      }
-      console.log(chalk.cyan('\n--- Agent Execution ---\n'));
-      const startTime = Date.now();
-      const result = await activeRuntime.execute({
-        task,
-        agent,
-        workspaceId: createdWorkspace,
-        live: options.live,
-        record: options.record,
-        onProgress: (event) => {
-          if (options.live) {
-            console.log(chalk.gray(`[${event.type}] ${event.message}`));
-            reporter.log(`[${event.type}] ${event.message}`);
-            // Update metrics based on event type
-            if (event.type === 'tool_use') {
-              // Track file operations from tool use events
-              if (event.message?.includes('read')) {
-                reporter.incrementFilesRead();
-              } else if (event.message?.includes('write') || event.message?.includes('edit')) {
-                reporter.incrementFilesWritten();
-              }
-            }
-          }
-        }
-      });
-      const duration = (Date.now() - startTime) / 1000;
-      if (options.live) {
-        await reporter.setStatus('evaluating', 80, 'Running evaluation');
-      }
-      console.log(chalk.cyan('\n--- Evaluation ---\n'));
-      const evaluator = new Evaluator();
-      const scores = await evaluator.evaluate(task, result);
-      // Report test results
-      if (options.live) {
-        reporter.setTestResults(
-          scores.functional >= 70 ? 1 : 0,
-          scores.functional < 70 ? 1 : 0
-        );
-      }
-      console.log(chalk.bold('\nResults:'));
-      console.log(`  ${chalk.gray('Duration:')} ${duration.toFixed(1)}s`);
-      console.log(`  ${chalk.gray('Tokens:')} ${result.metrics.totalTokens.toLocaleString()}`);
-      console.log(`  ${chalk.gray('Cost:')} $${result.metrics.cost.toFixed(4)}`);
-      console.log(`  ${chalk.gray('Files Changed:')} ${result.metrics.filesChanged}`);
-      console.log(chalk.bold('\nScores:'));
-      console.log(`  ${chalk.gray('Functional:')} ${colorScore(scores.functional)}%`);
-      console.log(`  ${chalk.gray('Quality:')} ${colorScore(scores.quality)}%`);
-      console.log(`  ${chalk.gray('Security:')} ${scores.security.passed ? chalk.green('PASS') : chalk.red('FAIL')}`);
-      console.log(`  ${chalk.bold('Final:')} ${colorScore(scores.final)}%`);
-      // Complete live reporting
-      if (options.live) {
-        await reporter.complete({
-          tokensUsed: result.metrics.totalTokens,
-          filesRead: result.metrics.filesRead || 0,
-          filesWritten: result.metrics.filesChanged,
-          elapsedMs: duration * 1000,
-        });
-      }
-      // Clear the marker first so a failing cleanup is not retried by the catch block.
-      workspaceId = undefined;
-      await activeRuntime.cleanup(createdWorkspace);
-    } catch (error) {
-      if (options.live) {
-        await reporter.fail(String(error));
-      }
-      spinner.fail(`Error: ${error}`);
-      // process.exit() below skips any pending work, so release the workspace here (best effort).
-      if (runtime && workspaceId !== undefined) {
-        try {
-          await runtime.cleanup(workspaceId);
-        } catch (cleanupError) {
-          console.error(chalk.yellow(`Workspace cleanup failed: ${cleanupError}`));
-        }
-      }
-      process.exit(1);
-    }
-  });
-/** Formats a score with one decimal and colors it: green from 90, yellow from 70, red otherwise. */
-function colorScore(score: number): string {
-  const text = score.toFixed(1);
-  // Written as a negated >= so a NaN score still lands in the red branch.
-  if (!(score >= 70)) return chalk.red(text);
-  return score >= 90 ? chalk.green(text) : chalk.yellow(text);
-}

package/refs/vbenchmark/packages/cli/src/evaluator.ts DELETED Viewed

@@ -1,125 +0,0 @@
-import type { Task } from './loader.js';
-export interface ExecutionResult { // Outcome of one agent run, as consumed by Evaluator.evaluate().
-  success: boolean; // When false, the functional score is 0.
-  output: string;
-  files: {
-    created: string[];
-    modified: string[];
-    deleted: string[];
-  };
-  metrics: {
-    totalTokens: number;
-    inputTokens: number;
-    outputTokens: number;
-    cost: number; // Scored against a ceiling of 1.0 by evaluateCost, whose comments quote it in dollars.
-    filesRead: number;
-    filesChanged: number;
-    duration: number; // NOTE(review): compared against task.timeout * 1000, so presumably milliseconds — confirm with the runtime.
-    steps: number;
-  };
-}
-export interface EvaluationScores { // Per-dimension scores returned by Evaluator.evaluate().
-  functional: number;      // 0-100
-  visual: number;          // 0-100 (100 when the task defines no visual tests)
-  quality: number;         // 0-100
-  security: { // A gate rather than a weighted score: a failure forces `final` to 0.
-    passed: boolean;
-    issues: string[];
-  };
-  cost: number;            // 0-100 (higher = more efficient)
-  speed: number;           // 0-100 (higher = faster)
-  final: number;           // Weighted average, rounded to one decimal; 0 when security fails
-}
-/**
- * Turns the result of an agent run into per-dimension scores and one final score.
- *
- * The functional, visual, quality and security checks are placeholders that return
- * fixed values; only cost and speed are derived from the run's metrics.
- */
-export class Evaluator {
-  /** Share of each dimension in the final score; the values sum to 1. */
-  private readonly weights = {
-    functional: 0.40,
-    visual: 0.20,
-    quality: 0.20,
-    cost: 0.10,
-    speed: 0.10
-  };
-  /**
-   * Scores every dimension and combines them.
-   *
-   * @param task   Task definition; its `tests` and `timeout` are read.
-   * @param result Output and metrics of the agent run.
-   * @returns All dimension scores plus `final`: the weighted average rounded to
-   *          one decimal, or 0 when the security check did not pass.
-   */
-  async evaluate(task: Task, result: ExecutionResult): Promise<EvaluationScores> {
-    const functional = await this.evaluateFunctional(task, result);
-    const visual = await this.evaluateVisual(task, result);
-    const quality = await this.evaluateQuality(task, result);
-    const security = await this.evaluateSecurity(task, result);
-    const cost = this.evaluateCost(result);
-    const speed = this.evaluateSpeed(task, result);
-    const weighted =
-      functional * this.weights.functional +
-      visual * this.weights.visual +
-      quality * this.weights.quality +
-      cost * this.weights.cost +
-      speed * this.weights.speed;
-    // Security failure = automatic fail
-    const final = security.passed ? Math.round(weighted * 10) / 10 : 0;
-    return { functional, visual, quality, security, cost, speed, final };
-  }
-  /** Functional score: 0 when the task has no functional tests or the run failed. */
-  private async evaluateFunctional(task: Task, result: ExecutionResult): Promise<number> {
-    if (!task.tests.functional || !result.success) return 0;
-    // Run functional tests and compute pass rate
-    // For now, return mock score
-    return 85;
-  }
-  /** Visual score: 100 when the task has no visual tests, otherwise a fixed placeholder. */
-  private async evaluateVisual(task: Task, _result: ExecutionResult): Promise<number> {
-    if (!task.tests.visual) return 100; // No visual tests = skip
-    // Run visual diff and compute match percentage
-    return 90;
-  }
-  /** Quality score: fixed placeholder until linting/complexity checks are wired in. */
-  private async evaluateQuality(_task: Task, _result: ExecutionResult): Promise<number> {
-    // Run linters, compute complexity
-    // Deduct points for lint errors, high complexity
-    return 80;
-  }
-  /** Security gate: placeholder that always passes with no issues. */
-  private async evaluateSecurity(_task: Task, _result: ExecutionResult): Promise<{ passed: boolean; issues: string[] }> {
-    // Run Semgrep/security scanner
-    // Any critical/high = fail
-    return { passed: true, issues: [] };
-  }
-  /**
-   * Cost score, linear in the run's cost: lower cost = higher score.
-   * $0 = 100, $0.50 = 50, $1+ = 0.
-   */
-  private evaluateCost(result: ExecutionResult): number {
-    const maxCost = 1.0;
-    return Evaluator.clampScore(100 - (result.metrics.cost / maxCost) * 100);
-  }
-  /**
-   * Speed score, linear in the fraction of the timeout that was used:
-   * no time used = 100, at or beyond the timeout = 0.
-   */
-  private evaluateSpeed(task: Task, result: ExecutionResult): number {
-    // task.timeout is scaled by 1000 before comparing, i.e. duration is treated as milliseconds.
-    const ratio = result.metrics.duration / (task.timeout * 1000);
-    return Evaluator.clampScore(100 - ratio * 100);
-  }
-  /**
-   * Rounds a raw score and confines it to 0-100.
-   * NaN (e.g. 0 / 0 from a zero timeout, or a missing metric) becomes 0 so it cannot
-   * turn the final score into NaN; negative inputs can no longer score above 100.
-   */
-  private static clampScore(raw: number): number {
-    if (Number.isNaN(raw)) return 0;
-    return Math.round(Math.min(100, Math.max(0, raw)));
-  }
-}

package/refs/vbenchmark/packages/cli/src/index.ts DELETED Viewed

@@ -1,21 +0,0 @@
-#!/usr/bin/env node
-import { Command } from 'commander';
-import { config } from 'dotenv';
-import { listCommand } from './commands/list.js';
-import { runCommand } from './commands/run.js';
-import { evalCommand } from './commands/eval.js';
-// Load variables from a local .env file into process.env (dotenv).
-config();
-const program = new Command();
-program
-  .name('vibecodingbench')
-  .description('Benchmark for evaluating AI coding agents on real-world developer tasks')
-  .version('0.1.0');
-program.addCommand(listCommand);
-program.addCommand(runCommand);
-program.addCommand(evalCommand);
-// Every command action is async: parseAsync() waits for it, so a rejection is
-// reported here with exit code 1 instead of surfacing as an unhandled rejection.
-program.parseAsync().catch((error: unknown) => {
-  console.error(error instanceof Error ? error.message : String(error));
-  process.exit(1);
-});