npm - gsd-trae - Versions diffs - 1.0.1 → 1.0.2 - Mend

gsd-trae 1.0.1 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (761) hide show

package/refs/vbenchmark/packages/leaderboard/tests/api.test.ts DELETED Viewed

@@ -1,228 +0,0 @@
-import { describe, it, expect, beforeEach } from 'vitest';
-import { app } from '../src/app.js';
-describe('Leaderboard API', () => {
-  describe('Health Check', () => {
-    it('should return healthy status', async () => {
-      const response = await app.request('/health');
-      expect(response.status).toBe(200);
-      const body = await response.json();
-      expect(body).toEqual({ status: 'healthy' });
-    });
-  });
-  describe('Submissions API', () => {
-    let submissionId: string;
-    it('should create a submission', async () => {
-      const response = await app.request('/api/submissions', {
-        method: 'POST',
-        headers: { 'Content-Type': 'application/json' },
-        body: JSON.stringify({
-          agentName: 'TestAgent',
-          agentVersion: '1.0.0',
-          taskId: 'saas-core/auth/supabase-oauth',
-        }),
-      });
-      expect(response.status).toBe(201);
-      const body = await response.json();
-      expect(body).toHaveProperty('id');
-      expect(body.agentName).toBe('TestAgent');
-      expect(body.status).toBe('pending');
-      submissionId = body.id;
-    });
-    it('should list submissions', async () => {
-      const response = await app.request('/api/submissions');
-      expect(response.status).toBe(200);
-      const body = await response.json();
-      expect(body).toHaveProperty('submissions');
-      expect(Array.isArray(body.submissions)).toBe(true);
-    });
-    it('should get submission by id', async () => {
-      // First create a submission
-      const createResponse = await app.request('/api/submissions', {
-        method: 'POST',
-        headers: { 'Content-Type': 'application/json' },
-        body: JSON.stringify({
-          agentName: 'GetTest',
-          agentVersion: '1.0.0',
-          taskId: 'test-task',
-        }),
-      });
-      const created = await createResponse.json();
-      const response = await app.request(`/api/submissions/${created.id}`);
-      expect(response.status).toBe(200);
-      const body = await response.json();
-      expect(body.id).toBe(created.id);
-    });
-    it('should return 404 for non-existent submission', async () => {
-      const response = await app.request('/api/submissions/non-existent-id');
-      expect(response.status).toBe(404);
-    });
-    it('should update submission scores', async () => {
-      // Create a submission first
-      const createResponse = await app.request('/api/submissions', {
-        method: 'POST',
-        headers: { 'Content-Type': 'application/json' },
-        body: JSON.stringify({
-          agentName: 'UpdateTest',
-          agentVersion: '1.0.0',
-          taskId: 'test-task',
-        }),
-      });
-      const created = await createResponse.json();
-      const response = await app.request(`/api/submissions/${created.id}`, {
-        method: 'PATCH',
-        headers: { 'Content-Type': 'application/json' },
-        body: JSON.stringify({
-          status: 'completed',
-          scores: {
-            functional: 90,
-            visual: 85,
-            quality: 88,
-            security: 92,
-            cost: 75,
-            speed: 80,
-            overall: 85,
-          },
-        }),
-      });
-      expect(response.status).toBe(200);
-      const body = await response.json();
-      expect(body.status).toBe('completed');
-      expect(body.scores?.functional).toBe(90);
-    });
-  });
-  describe('Leaderboard API', () => {
-    it('should get overall leaderboard', async () => {
-      const response = await app.request('/api/leaderboard');
-      expect(response.status).toBe(200);
-      const body = await response.json();
-      expect(body).toHaveProperty('leaderboard');
-      expect(Array.isArray(body.leaderboard)).toBe(true);
-      expect(body.leaderboard[0]).toHaveProperty('rank');
-      expect(body.leaderboard[0]).toHaveProperty('avgScore');
-    });
-    it('should sort leaderboard by different metrics', async () => {
-      const response = await app.request('/api/leaderboard?sort=avgCost&order=asc');
-      expect(response.status).toBe(200);
-      const body = await response.json();
-      expect(body.leaderboard.length).toBeGreaterThan(0);
-    });
-    it('should get leaderboard by category', async () => {
-      const response = await app.request('/api/leaderboard/category/saas-core');
-      expect(response.status).toBe(200);
-      const body = await response.json();
-      expect(body.category).toBe('saas-core');
-      expect(body).toHaveProperty('leaderboard');
-    });
-    it('should get task-specific leaderboard', async () => {
-      const response = await app.request(
-        '/api/leaderboard/task/saas-core/auth/supabase-oauth'
-      );
-      expect(response.status).toBe(200);
-      const body = await response.json();
-      expect(body.taskId).toBe('saas-core/auth/supabase-oauth');
-      expect(body).toHaveProperty('leaderboard');
-    });
-    it('should get agent stats', async () => {
-      const response = await app.request('/api/leaderboard/agent/Claude');
-      expect(response.status).toBe(200);
-      const body = await response.json();
-      expect(body.agent).toHaveProperty('agentName');
-      expect(body).toHaveProperty('taskBreakdown');
-    });
-  });
-  describe('Live API', () => {
-    it('should list active runs', async () => {
-      const response = await app.request('/api/live/runs');
-      expect(response.status).toBe(200);
-      const body = await response.json();
-      expect(body).toHaveProperty('runs');
-      expect(body).toHaveProperty('totalActive');
-    });
-    it('should create a live run', async () => {
-      const response = await app.request('/api/live/runs', {
-        method: 'POST',
-        headers: { 'Content-Type': 'application/json' },
-        body: JSON.stringify({
-          agentName: 'TestAgent',
-          taskId: 'test-task',
-        }),
-      });
-      expect(response.status).toBe(201);
-      const body = await response.json();
-      expect(body).toHaveProperty('id');
-      expect(body.status).toBe('initializing');
-    });
-    it('should update a live run', async () => {
-      // Create a run
-      const createResponse = await app.request('/api/live/runs', {
-        method: 'POST',
-        headers: { 'Content-Type': 'application/json' },
-        body: JSON.stringify({
-          agentName: 'UpdateTest',
-          taskId: 'test-task',
-        }),
-      });
-      const created = await createResponse.json();
-      const response = await app.request(`/api/live/runs/${created.id}`, {
-        method: 'PATCH',
-        headers: { 'Content-Type': 'application/json' },
-        body: JSON.stringify({
-          status: 'running',
-          progress: 50,
-          currentStep: 'Implementing feature',
-          metrics: {
-            tokensUsed: 5000,
-            filesRead: 10,
-            filesWritten: 3,
-            testsPass: 5,
-            testsFail: 0,
-            elapsedMs: 30000,
-          },
-        }),
-      });
-      expect(response.status).toBe(200);
-      const body = await response.json();
-      expect(body.status).toBe('running');
-      expect(body.progress).toBe(50);
-    });
-  });
-});

package/refs/vbenchmark/packages/leaderboard/tsconfig.json DELETED Viewed

@@ -1,9 +0,0 @@
-{
-  "extends": "../../tsconfig.base.json",
-  "compilerOptions": {
-    "outDir": "dist",
-    "rootDir": "src"
-  },
-  "include": ["src/**/*"],
-  "exclude": ["node_modules", "dist", "tests", "src/components", "src/app"]
-}

package/refs/vbenchmark/scripts/deploy.sh DELETED Viewed

@@ -1,70 +0,0 @@
-#!/bin/bash
-set -e
-# CodingBench Deployment Script
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-ROOT_DIR="$(dirname "$SCRIPT_DIR")"
-usage() {
-    echo "Usage: $0 <command> [options]"
-    echo ""
-    echo "Commands:"
-    echo "  fly       Deploy to Fly.io"
-    echo "  docker    Build and run with Docker Compose"
-    echo "  build     Build all packages"
-    echo ""
-    echo "Examples:"
-    echo "  $0 fly              # Deploy leaderboard to Fly.io"
-    echo "  $0 docker           # Run production stack locally"
-    echo "  $0 docker --detach  # Run in background"
-}
-build_all() {
-    echo "Building all packages..."
-    cd "$ROOT_DIR"
-    npm run build
-}
-deploy_fly() {
-    echo "Deploying to Fly.io..."
-    cd "$ROOT_DIR/packages/leaderboard"
-    if ! command -v fly &> /dev/null; then
-        echo "Error: flyctl not installed. Install from https://fly.io/docs/hands-on/install-flyctl/"
-        exit 1
-    fi
-    fly deploy
-}
-deploy_docker() {
-    echo "Building and starting Docker containers..."
-    cd "$ROOT_DIR"
-    if [ "$1" == "--detach" ] || [ "$1" == "-d" ]; then
-        docker compose -f docker-compose.prod.yaml up --build -d
-        echo ""
-        echo "Services running:"
-        echo "  - Leaderboard API: http://localhost:3001"
-        echo "  - Dashboard: http://localhost:3000"
-    else
-        docker compose -f docker-compose.prod.yaml up --build
-    fi
-}
-case "$1" in
-    fly)
-        deploy_fly
-        ;;
-    docker)
-        deploy_docker "$2"
-        ;;
-    build)
-        build_all
-        ;;
-    *)
-        usage
-        exit 1
-        ;;
-esac

package/refs/vbenchmark/tasks/ai-integration/advanced/context-management/PROMPT.md DELETED Viewed

@@ -1,15 +0,0 @@
-# Intelligent Context Management
-Build advanced context management for long conversations.
-## Requirements
-1. Dynamic context compression
-2. Relevance-based message selection
-3. Hierarchical summarization
-4. Context window optimization
-5. Entity tracking across turns
-6. Topic segmentation
-7. Important information extraction
-8. Context restoration from summary
-9. Multi-modal context handling
-10. Context versioning and rollback

package/refs/vbenchmark/tasks/ai-integration/advanced/context-management/task.yaml DELETED Viewed

@@ -1,16 +0,0 @@
-name: Intelligent Context Management
-category: ai-integration
-subcategory: advanced
-description: Build context management with dynamic compression, relevance scoring, hierarchical summarization, and context window optimization
-difficulty: hard
-stack: python
-timeout: 1500
-tokenLimit: 300000
-tests:
-  functional: tests/test_context.py
-tags: [python, context, llm, compression, embeddings]
-evaluation:
-  functional: 0.5
-  quality: 0.25
-  cost: 0.1
-  speed: 0.15

package/refs/vbenchmark/tasks/ai-integration/advanced/evaluation-framework/PROMPT.md DELETED Viewed

@@ -1,15 +0,0 @@
-# LLM Evaluation Framework
-Build production evaluation framework for LLM applications.
-## Requirements
-1. Custom metric definition DSL
-2. LLM-as-judge evaluation
-3. Human feedback collection and aggregation
-4. Regression test suite management
-5. Statistical significance testing
-6. Evaluation dataset versioning
-7. Bias and fairness metrics
-8. Latency and cost tracking
-9. Automated eval in CI/CD
-10. Evaluation results dashboard

package/refs/vbenchmark/tasks/ai-integration/advanced/evaluation-framework/task.yaml DELETED Viewed

@@ -1,16 +0,0 @@
-name: LLM Evaluation Framework
-category: ai-integration
-subcategory: advanced
-description: Build comprehensive evaluation framework with custom metrics, human feedback integration, and regression testing
-difficulty: hard
-stack: python
-timeout: 1500
-tokenLimit: 300000
-tests:
-  functional: tests/test_evaluation.py
-tags: [python, evaluation, metrics, llm, testing]
-evaluation:
-  functional: 0.5
-  quality: 0.25
-  cost: 0.1
-  speed: 0.15

package/refs/vbenchmark/tasks/ai-integration/advanced/guardrails-safety/PROMPT.md DELETED Viewed

@@ -1,15 +0,0 @@
-# AI Guardrails and Safety System
-Build production safety system for AI applications.
-## Requirements
-1. Input content filtering
-2. Jailbreak/injection detection
-3. Output validation against policies
-4. PII detection and redaction
-5. Topic restriction enforcement
-6. Response factuality checking
-7. Bias detection and mitigation
-8. Rate limiting abuse prevention
-9. Audit logging for compliance
-10. Real-time safety dashboard

package/refs/vbenchmark/tasks/ai-integration/advanced/guardrails-safety/task.yaml DELETED Viewed

@@ -1,16 +0,0 @@
-name: AI Guardrails and Safety System
-category: ai-integration
-subcategory: advanced
-description: Build comprehensive safety system with content filtering, jailbreak detection, output validation, and compliance enforcement
-difficulty: hard
-stack: python
-timeout: 1500
-tokenLimit: 300000
-tests:
-  functional: tests/test_guardrails.py
-tags: [python, safety, guardrails, moderation, compliance]
-evaluation:
-  functional: 0.5
-  quality: 0.25
-  cost: 0.1
-  speed: 0.15

package/refs/vbenchmark/tasks/ai-integration/advanced/memory-system/PROMPT.md DELETED Viewed

@@ -1,15 +0,0 @@
-# Long-term Memory System
-Build sophisticated memory system for AI agents.
-## Requirements
-1. Episodic memory (specific events)
-2. Semantic memory (facts and concepts)
-3. Working memory management
-4. Memory consolidation (short to long term)
-5. Forgetting curves and decay
-6. Memory retrieval optimization
-7. Cross-session memory persistence
-8. Memory conflict resolution
-9. Privacy-aware memory handling
-10. Memory debugging and inspection

package/refs/vbenchmark/tasks/ai-integration/advanced/memory-system/task.yaml DELETED Viewed

@@ -1,16 +0,0 @@
-name: Long-term Memory System
-category: ai-integration
-subcategory: advanced
-description: Build memory system with episodic and semantic memory, retrieval optimization, and memory consolidation
-difficulty: hard
-stack: python
-timeout: 1500
-tokenLimit: 300000
-tests:
-  functional: tests/test_memory.py
-tags: [python, memory, embeddings, vector-db, retrieval]
-evaluation:
-  functional: 0.5
-  quality: 0.25
-  cost: 0.1
-  speed: 0.15

package/refs/vbenchmark/tasks/ai-integration/advanced/model-routing/PROMPT.md DELETED Viewed

@@ -1,15 +0,0 @@
-# Intelligent Model Router
-Build smart model routing for multi-model deployments.
-## Requirements
-1. Cost-based routing optimization
-2. Capability matching per request
-3. Fallback chain configuration
-4. A/B testing for model comparison
-5. Latency-based routing
-6. Request classification
-7. Budget enforcement per user/org
-8. Model health monitoring
-9. Graceful degradation
-10. Routing analytics dashboard

package/refs/vbenchmark/tasks/ai-integration/advanced/model-routing/task.yaml DELETED Viewed

@@ -1,16 +0,0 @@
-name: Intelligent Model Router
-category: ai-integration
-subcategory: advanced
-description: Build model router with cost optimization, capability matching, fallback chains, and A/B testing
-difficulty: hard
-stack: python
-timeout: 1500
-tokenLimit: 300000
-tests:
-  functional: tests/test_router.py
-tags: [python, llm, routing, optimization, load-balancing]
-evaluation:
-  functional: 0.5
-  quality: 0.25
-  cost: 0.1
-  speed: 0.15

package/refs/vbenchmark/tasks/ai-integration/advanced/multi-agent-system/PROMPT.md DELETED Viewed

@@ -1,15 +0,0 @@
-# Multi-Agent Collaboration System
-Build a system where multiple AI agents collaborate on complex tasks.
-## Requirements
-1. Agent role definition and specialization
-2. Task decomposition and delegation
-3. Inter-agent message passing protocol
-4. Conflict resolution mechanisms
-5. Consensus building for decisions
-6. Shared memory/context management
-7. Agent capability discovery
-8. Deadlock prevention
-9. Result aggregation and synthesis
-10. Performance monitoring per agent

package/refs/vbenchmark/tasks/ai-integration/advanced/multi-agent-system/task.yaml DELETED Viewed

@@ -1,16 +0,0 @@
-name: Multi-Agent Collaboration System
-category: ai-integration
-subcategory: advanced
-description: Build multi-agent system with task delegation, inter-agent communication, conflict resolution, and consensus mechanisms
-difficulty: hard
-stack: python
-timeout: 1500
-tokenLimit: 300000
-tests:
-  functional: tests/test_multiagent.py
-tags: [python, agents, llm, collaboration, orchestration]
-evaluation:
-  functional: 0.5
-  quality: 0.25
-  cost: 0.1
-  speed: 0.15

package/refs/vbenchmark/tasks/ai-integration/advanced/prompt-optimization/PROMPT.md DELETED Viewed

@@ -1,15 +0,0 @@
-# Automated Prompt Optimization
-Build automated prompt optimization system.
-## Requirements
-1. Prompt A/B testing framework
-2. Automatic prompt improvement suggestions
-3. Few-shot example optimization
-4. Prompt version control
-5. Performance regression detection
-6. Cost vs quality tradeoff analysis
-7. Prompt template management
-8. Variable injection with validation
-9. Multi-language prompt support
-10. Prompt performance dashboard

package/refs/vbenchmark/tasks/ai-integration/advanced/prompt-optimization/task.yaml DELETED Viewed

@@ -1,16 +0,0 @@
-name: Automated Prompt Optimization
-category: ai-integration
-subcategory: advanced
-description: Build prompt optimization system with automatic tuning, A/B testing, version control, and performance tracking
-difficulty: hard
-stack: python
-timeout: 1500
-tokenLimit: 300000
-tests:
-  functional: tests/test_prompt_opt.py
-tags: [python, prompts, optimization, dspy, evaluation]
-evaluation:
-  functional: 0.5
-  quality: 0.25
-  cost: 0.1
-  speed: 0.15

package/refs/vbenchmark/tasks/ai-integration/advanced/reasoning-chain/PROMPT.md DELETED Viewed

@@ -1,15 +0,0 @@
-# Chain of Thought Orchestration
-Build advanced reasoning orchestration system.
-## Requirements
-1. Multi-step reasoning chains
-2. Step-by-step verification
-3. Branching and parallel reasoning
-4. Self-correction on errors
-5. Confidence scoring per step
-6. Reasoning trace visualization
-7. Human-in-the-loop breakpoints
-8. Reasoning template library
-9. Performance optimization (caching)
-10. Reasoning analytics and debugging

package/refs/vbenchmark/tasks/ai-integration/advanced/reasoning-chain/task.yaml DELETED Viewed

@@ -1,16 +0,0 @@
-name: Chain of Thought Orchestration
-category: ai-integration
-subcategory: advanced
-description: Build reasoning orchestration with step verification, branching logic, and self-correction mechanisms
-difficulty: hard
-stack: python
-timeout: 1500
-tokenLimit: 300000
-tests:
-  functional: tests/test_reasoning.py
-tags: [python, reasoning, chain-of-thought, verification, llm]
-evaluation:
-  functional: 0.5
-  quality: 0.25
-  cost: 0.1
-  speed: 0.15

package/refs/vbenchmark/tasks/ai-integration/advanced/streaming-pipeline/PROMPT.md DELETED Viewed

@@ -1,15 +0,0 @@
-# LLM Streaming Pipeline
-Build advanced streaming infrastructure for LLM responses.
-## Requirements
-1. Token-level stream processing
-2. Real-time content transformation
-3. Multi-destination fanout (WebSocket, SSE, webhook)
-4. Backpressure handling
-5. Stream aggregation from multiple models
-6. Partial response caching
-7. Stream interruption handling
-8. Bandwidth optimization
-9. Client reconnection with resume
-10. Stream analytics and monitoring

package/refs/vbenchmark/tasks/ai-integration/advanced/streaming-pipeline/task.yaml DELETED Viewed

@@ -1,16 +0,0 @@
-name: LLM Streaming Pipeline
-category: ai-integration
-subcategory: advanced
-description: Build streaming pipeline with token-level processing, real-time transformation, and multi-destination fanout
-difficulty: hard
-stack: typescript
-timeout: 1500
-tokenLimit: 300000
-tests:
-  functional: tests/streaming.test.ts
-tags: [typescript, streaming, sse, websocket, transformation]
-evaluation:
-  functional: 0.5
-  quality: 0.25
-  cost: 0.1
-  speed: 0.15

package/refs/vbenchmark/tasks/ai-integration/advanced/tool-use-orchestration/PROMPT.md DELETED Viewed

@@ -1,15 +0,0 @@
-# Advanced Tool Use Orchestration
-Build sophisticated tool orchestration for LLM agents.
-## Requirements
-1. Parallel tool execution
-2. Tool dependency graph resolution
-3. Sandboxed code execution
-4. Error recovery and retry strategies
-5. Tool result caching
-6. Dynamic tool discovery
-7. Rate limiting per tool
-8. Tool permission management
-9. Execution timeout handling
-10. Tool usage analytics

package/refs/vbenchmark/tasks/ai-integration/advanced/tool-use-orchestration/task.yaml DELETED Viewed

@@ -1,16 +0,0 @@
-name: Advanced Tool Use Orchestration
-category: ai-integration
-subcategory: advanced
-description: Build tool orchestration with parallel execution, dependency resolution, error recovery, and sandboxed execution
-difficulty: hard
-stack: python
-timeout: 1500
-tokenLimit: 300000
-tests:
-  functional: tests/test_tools.py
-tags: [python, function-calling, tools, sandbox, orchestration]
-evaluation:
-  functional: 0.5
-  quality: 0.25
-  cost: 0.1
-  speed: 0.15