gsd-trae 1.0.0 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +40 -0
- package/README.md +7 -76
- package/assets/screenshot.png +0 -0
- package/package.json +12 -3
- package/.claude/settings.local.json +0 -8
- package/.gitmodules +0 -6
- package/.trae/project_rules.md +0 -56
- package/.trae/rules/project_rules.md +0 -56
- package/.vscode/code-counter/code-counter.db +0 -0
- package/.vscode/settings.json +0 -5
- package/refs/gsd/.github/CODEOWNERS +0 -2
- package/refs/gsd/.github/FUNDING.yml +0 -1
- package/refs/gsd/.github/ISSUE_TEMPLATE/bug_report.yml +0 -59
- package/refs/gsd/.github/ISSUE_TEMPLATE/feature_request.yml +0 -37
- package/refs/gsd/.github/pull_request_template.md +0 -24
- package/refs/gsd/.github/workflows/auto-label-issues.yml +0 -21
- package/refs/gsd/CHANGELOG.md +0 -1520
- package/refs/gsd/LICENSE +0 -21
- package/refs/gsd/README.md +0 -704
- package/refs/gsd/SECURITY.md +0 -33
- package/refs/gsd/agents/gsd-codebase-mapper.md +0 -764
- package/refs/gsd/agents/gsd-debugger.md +0 -1246
- package/refs/gsd/agents/gsd-executor.md +0 -469
- package/refs/gsd/agents/gsd-integration-checker.md +0 -443
- package/refs/gsd/agents/gsd-phase-researcher.md +0 -546
- package/refs/gsd/agents/gsd-plan-checker.md +0 -690
- package/refs/gsd/agents/gsd-planner.md +0 -1275
- package/refs/gsd/agents/gsd-project-researcher.md +0 -621
- package/refs/gsd/agents/gsd-research-synthesizer.md +0 -239
- package/refs/gsd/agents/gsd-roadmapper.md +0 -642
- package/refs/gsd/agents/gsd-verifier.md +0 -573
- package/refs/gsd/assets/gsd-logo-2000-transparent.png +0 -0
- package/refs/gsd/assets/gsd-logo-2000-transparent.svg +0 -17
- package/refs/gsd/assets/gsd-logo-2000.png +0 -0
- package/refs/gsd/assets/gsd-logo-2000.svg +0 -21
- package/refs/gsd/assets/terminal.svg +0 -68
- package/refs/gsd/bin/install.js +0 -2090
- package/refs/gsd/commands/gsd/add-phase.md +0 -43
- package/refs/gsd/commands/gsd/add-tests.md +0 -41
- package/refs/gsd/commands/gsd/add-todo.md +0 -47
- package/refs/gsd/commands/gsd/audit-milestone.md +0 -36
- package/refs/gsd/commands/gsd/check-todos.md +0 -45
- package/refs/gsd/commands/gsd/cleanup.md +0 -18
- package/refs/gsd/commands/gsd/complete-milestone.md +0 -136
- package/refs/gsd/commands/gsd/debug.md +0 -167
- package/refs/gsd/commands/gsd/discuss-phase.md +0 -83
- package/refs/gsd/commands/gsd/execute-phase.md +0 -41
- package/refs/gsd/commands/gsd/health.md +0 -22
- package/refs/gsd/commands/gsd/help.md +0 -22
- package/refs/gsd/commands/gsd/insert-phase.md +0 -32
- package/refs/gsd/commands/gsd/join-discord.md +0 -18
- package/refs/gsd/commands/gsd/list-phase-assumptions.md +0 -46
- package/refs/gsd/commands/gsd/map-codebase.md +0 -71
- package/refs/gsd/commands/gsd/new-milestone.md +0 -44
- package/refs/gsd/commands/gsd/new-project.md +0 -42
- package/refs/gsd/commands/gsd/new-project.md.bak +0 -1041
- package/refs/gsd/commands/gsd/pause-work.md +0 -38
- package/refs/gsd/commands/gsd/plan-milestone-gaps.md +0 -34
- package/refs/gsd/commands/gsd/plan-phase.md +0 -45
- package/refs/gsd/commands/gsd/progress.md +0 -24
- package/refs/gsd/commands/gsd/quick.md +0 -41
- package/refs/gsd/commands/gsd/reapply-patches.md +0 -110
- package/refs/gsd/commands/gsd/remove-phase.md +0 -31
- package/refs/gsd/commands/gsd/research-phase.md +0 -189
- package/refs/gsd/commands/gsd/resume-work.md +0 -40
- package/refs/gsd/commands/gsd/set-profile.md +0 -34
- package/refs/gsd/commands/gsd/settings.md +0 -36
- package/refs/gsd/commands/gsd/update.md +0 -37
- package/refs/gsd/commands/gsd/verify-work.md +0 -38
- package/refs/gsd/docs/USER-GUIDE.md +0 -471
- package/refs/gsd/docs/context-monitor.md +0 -96
- package/refs/gsd/get-shit-done/bin/gsd-tools.cjs +0 -585
- package/refs/gsd/get-shit-done/bin/lib/commands.cjs +0 -553
- package/refs/gsd/get-shit-done/bin/lib/config.cjs +0 -162
- package/refs/gsd/get-shit-done/bin/lib/core.cjs +0 -411
- package/refs/gsd/get-shit-done/bin/lib/frontmatter.cjs +0 -299
- package/refs/gsd/get-shit-done/bin/lib/init.cjs +0 -710
- package/refs/gsd/get-shit-done/bin/lib/milestone.cjs +0 -215
- package/refs/gsd/get-shit-done/bin/lib/phase.cjs +0 -870
- package/refs/gsd/get-shit-done/bin/lib/roadmap.cjs +0 -298
- package/refs/gsd/get-shit-done/bin/lib/state.cjs +0 -521
- package/refs/gsd/get-shit-done/bin/lib/template.cjs +0 -222
- package/refs/gsd/get-shit-done/bin/lib/verify.cjs +0 -772
- package/refs/gsd/get-shit-done/references/checkpoints.md +0 -776
- package/refs/gsd/get-shit-done/references/continuation-format.md +0 -249
- package/refs/gsd/get-shit-done/references/decimal-phase-calculation.md +0 -65
- package/refs/gsd/get-shit-done/references/git-integration.md +0 -248
- package/refs/gsd/get-shit-done/references/git-planning-commit.md +0 -38
- package/refs/gsd/get-shit-done/references/model-profile-resolution.md +0 -34
- package/refs/gsd/get-shit-done/references/model-profiles.md +0 -92
- package/refs/gsd/get-shit-done/references/phase-argument-parsing.md +0 -61
- package/refs/gsd/get-shit-done/references/planning-config.md +0 -196
- package/refs/gsd/get-shit-done/references/questioning.md +0 -145
- package/refs/gsd/get-shit-done/references/tdd.md +0 -263
- package/refs/gsd/get-shit-done/references/ui-brand.md +0 -160
- package/refs/gsd/get-shit-done/references/verification-patterns.md +0 -612
- package/refs/gsd/get-shit-done/templates/DEBUG.md +0 -164
- package/refs/gsd/get-shit-done/templates/UAT.md +0 -247
- package/refs/gsd/get-shit-done/templates/VALIDATION.md +0 -76
- package/refs/gsd/get-shit-done/templates/codebase/architecture.md +0 -255
- package/refs/gsd/get-shit-done/templates/codebase/concerns.md +0 -310
- package/refs/gsd/get-shit-done/templates/codebase/conventions.md +0 -307
- package/refs/gsd/get-shit-done/templates/codebase/integrations.md +0 -280
- package/refs/gsd/get-shit-done/templates/codebase/stack.md +0 -186
- package/refs/gsd/get-shit-done/templates/codebase/structure.md +0 -285
- package/refs/gsd/get-shit-done/templates/codebase/testing.md +0 -480
- package/refs/gsd/get-shit-done/templates/config.json +0 -37
- package/refs/gsd/get-shit-done/templates/context.md +0 -283
- package/refs/gsd/get-shit-done/templates/continue-here.md +0 -78
- package/refs/gsd/get-shit-done/templates/debug-subagent-prompt.md +0 -91
- package/refs/gsd/get-shit-done/templates/discovery.md +0 -146
- package/refs/gsd/get-shit-done/templates/milestone-archive.md +0 -123
- package/refs/gsd/get-shit-done/templates/milestone.md +0 -115
- package/refs/gsd/get-shit-done/templates/phase-prompt.md +0 -569
- package/refs/gsd/get-shit-done/templates/planner-subagent-prompt.md +0 -117
- package/refs/gsd/get-shit-done/templates/project.md +0 -184
- package/refs/gsd/get-shit-done/templates/requirements.md +0 -231
- package/refs/gsd/get-shit-done/templates/research-project/ARCHITECTURE.md +0 -204
- package/refs/gsd/get-shit-done/templates/research-project/FEATURES.md +0 -147
- package/refs/gsd/get-shit-done/templates/research-project/PITFALLS.md +0 -200
- package/refs/gsd/get-shit-done/templates/research-project/STACK.md +0 -120
- package/refs/gsd/get-shit-done/templates/research-project/SUMMARY.md +0 -170
- package/refs/gsd/get-shit-done/templates/research.md +0 -552
- package/refs/gsd/get-shit-done/templates/retrospective.md +0 -54
- package/refs/gsd/get-shit-done/templates/roadmap.md +0 -202
- package/refs/gsd/get-shit-done/templates/state.md +0 -176
- package/refs/gsd/get-shit-done/templates/summary-complex.md +0 -59
- package/refs/gsd/get-shit-done/templates/summary-minimal.md +0 -41
- package/refs/gsd/get-shit-done/templates/summary-standard.md +0 -48
- package/refs/gsd/get-shit-done/templates/summary.md +0 -248
- package/refs/gsd/get-shit-done/templates/user-setup.md +0 -311
- package/refs/gsd/get-shit-done/templates/verification-report.md +0 -322
- package/refs/gsd/get-shit-done/workflows/add-phase.md +0 -111
- package/refs/gsd/get-shit-done/workflows/add-tests.md +0 -350
- package/refs/gsd/get-shit-done/workflows/add-todo.md +0 -157
- package/refs/gsd/get-shit-done/workflows/audit-milestone.md +0 -297
- package/refs/gsd/get-shit-done/workflows/check-todos.md +0 -176
- package/refs/gsd/get-shit-done/workflows/cleanup.md +0 -152
- package/refs/gsd/get-shit-done/workflows/complete-milestone.md +0 -763
- package/refs/gsd/get-shit-done/workflows/diagnose-issues.md +0 -219
- package/refs/gsd/get-shit-done/workflows/discovery-phase.md +0 -289
- package/refs/gsd/get-shit-done/workflows/discuss-phase.md +0 -542
- package/refs/gsd/get-shit-done/workflows/execute-phase.md +0 -449
- package/refs/gsd/get-shit-done/workflows/execute-plan.md +0 -448
- package/refs/gsd/get-shit-done/workflows/health.md +0 -156
- package/refs/gsd/get-shit-done/workflows/help.md +0 -489
- package/refs/gsd/get-shit-done/workflows/insert-phase.md +0 -129
- package/refs/gsd/get-shit-done/workflows/list-phase-assumptions.md +0 -178
- package/refs/gsd/get-shit-done/workflows/map-codebase.md +0 -315
- package/refs/gsd/get-shit-done/workflows/new-milestone.md +0 -382
- package/refs/gsd/get-shit-done/workflows/new-project.md +0 -1116
- package/refs/gsd/get-shit-done/workflows/pause-work.md +0 -122
- package/refs/gsd/get-shit-done/workflows/plan-milestone-gaps.md +0 -274
- package/refs/gsd/get-shit-done/workflows/plan-phase.md +0 -569
- package/refs/gsd/get-shit-done/workflows/progress.md +0 -381
- package/refs/gsd/get-shit-done/workflows/quick.md +0 -453
- package/refs/gsd/get-shit-done/workflows/remove-phase.md +0 -154
- package/refs/gsd/get-shit-done/workflows/research-phase.md +0 -73
- package/refs/gsd/get-shit-done/workflows/resume-project.md +0 -306
- package/refs/gsd/get-shit-done/workflows/set-profile.md +0 -80
- package/refs/gsd/get-shit-done/workflows/settings.md +0 -213
- package/refs/gsd/get-shit-done/workflows/transition.md +0 -544
- package/refs/gsd/get-shit-done/workflows/update.md +0 -219
- package/refs/gsd/get-shit-done/workflows/verify-phase.md +0 -242
- package/refs/gsd/get-shit-done/workflows/verify-work.md +0 -569
- package/refs/gsd/hooks/gsd-check-update.js +0 -62
- package/refs/gsd/hooks/gsd-context-monitor.js +0 -122
- package/refs/gsd/hooks/gsd-statusline.js +0 -108
- package/refs/gsd/package.json +0 -50
- package/refs/gsd/scripts/build-hooks.js +0 -43
- package/refs/gsd/tests/commands.test.cjs +0 -661
- package/refs/gsd/tests/helpers.cjs +0 -40
- package/refs/gsd/tests/init.test.cjs +0 -205
- package/refs/gsd/tests/milestone.test.cjs +0 -98
- package/refs/gsd/tests/phase.test.cjs +0 -1241
- package/refs/gsd/tests/roadmap.test.cjs +0 -265
- package/refs/gsd/tests/state.test.cjs +0 -302
- package/refs/gsd/tests/verify.test.cjs +0 -80
- package/refs/vbenchmark/.agent/agents/codebase-explorer.md +0 -224
- package/refs/vbenchmark/.agent/agents/debugger.md +0 -180
- package/refs/vbenchmark/.agent/agents/documenter.md +0 -166
- package/refs/vbenchmark/.agent/agents/implementer.md +0 -70
- package/refs/vbenchmark/.agent/agents/orchestrator.md +0 -212
- package/refs/vbenchmark/.agent/agents/researcher.md +0 -80
- package/refs/vbenchmark/.agent/agents/reviewer.md +0 -184
- package/refs/vbenchmark/.agent/agents/tester.md +0 -170
- package/refs/vbenchmark/.agent/commands/commit.md +0 -29
- package/refs/vbenchmark/.agent/commands/debug.md +0 -59
- package/refs/vbenchmark/.agent/commands/document.md +0 -52
- package/refs/vbenchmark/.agent/commands/gather-context.md +0 -58
- package/refs/vbenchmark/.agent/commands/init.md +0 -56
- package/refs/vbenchmark/.agent/commands/preset-help.md +0 -50
- package/refs/vbenchmark/.agent/commands/refactor.md +0 -71
- package/refs/vbenchmark/.agent/commands/research.md +0 -37
- package/refs/vbenchmark/.agent/commands/review.md +0 -38
- package/refs/vbenchmark/.agent/commands/test.md +0 -61
- package/refs/vbenchmark/.agent/rules/01-code-quality.md +0 -33
- package/refs/vbenchmark/.agent/rules/02-typescript-go.md +0 -46
- package/refs/vbenchmark/.agent/rules/03-security-git.md +0 -34
- package/refs/vbenchmark/.agent/rules/04-architecture.md +0 -40
- package/refs/vbenchmark/.agent/sync.js +0 -536
- package/refs/vbenchmark/.agent/workflows/commit.md +0 -29
- package/refs/vbenchmark/.agent/workflows/debug.md +0 -59
- package/refs/vbenchmark/.agent/workflows/document.md +0 -52
- package/refs/vbenchmark/.agent/workflows/gather-context.md +0 -58
- package/refs/vbenchmark/.agent/workflows/init.md +0 -56
- package/refs/vbenchmark/.agent/workflows/preset-help.md +0 -50
- package/refs/vbenchmark/.agent/workflows/refactor.md +0 -71
- package/refs/vbenchmark/.agent/workflows/research.md +0 -37
- package/refs/vbenchmark/.agent/workflows/review.md +0 -38
- package/refs/vbenchmark/.agent/workflows/test.md +0 -61
- package/refs/vbenchmark/.claude/commands/agentic-dev/apply.md +0 -222
- package/refs/vbenchmark/.claude/commands/agentic-dev/done.md +0 -166
- package/refs/vbenchmark/.claude/commands/agentic-dev/proposal.md +0 -220
- package/refs/vbenchmark/.claude/commands/openspec/apply.md +0 -23
- package/refs/vbenchmark/.claude/commands/openspec/archive.md +0 -27
- package/refs/vbenchmark/.claude/commands/openspec/proposal.md +0 -28
- package/refs/vbenchmark/.clinerules/01-rules.md +0 -73
- package/refs/vbenchmark/.clinerules/02-agents.md +0 -34
- package/refs/vbenchmark/.cursor/commands/commit.md +0 -29
- package/refs/vbenchmark/.cursor/commands/debug.md +0 -59
- package/refs/vbenchmark/.cursor/commands/document.md +0 -52
- package/refs/vbenchmark/.cursor/commands/gather-context.md +0 -58
- package/refs/vbenchmark/.cursor/commands/init.md +0 -56
- package/refs/vbenchmark/.cursor/commands/preset-help.md +0 -50
- package/refs/vbenchmark/.cursor/commands/refactor.md +0 -71
- package/refs/vbenchmark/.cursor/commands/research.md +0 -37
- package/refs/vbenchmark/.cursor/commands/review.md +0 -38
- package/refs/vbenchmark/.cursor/commands/test.md +0 -61
- package/refs/vbenchmark/.cursor/rules/agents.mdc +0 -1357
- package/refs/vbenchmark/.factory/droids/codebase-explorer.md +0 -224
- package/refs/vbenchmark/.factory/droids/debugger.md +0 -180
- package/refs/vbenchmark/.factory/droids/documenter.md +0 -166
- package/refs/vbenchmark/.factory/droids/implementer.md +0 -70
- package/refs/vbenchmark/.factory/droids/orchestrator.md +0 -212
- package/refs/vbenchmark/.factory/droids/researcher.md +0 -80
- package/refs/vbenchmark/.factory/droids/reviewer.md +0 -184
- package/refs/vbenchmark/.factory/droids/tester.md +0 -170
- package/refs/vbenchmark/.gemini/workflows/commit.md +0 -29
- package/refs/vbenchmark/.gemini/workflows/debug.md +0 -59
- package/refs/vbenchmark/.gemini/workflows/document.md +0 -52
- package/refs/vbenchmark/.gemini/workflows/gather-context.md +0 -58
- package/refs/vbenchmark/.gemini/workflows/init.md +0 -56
- package/refs/vbenchmark/.gemini/workflows/preset-help.md +0 -50
- package/refs/vbenchmark/.gemini/workflows/refactor.md +0 -71
- package/refs/vbenchmark/.gemini/workflows/research.md +0 -37
- package/refs/vbenchmark/.gemini/workflows/review.md +0 -38
- package/refs/vbenchmark/.gemini/workflows/test.md +0 -61
- package/refs/vbenchmark/.github/CODEOWNERS +0 -20
- package/refs/vbenchmark/.github/FUNDING.yml +0 -4
- package/refs/vbenchmark/.github/ISSUE_TEMPLATE/bug-report.yml +0 -76
- package/refs/vbenchmark/.github/ISSUE_TEMPLATE/new-task.yml +0 -106
- package/refs/vbenchmark/.github/PULL_REQUEST_TEMPLATE.md +0 -38
- package/refs/vbenchmark/.github/copilot-instructions.md +0 -73
- package/refs/vbenchmark/.github/workflows/ci.yaml +0 -33
- package/refs/vbenchmark/.github/workflows/vercel-auto-pr.yml +0 -478
- package/refs/vbenchmark/.github/workflows/vercel-deploy.yaml +0 -487
- package/refs/vbenchmark/.github/workflows/vercel-pr-command.yaml +0 -337
- package/refs/vbenchmark/.github/workflows/vercel-project-init.yaml +0 -208
- package/refs/vbenchmark/.opencode/agent/codebase-explorer.md +0 -224
- package/refs/vbenchmark/.opencode/agent/debugger.md +0 -180
- package/refs/vbenchmark/.opencode/agent/documenter.md +0 -166
- package/refs/vbenchmark/.opencode/agent/implementer.md +0 -70
- package/refs/vbenchmark/.opencode/agent/orchestrator.md +0 -212
- package/refs/vbenchmark/.opencode/agent/researcher.md +0 -80
- package/refs/vbenchmark/.opencode/agent/reviewer.md +0 -184
- package/refs/vbenchmark/.opencode/agent/tester.md +0 -170
- package/refs/vbenchmark/.opencode/command/commit.md +0 -29
- package/refs/vbenchmark/.opencode/command/debug.md +0 -59
- package/refs/vbenchmark/.opencode/command/document.md +0 -52
- package/refs/vbenchmark/.opencode/command/gather-context.md +0 -58
- package/refs/vbenchmark/.opencode/command/init.md +0 -56
- package/refs/vbenchmark/.opencode/command/preset-help.md +0 -50
- package/refs/vbenchmark/.opencode/command/refactor.md +0 -71
- package/refs/vbenchmark/.opencode/command/research.md +0 -37
- package/refs/vbenchmark/.opencode/command/review.md +0 -38
- package/refs/vbenchmark/.opencode/command/test.md +0 -61
- package/refs/vbenchmark/.trae/project_rules.md +0 -73
- package/refs/vbenchmark/.windsurf/rules/rules.md +0 -85
- package/refs/vbenchmark/AGENTS.md +0 -73
- package/refs/vbenchmark/CONTRIBUTING.md +0 -332
- package/refs/vbenchmark/Caddyfile +0 -3
- package/refs/vbenchmark/LICENSE +0 -47
- package/refs/vbenchmark/README.md +0 -354
- package/refs/vbenchmark/docker-compose.prod.yaml +0 -35
- package/refs/vbenchmark/docker-compose.yaml +0 -53
- package/refs/vbenchmark/docs/TASK_EXPANSION_PLAN.md +0 -211
- package/refs/vbenchmark/docs/THESIS.md +0 -441
- package/refs/vbenchmark/docs/categories/code-evolution.md +0 -138
- package/refs/vbenchmark/openspec/changes/init-vibecodingbench/design.md +0 -111
- package/refs/vbenchmark/openspec/changes/init-vibecodingbench/proposal.md +0 -15
- package/refs/vbenchmark/openspec/changes/init-vibecodingbench/specs/evaluation/spec.md +0 -105
- package/refs/vbenchmark/openspec/changes/init-vibecodingbench/specs/leaderboard/spec.md +0 -68
- package/refs/vbenchmark/openspec/changes/init-vibecodingbench/specs/task-definition/spec.md +0 -45
- package/refs/vbenchmark/openspec/changes/init-vibecodingbench/specs/task-runner/spec.md +0 -49
- package/refs/vbenchmark/openspec/changes/init-vibecodingbench/tasks.md +0 -413
- package/refs/vbenchmark/package.json +0 -51
- package/refs/vbenchmark/packages/cli/eslint.config.js +0 -16
- package/refs/vbenchmark/packages/cli/package.json +0 -35
- package/refs/vbenchmark/packages/cli/src/agents/index.ts +0 -655
- package/refs/vbenchmark/packages/cli/src/commands/eval.ts +0 -197
- package/refs/vbenchmark/packages/cli/src/commands/list.ts +0 -63
- package/refs/vbenchmark/packages/cli/src/commands/run.ts +0 -147
- package/refs/vbenchmark/packages/cli/src/evaluator.ts +0 -125
- package/refs/vbenchmark/packages/cli/src/index.ts +0 -21
- package/refs/vbenchmark/packages/cli/src/lib/task-variation.ts +0 -153
- package/refs/vbenchmark/packages/cli/src/loader.ts +0 -258
- package/refs/vbenchmark/packages/cli/src/reporter.ts +0 -222
- package/refs/vbenchmark/packages/cli/src/runtime/docker.ts +0 -385
- package/refs/vbenchmark/packages/cli/tsconfig.json +0 -8
- package/refs/vbenchmark/packages/dashboard/Dockerfile +0 -42
- package/refs/vbenchmark/packages/dashboard/index.html +0 -21
- package/refs/vbenchmark/packages/dashboard/package.json +0 -29
- package/refs/vbenchmark/packages/dashboard/postcss.config.js +0 -6
- package/refs/vbenchmark/packages/dashboard/public/favicon.svg +0 -24
- package/refs/vbenchmark/packages/dashboard/public/logo.png +0 -0
- package/refs/vbenchmark/packages/dashboard/public/logo.svg +0 -39
- package/refs/vbenchmark/packages/dashboard/src/App.tsx +0 -1468
- package/refs/vbenchmark/packages/dashboard/src/data/category-performance.json +0 -1
- package/refs/vbenchmark/packages/dashboard/src/data/leaderboard.json +0 -1
- package/refs/vbenchmark/packages/dashboard/src/data/task-results.json +0 -1
- package/refs/vbenchmark/packages/dashboard/src/data/tasks.json +0 -1
- package/refs/vbenchmark/packages/dashboard/src/index.css +0 -3
- package/refs/vbenchmark/packages/dashboard/src/main.tsx +0 -13
- package/refs/vbenchmark/packages/dashboard/src/vite-env.d.ts +0 -9
- package/refs/vbenchmark/packages/dashboard/tailwind.config.js +0 -11
- package/refs/vbenchmark/packages/dashboard/tsconfig.json +0 -21
- package/refs/vbenchmark/packages/dashboard/tsconfig.node.json +0 -11
- package/refs/vbenchmark/packages/dashboard/vercel.json +0 -6
- package/refs/vbenchmark/packages/dashboard/vite.config.ts +0 -28
- package/refs/vbenchmark/packages/evaluator/eslint.config.js +0 -16
- package/refs/vbenchmark/packages/evaluator/package.json +0 -24
- package/refs/vbenchmark/packages/evaluator/src/index.ts +0 -15
- package/refs/vbenchmark/packages/evaluator/src/runners/functional.ts +0 -88
- package/refs/vbenchmark/packages/evaluator/src/runners/quality.ts +0 -140
- package/refs/vbenchmark/packages/evaluator/src/runners/security.ts +0 -94
- package/refs/vbenchmark/packages/evaluator/src/runners/visual.ts +0 -108
- package/refs/vbenchmark/packages/evaluator/src/types.d.ts +0 -19
- package/refs/vbenchmark/packages/evaluator/tsconfig.json +0 -8
- package/refs/vbenchmark/packages/leaderboard/Dockerfile +0 -38
- package/refs/vbenchmark/packages/leaderboard/drizzle.config.ts +0 -10
- package/refs/vbenchmark/packages/leaderboard/eslint.config.js +0 -16
- package/refs/vbenchmark/packages/leaderboard/fly.toml +0 -29
- package/refs/vbenchmark/packages/leaderboard/package.json +0 -36
- package/refs/vbenchmark/packages/leaderboard/src/app.ts +0 -29
- package/refs/vbenchmark/packages/leaderboard/src/components/BrowserPreview.tsx +0 -190
- package/refs/vbenchmark/packages/leaderboard/src/components/ComparisonView.tsx +0 -205
- package/refs/vbenchmark/packages/leaderboard/src/components/LeaderboardTable.tsx +0 -150
- package/refs/vbenchmark/packages/leaderboard/src/components/LiveRunCard.tsx +0 -133
- package/refs/vbenchmark/packages/leaderboard/src/components/SubmissionForm.tsx +0 -406
- package/refs/vbenchmark/packages/leaderboard/src/components/SubmitForm.tsx +0 -293
- package/refs/vbenchmark/packages/leaderboard/src/components/TerminalStream.tsx +0 -111
- package/refs/vbenchmark/packages/leaderboard/src/config/pricing.ts +0 -206
- package/refs/vbenchmark/packages/leaderboard/src/db/index.ts +0 -31
- package/refs/vbenchmark/packages/leaderboard/src/db/schema.ts +0 -125
- package/refs/vbenchmark/packages/leaderboard/src/index.ts +0 -13
- package/refs/vbenchmark/packages/leaderboard/src/lib/websocket.ts +0 -124
- package/refs/vbenchmark/packages/leaderboard/src/routes/leaderboard.ts +0 -698
- package/refs/vbenchmark/packages/leaderboard/src/routes/live.ts +0 -175
- package/refs/vbenchmark/packages/leaderboard/src/routes/submissions.ts +0 -183
- package/refs/vbenchmark/packages/leaderboard/src/routes/tasks.ts +0 -215
- package/refs/vbenchmark/packages/leaderboard/tests/api.test.ts +0 -228
- package/refs/vbenchmark/packages/leaderboard/tsconfig.json +0 -9
- package/refs/vbenchmark/scripts/deploy.sh +0 -70
- package/refs/vbenchmark/tasks/ai-integration/advanced/context-management/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/ai-integration/advanced/context-management/task.yaml +0 -16
- package/refs/vbenchmark/tasks/ai-integration/advanced/evaluation-framework/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/ai-integration/advanced/evaluation-framework/task.yaml +0 -16
- package/refs/vbenchmark/tasks/ai-integration/advanced/guardrails-safety/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/ai-integration/advanced/guardrails-safety/task.yaml +0 -16
- package/refs/vbenchmark/tasks/ai-integration/advanced/memory-system/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/ai-integration/advanced/memory-system/task.yaml +0 -16
- package/refs/vbenchmark/tasks/ai-integration/advanced/model-routing/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/ai-integration/advanced/model-routing/task.yaml +0 -16
- package/refs/vbenchmark/tasks/ai-integration/advanced/multi-agent-system/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/ai-integration/advanced/multi-agent-system/task.yaml +0 -16
- package/refs/vbenchmark/tasks/ai-integration/advanced/prompt-optimization/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/ai-integration/advanced/prompt-optimization/task.yaml +0 -16
- package/refs/vbenchmark/tasks/ai-integration/advanced/reasoning-chain/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/ai-integration/advanced/reasoning-chain/task.yaml +0 -16
- package/refs/vbenchmark/tasks/ai-integration/advanced/streaming-pipeline/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/ai-integration/advanced/streaming-pipeline/task.yaml +0 -16
- package/refs/vbenchmark/tasks/ai-integration/advanced/tool-use-orchestration/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/ai-integration/advanced/tool-use-orchestration/task.yaml +0 -16
- package/refs/vbenchmark/tasks/ai-integration/agents/code-review-agent/PROMPT.md +0 -64
- package/refs/vbenchmark/tasks/ai-integration/agents/code-review-agent/task.yaml +0 -24
- package/refs/vbenchmark/tasks/ai-integration/agents/research-agent/PROMPT.md +0 -61
- package/refs/vbenchmark/tasks/ai-integration/agents/research-agent/task.yaml +0 -24
- package/refs/vbenchmark/tasks/ai-integration/agents/web-scraper-agent/PROMPT.md +0 -57
- package/refs/vbenchmark/tasks/ai-integration/agents/web-scraper-agent/task.yaml +0 -24
- package/refs/vbenchmark/tasks/ai-integration/embeddings/duplicate-detection/PROMPT.md +0 -50
- package/refs/vbenchmark/tasks/ai-integration/embeddings/duplicate-detection/task.yaml +0 -24
- package/refs/vbenchmark/tasks/ai-integration/embeddings/recommendation-engine/PROMPT.md +0 -51
- package/refs/vbenchmark/tasks/ai-integration/embeddings/recommendation-engine/task.yaml +0 -24
- package/refs/vbenchmark/tasks/ai-integration/embeddings/semantic-search/PROMPT.md +0 -50
- package/refs/vbenchmark/tasks/ai-integration/embeddings/semantic-search/task.yaml +0 -24
- package/refs/vbenchmark/tasks/ai-integration/fine-tuning/classification-model/PROMPT.md +0 -50
- package/refs/vbenchmark/tasks/ai-integration/fine-tuning/classification-model/task.yaml +0 -24
- package/refs/vbenchmark/tasks/ai-integration/function-calling/api-orchestrator/PROMPT.md +0 -60
- package/refs/vbenchmark/tasks/ai-integration/function-calling/api-orchestrator/task.yaml +0 -24
- package/refs/vbenchmark/tasks/ai-integration/function-calling/calendar-assistant/PROMPT.md +0 -50
- package/refs/vbenchmark/tasks/ai-integration/function-calling/calendar-assistant/task.yaml +0 -24
- package/refs/vbenchmark/tasks/ai-integration/function-calling/database-query/PROMPT.md +0 -62
- package/refs/vbenchmark/tasks/ai-integration/function-calling/database-query/task.yaml +0 -24
- package/refs/vbenchmark/tasks/ai-integration/multimodal/chart-interpreter/PROMPT.md +0 -60
- package/refs/vbenchmark/tasks/ai-integration/multimodal/chart-interpreter/task.yaml +0 -24
- package/refs/vbenchmark/tasks/ai-integration/multimodal/image-captioning/PROMPT.md +0 -49
- package/refs/vbenchmark/tasks/ai-integration/multimodal/image-captioning/task.yaml +0 -24
- package/refs/vbenchmark/tasks/ai-integration/rag-chatbot/code-assistant/PROMPT.md +0 -51
- package/refs/vbenchmark/tasks/ai-integration/rag-chatbot/code-assistant/task.yaml +0 -24
- package/refs/vbenchmark/tasks/ai-integration/rag-chatbot/doc-search/PROMPT.md +0 -51
- package/refs/vbenchmark/tasks/ai-integration/rag-chatbot/doc-search/task.yaml +0 -24
- package/refs/vbenchmark/tasks/ai-integration/rag-chatbot/pdf-qa/PROMPT.md +0 -76
- package/refs/vbenchmark/tasks/ai-integration/rag-chatbot/pdf-qa/docker-compose.yaml +0 -30
- package/refs/vbenchmark/tasks/ai-integration/rag-chatbot/pdf-qa/task.yaml +0 -30
- package/refs/vbenchmark/tasks/ai-integration/rag-chatbot/pdf-qa/tests/functional/qa.test.py +0 -146
- package/refs/vbenchmark/tasks/ai-integration/rag-chatbot/support-bot/PROMPT.md +0 -51
- package/refs/vbenchmark/tasks/ai-integration/rag-chatbot/support-bot/task.yaml +0 -24
- package/refs/vbenchmark/tasks/ai-integration/structured-output/contract-analyzer/PROMPT.md +0 -67
- package/refs/vbenchmark/tasks/ai-integration/structured-output/contract-analyzer/task.yaml +0 -24
- package/refs/vbenchmark/tasks/ai-integration/structured-output/invoice-parser/PROMPT.md +0 -61
- package/refs/vbenchmark/tasks/ai-integration/structured-output/invoice-parser/task.yaml +0 -27
- package/refs/vbenchmark/tasks/ai-integration/structured-output/receipt-scanner/PROMPT.md +0 -65
- package/refs/vbenchmark/tasks/ai-integration/structured-output/receipt-scanner/task.yaml +0 -24
- package/refs/vbenchmark/tasks/ai-integration/structured-output/resume-parser/PROMPT.md +0 -70
- package/refs/vbenchmark/tasks/ai-integration/structured-output/resume-parser/task.yaml +0 -24
- package/refs/vbenchmark/tasks/api-integrations/advanced/api-analytics/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/api-integrations/advanced/api-analytics/task.yaml +0 -16
- package/refs/vbenchmark/tasks/api-integrations/advanced/api-gateway/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/api-integrations/advanced/api-gateway/task.yaml +0 -16
- package/refs/vbenchmark/tasks/api-integrations/advanced/api-mocking/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/api-integrations/advanced/api-mocking/task.yaml +0 -16
- package/refs/vbenchmark/tasks/api-integrations/advanced/contract-testing/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/api-integrations/advanced/contract-testing/task.yaml +0 -16
- package/refs/vbenchmark/tasks/api-integrations/advanced/graphql-federation/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/api-integrations/advanced/graphql-federation/task.yaml +0 -16
- package/refs/vbenchmark/tasks/api-integrations/advanced/grpc-gateway/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/api-integrations/advanced/grpc-gateway/task.yaml +0 -16
- package/refs/vbenchmark/tasks/api-integrations/advanced/rate-limiter/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/api-integrations/advanced/rate-limiter/task.yaml +0 -16
- package/refs/vbenchmark/tasks/api-integrations/advanced/request-validator/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/api-integrations/advanced/request-validator/task.yaml +0 -16
- package/refs/vbenchmark/tasks/api-integrations/advanced/sdk-generator/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/api-integrations/advanced/sdk-generator/task.yaml +0 -16
- package/refs/vbenchmark/tasks/api-integrations/advanced/webhook-processor/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/api-integrations/advanced/webhook-processor/task.yaml +0 -16
- package/refs/vbenchmark/tasks/api-integrations/analytics/mixpanel-events/PROMPT.md +0 -42
- package/refs/vbenchmark/tasks/api-integrations/analytics/mixpanel-events/task.yaml +0 -24
- package/refs/vbenchmark/tasks/api-integrations/analytics/segment-tracking/PROMPT.md +0 -42
- package/refs/vbenchmark/tasks/api-integrations/analytics/segment-tracking/task.yaml +0 -24
- package/refs/vbenchmark/tasks/api-integrations/auth-provider/oauth2-github/PROMPT.md +0 -42
- package/refs/vbenchmark/tasks/api-integrations/auth-provider/oauth2-github/task.yaml +0 -24
- package/refs/vbenchmark/tasks/api-integrations/auth-provider/okta-integration/PROMPT.md +0 -44
- package/refs/vbenchmark/tasks/api-integrations/auth-provider/okta-integration/task.yaml +0 -24
- package/refs/vbenchmark/tasks/api-integrations/auth-provider/saml-sso/PROMPT.md +0 -42
- package/refs/vbenchmark/tasks/api-integrations/auth-provider/saml-sso/task.yaml +0 -24
- package/refs/vbenchmark/tasks/api-integrations/communication/discord-webhook/PROMPT.md +0 -44
- package/refs/vbenchmark/tasks/api-integrations/communication/discord-webhook/task.yaml +0 -24
- package/refs/vbenchmark/tasks/api-integrations/communication/slack-bot/PROMPT.md +0 -42
- package/refs/vbenchmark/tasks/api-integrations/communication/slack-bot/task.yaml +0 -24
- package/refs/vbenchmark/tasks/api-integrations/communication/twilio-sms/PROMPT.md +0 -42
- package/refs/vbenchmark/tasks/api-integrations/communication/twilio-sms/task.yaml +0 -24
- package/refs/vbenchmark/tasks/api-integrations/email/transactional/PROMPT.md +0 -82
- package/refs/vbenchmark/tasks/api-integrations/email/transactional/task.yaml +0 -27
- package/refs/vbenchmark/tasks/api-integrations/maps/google-maps-geocoding/PROMPT.md +0 -41
- package/refs/vbenchmark/tasks/api-integrations/maps/google-maps-geocoding/task.yaml +0 -24
- package/refs/vbenchmark/tasks/api-integrations/maps/mapbox-directions/PROMPT.md +0 -41
- package/refs/vbenchmark/tasks/api-integrations/maps/mapbox-directions/task.yaml +0 -24
- package/refs/vbenchmark/tasks/api-integrations/payment/crypto-payments/PROMPT.md +0 -43
- package/refs/vbenchmark/tasks/api-integrations/payment/crypto-payments/task.yaml +0 -24
- package/refs/vbenchmark/tasks/api-integrations/payment/paypal-integration/PROMPT.md +0 -41
- package/refs/vbenchmark/tasks/api-integrations/payment/paypal-integration/task.yaml +0 -24
- package/refs/vbenchmark/tasks/api-integrations/social/twitter-api/PROMPT.md +0 -41
- package/refs/vbenchmark/tasks/api-integrations/social/twitter-api/task.yaml +0 -24
- package/refs/vbenchmark/tasks/api-integrations/storage/cloudinary-upload/PROMPT.md +0 -43
- package/refs/vbenchmark/tasks/api-integrations/storage/cloudinary-upload/task.yaml +0 -24
- package/refs/vbenchmark/tasks/api-integrations/storage/gcs-streaming/PROMPT.md +0 -43
- package/refs/vbenchmark/tasks/api-integrations/storage/gcs-streaming/task.yaml +0 -24
- package/refs/vbenchmark/tasks/api-integrations/storage/s3-presigned-urls/PROMPT.md +0 -41
- package/refs/vbenchmark/tasks/api-integrations/storage/s3-presigned-urls/task.yaml +0 -24
- package/refs/vbenchmark/tasks/api-integrations/stripe/checkout-session/PROMPT.md +0 -41
- package/refs/vbenchmark/tasks/api-integrations/stripe/checkout-session/task.yaml +0 -24
- package/refs/vbenchmark/tasks/api-integrations/stripe/payment-webhook/PROMPT.md +0 -60
- package/refs/vbenchmark/tasks/api-integrations/stripe/payment-webhook/docker-compose.yaml +0 -38
- package/refs/vbenchmark/tasks/api-integrations/stripe/payment-webhook/task.yaml +0 -31
- package/refs/vbenchmark/tasks/api-integrations/stripe/payment-webhook/tests/webhook.test.ts +0 -193
- package/refs/vbenchmark/tasks/api-integrations/stripe/subscription-portal/PROMPT.md +0 -41
- package/refs/vbenchmark/tasks/api-integrations/stripe/subscription-portal/task.yaml +0 -24
- package/refs/vbenchmark/tasks/code-evolution/advanced/api-deprecation/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/code-evolution/advanced/api-deprecation/task.yaml +0 -16
- package/refs/vbenchmark/tasks/code-evolution/advanced/ast-refactoring/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/code-evolution/advanced/ast-refactoring/task.yaml +0 -16
- package/refs/vbenchmark/tasks/code-evolution/advanced/concurrency-fix/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/code-evolution/advanced/concurrency-fix/task.yaml +0 -16
- package/refs/vbenchmark/tasks/code-evolution/advanced/database-schema-migration/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/code-evolution/advanced/database-schema-migration/task.yaml +0 -16
- package/refs/vbenchmark/tasks/code-evolution/advanced/dead-code-elimination/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/code-evolution/advanced/dead-code-elimination/task.yaml +0 -16
- package/refs/vbenchmark/tasks/code-evolution/advanced/dependency-upgrade/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/code-evolution/advanced/dependency-upgrade/task.yaml +0 -16
- package/refs/vbenchmark/tasks/code-evolution/advanced/memory-optimization/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/code-evolution/advanced/memory-optimization/task.yaml +0 -16
- package/refs/vbenchmark/tasks/code-evolution/advanced/monorepo-extraction/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/code-evolution/advanced/monorepo-extraction/task.yaml +0 -16
- package/refs/vbenchmark/tasks/code-evolution/advanced/performance-profiling/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/code-evolution/advanced/performance-profiling/task.yaml +0 -16
- package/refs/vbenchmark/tasks/code-evolution/advanced/type-migration/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/code-evolution/advanced/type-migration/task.yaml +0 -16
- package/refs/vbenchmark/tasks/code-evolution/legacy-migration/callback-to-async/PROMPT.md +0 -47
- package/refs/vbenchmark/tasks/code-evolution/legacy-migration/callback-to-async/task.yaml +0 -24
- package/refs/vbenchmark/tasks/code-evolution/legacy-migration/express-to-fastify/PROMPT.md +0 -49
- package/refs/vbenchmark/tasks/code-evolution/legacy-migration/express-to-fastify/base-code/src/app.ts +0 -22
- package/refs/vbenchmark/tasks/code-evolution/legacy-migration/express-to-fastify/task.yaml +0 -37
- package/refs/vbenchmark/tasks/code-evolution/legacy-migration/express-to-fastify/tests/api.test.ts +0 -70
- package/refs/vbenchmark/tasks/code-evolution/legacy-migration/flask-to-fastapi/PROMPT.md +0 -46
- package/refs/vbenchmark/tasks/code-evolution/legacy-migration/flask-to-fastapi/task.yaml +0 -24
- package/refs/vbenchmark/tasks/code-evolution/legacy-migration/java-to-kotlin/PROMPT.md +0 -45
- package/refs/vbenchmark/tasks/code-evolution/legacy-migration/java-to-kotlin/task.yaml +0 -24
- package/refs/vbenchmark/tasks/code-evolution/legacy-migration/jquery-to-react/PROMPT.md +0 -47
- package/refs/vbenchmark/tasks/code-evolution/legacy-migration/jquery-to-react/task.yaml +0 -24
- package/refs/vbenchmark/tasks/code-evolution/legacy-migration/rest-to-grpc/PROMPT.md +0 -47
- package/refs/vbenchmark/tasks/code-evolution/legacy-migration/rest-to-grpc/task.yaml +0 -24
- package/refs/vbenchmark/tasks/code-evolution/performance/async-refactor/PROMPT.md +0 -47
- package/refs/vbenchmark/tasks/code-evolution/performance/async-refactor/task.yaml +0 -24
- package/refs/vbenchmark/tasks/code-evolution/performance/memory-leak-fix/PROMPT.md +0 -47
- package/refs/vbenchmark/tasks/code-evolution/performance/memory-leak-fix/task.yaml +0 -24
- package/refs/vbenchmark/tasks/code-evolution/performance/query-optimization/PROMPT.md +0 -49
- package/refs/vbenchmark/tasks/code-evolution/performance/query-optimization/task.yaml +0 -24
- package/refs/vbenchmark/tasks/code-evolution/refactoring/class-to-hooks/PROMPT.md +0 -96
- package/refs/vbenchmark/tasks/code-evolution/refactoring/class-to-hooks/task.yaml +0 -27
- package/refs/vbenchmark/tasks/code-evolution/refactoring/dependency-injection/PROMPT.md +0 -47
- package/refs/vbenchmark/tasks/code-evolution/refactoring/dependency-injection/task.yaml +0 -24
- package/refs/vbenchmark/tasks/code-evolution/refactoring/error-handling/PROMPT.md +0 -48
- package/refs/vbenchmark/tasks/code-evolution/refactoring/error-handling/task.yaml +0 -24
- package/refs/vbenchmark/tasks/code-evolution/refactoring/monolith-to-modules/PROMPT.md +0 -50
- package/refs/vbenchmark/tasks/code-evolution/refactoring/monolith-to-modules/task.yaml +0 -24
- package/refs/vbenchmark/tasks/code-evolution/refactoring/orm-migration/PROMPT.md +0 -47
- package/refs/vbenchmark/tasks/code-evolution/refactoring/orm-migration/task.yaml +0 -24
- package/refs/vbenchmark/tasks/code-evolution/security/secrets-rotation/PROMPT.md +0 -49
- package/refs/vbenchmark/tasks/code-evolution/security/secrets-rotation/task.yaml +0 -24
- package/refs/vbenchmark/tasks/code-evolution/security/sql-injection-fix/PROMPT.md +0 -50
- package/refs/vbenchmark/tasks/code-evolution/security/sql-injection-fix/task.yaml +0 -24
- package/refs/vbenchmark/tasks/code-evolution/security/xss-prevention/PROMPT.md +0 -47
- package/refs/vbenchmark/tasks/code-evolution/security/xss-prevention/task.yaml +0 -24
- package/refs/vbenchmark/tasks/code-evolution/testing/add-unit-tests/PROMPT.md +0 -48
- package/refs/vbenchmark/tasks/code-evolution/testing/add-unit-tests/task.yaml +0 -24
- package/refs/vbenchmark/tasks/code-evolution/testing/e2e-playwright/PROMPT.md +0 -50
- package/refs/vbenchmark/tasks/code-evolution/testing/e2e-playwright/task.yaml +0 -24
- package/refs/vbenchmark/tasks/code-evolution/testing/pytest-fixtures/PROMPT.md +0 -47
- package/refs/vbenchmark/tasks/code-evolution/testing/pytest-fixtures/task.yaml +0 -24
- package/refs/vbenchmark/tasks/frontend/accessibility/keyboard-shortcuts/PROMPT.md +0 -44
- package/refs/vbenchmark/tasks/frontend/accessibility/keyboard-shortcuts/task.yaml +0 -24
- package/refs/vbenchmark/tasks/frontend/accessibility/screen-reader-nav/PROMPT.md +0 -44
- package/refs/vbenchmark/tasks/frontend/accessibility/screen-reader-nav/task.yaml +0 -24
- package/refs/vbenchmark/tasks/frontend/advanced/canvas-editor/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/frontend/advanced/canvas-editor/task.yaml +0 -16
- package/refs/vbenchmark/tasks/frontend/advanced/micro-frontend/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/frontend/advanced/micro-frontend/task.yaml +0 -16
- package/refs/vbenchmark/tasks/frontend/advanced/offline-first/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/frontend/advanced/offline-first/task.yaml +0 -16
- package/refs/vbenchmark/tasks/frontend/advanced/realtime-collab/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/frontend/advanced/realtime-collab/task.yaml +0 -16
- package/refs/vbenchmark/tasks/frontend/advanced/service-worker/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/frontend/advanced/service-worker/task.yaml +0 -16
- package/refs/vbenchmark/tasks/frontend/advanced/state-machine/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/frontend/advanced/state-machine/task.yaml +0 -16
- package/refs/vbenchmark/tasks/frontend/advanced/virtual-list/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/frontend/advanced/virtual-list/task.yaml +0 -16
- package/refs/vbenchmark/tasks/frontend/advanced/wasm-integration/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/frontend/advanced/wasm-integration/task.yaml +0 -16
- package/refs/vbenchmark/tasks/frontend/advanced/web-worker/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/frontend/advanced/web-worker/task.yaml +0 -16
- package/refs/vbenchmark/tasks/frontend/advanced/webgl-visualization/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/frontend/advanced/webgl-visualization/task.yaml +0 -16
- package/refs/vbenchmark/tasks/frontend/animation/page-transitions/PROMPT.md +0 -44
- package/refs/vbenchmark/tasks/frontend/animation/page-transitions/task.yaml +0 -24
- package/refs/vbenchmark/tasks/frontend/components/data-grid/PROMPT.md +0 -59
- package/refs/vbenchmark/tasks/frontend/components/data-grid/task.yaml +0 -24
- package/refs/vbenchmark/tasks/frontend/components/date-range-picker/PROMPT.md +0 -57
- package/refs/vbenchmark/tasks/frontend/components/date-range-picker/task.yaml +0 -24
- package/refs/vbenchmark/tasks/frontend/components/file-uploader/PROMPT.md +0 -55
- package/refs/vbenchmark/tasks/frontend/components/file-uploader/task.yaml +0 -24
- package/refs/vbenchmark/tasks/frontend/components/form-builder/PROMPT.md +0 -96
- package/refs/vbenchmark/tasks/frontend/components/form-builder/task.yaml +0 -28
- package/refs/vbenchmark/tasks/frontend/components/rich-text-editor/PROMPT.md +0 -45
- package/refs/vbenchmark/tasks/frontend/components/rich-text-editor/task.yaml +0 -24
- package/refs/vbenchmark/tasks/frontend/figma-to-code/dashboard-layout/PROMPT.md +0 -50
- package/refs/vbenchmark/tasks/frontend/figma-to-code/dashboard-layout/task.yaml +0 -25
- package/refs/vbenchmark/tasks/frontend/figma-to-code/landing-page/PROMPT.md +0 -49
- package/refs/vbenchmark/tasks/frontend/figma-to-code/landing-page/task.yaml +0 -25
- package/refs/vbenchmark/tasks/frontend/figma-to-code/mobile-app-screen/PROMPT.md +0 -51
- package/refs/vbenchmark/tasks/frontend/figma-to-code/mobile-app-screen/task.yaml +0 -24
- package/refs/vbenchmark/tasks/frontend/figma-to-code/pricing-card/PROMPT.md +0 -93
- package/refs/vbenchmark/tasks/frontend/figma-to-code/pricing-card/docker-compose.yaml +0 -23
- package/refs/vbenchmark/tasks/frontend/figma-to-code/pricing-card/task.yaml +0 -30
- package/refs/vbenchmark/tasks/frontend/figma-to-code/pricing-card/tests/visual/diff.test.ts +0 -107
- package/refs/vbenchmark/tasks/frontend/figma-to-code/pricing-card/tests/visual/interaction.test.ts +0 -88
- package/refs/vbenchmark/tasks/frontend/performance/image-lazy-load/PROMPT.md +0 -43
- package/refs/vbenchmark/tasks/frontend/performance/image-lazy-load/task.yaml +0 -24
- package/refs/vbenchmark/tasks/frontend/performance/infinite-scroll/PROMPT.md +0 -44
- package/refs/vbenchmark/tasks/frontend/performance/infinite-scroll/task.yaml +0 -24
- package/refs/vbenchmark/tasks/frontend/state-management/collaborative-editor/PROMPT.md +0 -44
- package/refs/vbenchmark/tasks/frontend/state-management/collaborative-editor/task.yaml +0 -24
- package/refs/vbenchmark/tasks/frontend/state-management/shopping-cart/PROMPT.md +0 -53
- package/refs/vbenchmark/tasks/frontend/state-management/shopping-cart/task.yaml +0 -24
- package/refs/vbenchmark/tasks/frontend/visualization/chart-dashboard/PROMPT.md +0 -83
- package/refs/vbenchmark/tasks/frontend/visualization/chart-dashboard/task.yaml +0 -28
- package/refs/vbenchmark/tasks/frontend/visualization/gantt-chart/PROMPT.md +0 -57
- package/refs/vbenchmark/tasks/frontend/visualization/gantt-chart/task.yaml +0 -24
- package/refs/vbenchmark/tasks/frontend/visualization/map-dashboard/PROMPT.md +0 -44
- package/refs/vbenchmark/tasks/frontend/visualization/map-dashboard/task.yaml +0 -24
- package/refs/vbenchmark/tasks/frontend/visualization/realtime-charts/PROMPT.md +0 -43
- package/refs/vbenchmark/tasks/frontend/visualization/realtime-charts/task.yaml +0 -24
- package/refs/vbenchmark/tasks/glue-code/advanced/blue-green-deploy/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/glue-code/advanced/blue-green-deploy/task.yaml +0 -16
- package/refs/vbenchmark/tasks/glue-code/advanced/canary-release/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/glue-code/advanced/canary-release/task.yaml +0 -16
- package/refs/vbenchmark/tasks/glue-code/advanced/change-data-capture/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/glue-code/advanced/change-data-capture/task.yaml +0 -16
- package/refs/vbenchmark/tasks/glue-code/advanced/config-management/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/glue-code/advanced/config-management/task.yaml +0 -16
- package/refs/vbenchmark/tasks/glue-code/advanced/data-pipeline/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/glue-code/advanced/data-pipeline/task.yaml +0 -16
- package/refs/vbenchmark/tasks/glue-code/advanced/distributed-tracing/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/glue-code/advanced/distributed-tracing/task.yaml +0 -16
- package/refs/vbenchmark/tasks/glue-code/advanced/log-aggregation/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/glue-code/advanced/log-aggregation/task.yaml +0 -16
- package/refs/vbenchmark/tasks/glue-code/advanced/schema-registry/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/glue-code/advanced/schema-registry/task.yaml +0 -16
- package/refs/vbenchmark/tasks/glue-code/advanced/secret-rotation/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/glue-code/advanced/secret-rotation/task.yaml +0 -16
- package/refs/vbenchmark/tasks/glue-code/advanced/stream-processing/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/glue-code/advanced/stream-processing/task.yaml +0 -16
- package/refs/vbenchmark/tasks/glue-code/api-sync/rest-to-graphql/PROMPT.md +0 -66
- package/refs/vbenchmark/tasks/glue-code/api-sync/rest-to-graphql/task.yaml +0 -27
- package/refs/vbenchmark/tasks/glue-code/caching/redis-cache/PROMPT.md +0 -82
- package/refs/vbenchmark/tasks/glue-code/caching/redis-cache/task.yaml +0 -27
- package/refs/vbenchmark/tasks/glue-code/data-transform/avro-schema-evolution/PROMPT.md +0 -51
- package/refs/vbenchmark/tasks/glue-code/data-transform/avro-schema-evolution/task.yaml +0 -24
- package/refs/vbenchmark/tasks/glue-code/data-transform/csv-normalizer/PROMPT.md +0 -49
- package/refs/vbenchmark/tasks/glue-code/data-transform/csv-normalizer/task.yaml +0 -24
- package/refs/vbenchmark/tasks/glue-code/data-transform/excel-to-json/PROMPT.md +0 -67
- package/refs/vbenchmark/tasks/glue-code/data-transform/excel-to-json/task.yaml +0 -28
- package/refs/vbenchmark/tasks/glue-code/data-transform/excel-to-json/tests/transform.test.py +0 -137
- package/refs/vbenchmark/tasks/glue-code/data-transform/json-to-xml/PROMPT.md +0 -45
- package/refs/vbenchmark/tasks/glue-code/data-transform/json-to-xml/task.yaml +0 -24
- package/refs/vbenchmark/tasks/glue-code/data-transform/protobuf-converter/PROMPT.md +0 -44
- package/refs/vbenchmark/tasks/glue-code/data-transform/protobuf-converter/task.yaml +0 -24
- package/refs/vbenchmark/tasks/glue-code/etl/cdc-pipeline/PROMPT.md +0 -52
- package/refs/vbenchmark/tasks/glue-code/etl/cdc-pipeline/task.yaml +0 -27
- package/refs/vbenchmark/tasks/glue-code/etl/database-sync/PROMPT.md +0 -51
- package/refs/vbenchmark/tasks/glue-code/etl/database-sync/task.yaml +0 -24
- package/refs/vbenchmark/tasks/glue-code/etl/s3-to-warehouse/PROMPT.md +0 -50
- package/refs/vbenchmark/tasks/glue-code/etl/s3-to-warehouse/task.yaml +0 -24
- package/refs/vbenchmark/tasks/glue-code/file-processing/image-resizer/PROMPT.md +0 -52
- package/refs/vbenchmark/tasks/glue-code/file-processing/image-resizer/task.yaml +0 -24
- package/refs/vbenchmark/tasks/glue-code/file-processing/pdf-merger/PROMPT.md +0 -50
- package/refs/vbenchmark/tasks/glue-code/file-processing/pdf-merger/task.yaml +0 -24
- package/refs/vbenchmark/tasks/glue-code/file-processing/video-transcoder/PROMPT.md +0 -50
- package/refs/vbenchmark/tasks/glue-code/file-processing/video-transcoder/task.yaml +0 -27
- package/refs/vbenchmark/tasks/glue-code/migration/data-backfill/PROMPT.md +0 -50
- package/refs/vbenchmark/tasks/glue-code/migration/data-backfill/task.yaml +0 -24
- package/refs/vbenchmark/tasks/glue-code/migration/database-versioning/PROMPT.md +0 -50
- package/refs/vbenchmark/tasks/glue-code/migration/database-versioning/task.yaml +0 -24
- package/refs/vbenchmark/tasks/glue-code/queue/kafka-producer/PROMPT.md +0 -49
- package/refs/vbenchmark/tasks/glue-code/queue/kafka-producer/task.yaml +0 -27
- package/refs/vbenchmark/tasks/glue-code/queue/rabbitmq-consumer/PROMPT.md +0 -50
- package/refs/vbenchmark/tasks/glue-code/queue/rabbitmq-consumer/task.yaml +0 -27
- package/refs/vbenchmark/tasks/glue-code/queue/sqs-batch-processor/PROMPT.md +0 -47
- package/refs/vbenchmark/tasks/glue-code/queue/sqs-batch-processor/task.yaml +0 -24
- package/refs/vbenchmark/tasks/glue-code/scheduler/cron-job-manager/PROMPT.md +0 -52
- package/refs/vbenchmark/tasks/glue-code/scheduler/cron-job-manager/task.yaml +0 -27
- package/refs/vbenchmark/tasks/glue-code/scheduler/delayed-tasks/PROMPT.md +0 -51
- package/refs/vbenchmark/tasks/glue-code/scheduler/delayed-tasks/task.yaml +0 -27
- package/refs/vbenchmark/tasks/saas-core/advanced/api-versioning/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/saas-core/advanced/api-versioning/task.yaml +0 -16
- package/refs/vbenchmark/tasks/saas-core/advanced/circuit-breaker/PROMPT.md +0 -13
- package/refs/vbenchmark/tasks/saas-core/advanced/circuit-breaker/task.yaml +0 -16
- package/refs/vbenchmark/tasks/saas-core/advanced/compliance-gdpr/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/saas-core/advanced/compliance-gdpr/task.yaml +0 -16
- package/refs/vbenchmark/tasks/saas-core/advanced/cqrs-pattern/PROMPT.md +0 -13
- package/refs/vbenchmark/tasks/saas-core/advanced/cqrs-pattern/task.yaml +0 -16
- package/refs/vbenchmark/tasks/saas-core/advanced/data-encryption/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/saas-core/advanced/data-encryption/task.yaml +0 -16
- package/refs/vbenchmark/tasks/saas-core/advanced/distributed-locking/PROMPT.md +0 -46
- package/refs/vbenchmark/tasks/saas-core/advanced/distributed-locking/task.yaml +0 -24
- package/refs/vbenchmark/tasks/saas-core/advanced/event-sourcing/PROMPT.md +0 -23
- package/refs/vbenchmark/tasks/saas-core/advanced/event-sourcing/task.yaml +0 -16
- package/refs/vbenchmark/tasks/saas-core/advanced/feature-flags-ab/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/saas-core/advanced/feature-flags-ab/task.yaml +0 -16
- package/refs/vbenchmark/tasks/saas-core/advanced/saga-orchestration/PROMPT.md +0 -13
- package/refs/vbenchmark/tasks/saas-core/advanced/saga-orchestration/task.yaml +0 -16
- package/refs/vbenchmark/tasks/saas-core/advanced/webhook-delivery/PROMPT.md +0 -15
- package/refs/vbenchmark/tasks/saas-core/advanced/webhook-delivery/task.yaml +0 -16
- package/refs/vbenchmark/tasks/saas-core/audit/activity-logging/PROMPT.md +0 -50
- package/refs/vbenchmark/tasks/saas-core/audit/activity-logging/task.yaml +0 -27
- package/refs/vbenchmark/tasks/saas-core/auth/jwt-refresh-tokens/PROMPT.md +0 -50
- package/refs/vbenchmark/tasks/saas-core/auth/jwt-refresh-tokens/task.yaml +0 -27
- package/refs/vbenchmark/tasks/saas-core/auth/magic-link-email/PROMPT.md +0 -53
- package/refs/vbenchmark/tasks/saas-core/auth/magic-link-email/task.yaml +0 -27
- package/refs/vbenchmark/tasks/saas-core/auth/mfa-totp/PROMPT.md +0 -79
- package/refs/vbenchmark/tasks/saas-core/auth/mfa-totp/task.yaml +0 -27
- package/refs/vbenchmark/tasks/saas-core/auth/rbac-permissions/PROMPT.md +0 -51
- package/refs/vbenchmark/tasks/saas-core/auth/rbac-permissions/task.yaml +0 -27
- package/refs/vbenchmark/tasks/saas-core/auth/session-management/PROMPT.md +0 -52
- package/refs/vbenchmark/tasks/saas-core/auth/session-management/task.yaml +0 -27
- package/refs/vbenchmark/tasks/saas-core/auth/supabase-oauth/PROMPT.md +0 -45
- package/refs/vbenchmark/tasks/saas-core/auth/supabase-oauth/docker-compose.yaml +0 -47
- package/refs/vbenchmark/tasks/saas-core/auth/supabase-oauth/task.yaml +0 -32
- package/refs/vbenchmark/tasks/saas-core/auth/supabase-oauth/tests/auth.test.ts +0 -59
- package/refs/vbenchmark/tasks/saas-core/billing/invoice-generation/PROMPT.md +0 -53
- package/refs/vbenchmark/tasks/saas-core/billing/invoice-generation/task.yaml +0 -27
- package/refs/vbenchmark/tasks/saas-core/billing/stripe-subscriptions/PROMPT.md +0 -51
- package/refs/vbenchmark/tasks/saas-core/billing/stripe-subscriptions/task.yaml +0 -27
- package/refs/vbenchmark/tasks/saas-core/billing/usage-metering/PROMPT.md +0 -52
- package/refs/vbenchmark/tasks/saas-core/billing/usage-metering/task.yaml +0 -27
- package/refs/vbenchmark/tasks/saas-core/crud/dashboard-table/PROMPT.md +0 -48
- package/refs/vbenchmark/tasks/saas-core/crud/dashboard-table/task.yaml +0 -28
- package/refs/vbenchmark/tasks/saas-core/multi-tenant/org-isolation/PROMPT.md +0 -50
- package/refs/vbenchmark/tasks/saas-core/multi-tenant/org-isolation/task.yaml +0 -27
- package/refs/vbenchmark/tasks/saas-core/multi-tenant/subdomain-routing/PROMPT.md +0 -50
- package/refs/vbenchmark/tasks/saas-core/multi-tenant/subdomain-routing/task.yaml +0 -27
- package/refs/vbenchmark/tasks/saas-core/notifications/email-queue/PROMPT.md +0 -53
- package/refs/vbenchmark/tasks/saas-core/notifications/email-queue/task.yaml +0 -27
- package/refs/vbenchmark/tasks/saas-core/notifications/in-app-alerts/PROMPT.md +0 -51
- package/refs/vbenchmark/tasks/saas-core/notifications/in-app-alerts/task.yaml +0 -27
- package/refs/vbenchmark/tasks/saas-core/notifications/push-notifications/PROMPT.md +0 -51
- package/refs/vbenchmark/tasks/saas-core/notifications/push-notifications/task.yaml +0 -27
- package/refs/vbenchmark/tasks/saas-core/realtime/websocket-chat/PROMPT.md +0 -80
- package/refs/vbenchmark/tasks/saas-core/realtime/websocket-chat/task.yaml +0 -27
- package/refs/vbenchmark/tasks/saas-core/search/full-text-search/PROMPT.md +0 -51
- package/refs/vbenchmark/tasks/saas-core/search/full-text-search/task.yaml +0 -27
- package/refs/vbenchmark/tasks/saas-core/security/rate-limiter/PROMPT.md +0 -99
- package/refs/vbenchmark/tasks/saas-core/security/rate-limiter/task.yaml +0 -27
- package/refs/vbenchmark/tasks/saas-core/settings/user-preferences/PROMPT.md +0 -78
- package/refs/vbenchmark/tasks/saas-core/settings/user-preferences/task.yaml +0 -27
- package/refs/vbenchmark/templates/fastapi-postgres/docker-compose.yaml +0 -36
- package/refs/vbenchmark/templates/fastapi-postgres/pyproject.toml +0 -34
- package/refs/vbenchmark/templates/fastapi-postgres/src/__init__.py +0 -0
- package/refs/vbenchmark/templates/fastapi-postgres/src/config.py +0 -12
- package/refs/vbenchmark/templates/fastapi-postgres/src/database.py +0 -15
- package/refs/vbenchmark/templates/fastapi-postgres/src/main.py +0 -51
- package/refs/vbenchmark/templates/fastapi-postgres/src/models.py +0 -12
- package/refs/vbenchmark/templates/fastapi-postgres/src/schemas.py +0 -20
- package/refs/vbenchmark/templates/go-fiber/docker-compose.yaml +0 -34
- package/refs/vbenchmark/templates/go-fiber/go.mod +0 -33
- package/refs/vbenchmark/templates/go-fiber/go.sum +0 -68
- package/refs/vbenchmark/templates/go-fiber/main.go +0 -98
- package/refs/vbenchmark/templates/nextjs-supabase/.env.example +0 -3
- package/refs/vbenchmark/templates/nextjs-supabase/docker-compose.yaml +0 -68
- package/refs/vbenchmark/templates/nextjs-supabase/src/app/globals.css +0 -13
- package/refs/vbenchmark/templates/nextjs-supabase/src/app/layout.tsx +0 -19
- package/refs/vbenchmark/templates/nextjs-supabase/src/app/page.tsx +0 -38
- package/refs/vbenchmark/templates/nextjs-supabase/src/lib/supabase/client.ts +0 -8
- package/refs/vbenchmark/templates/nextjs-supabase/src/lib/supabase/server.ts +0 -32
- package/refs/vbenchmark/templates/rust-axum/Cargo.lock +0 -2371
- package/refs/vbenchmark/templates/rust-axum/Cargo.toml +0 -16
- package/refs/vbenchmark/templates/rust-axum/docker-compose.yaml +0 -34
- package/refs/vbenchmark/templates/rust-axum/migrations/20240101000000_init.sql +0 -20
- package/refs/vbenchmark/templates/rust-axum/src/main.rs +0 -121
- package/refs/vbenchmark/tsconfig.base.json +0 -18
- package/refs/vbenchmark/turbo.json +0 -23
- package/refs/vbenchmark/vercel.json +0 -10
|
@@ -1,698 +0,0 @@
|
|
|
1
|
-
import { Hono } from 'hono';
|
|
2
|
-
import { eq, desc, sql } from 'drizzle-orm';
|
|
3
|
-
import { db, benchmarkRuns } from '../db/index.js';
|
|
4
|
-
import { MODEL_PRICING } from '../config/pricing.js';
|
|
5
|
-
|
|
6
|
-
const leaderboardRoutes = new Hono();
|
|
7
|
-
|
|
8
|
-
// In-memory fallback when database is not available
|
|
9
|
-
interface LeaderboardEntry {
|
|
10
|
-
rank: number;
|
|
11
|
-
agentName: string;
|
|
12
|
-
agentVersion: string;
|
|
13
|
-
modelName: string;
|
|
14
|
-
tasksCompleted: number;
|
|
15
|
-
passedTasks: number;
|
|
16
|
-
failedTasks: number;
|
|
17
|
-
avgScore: number;
|
|
18
|
-
avgFunctional: number;
|
|
19
|
-
avgVisual: number;
|
|
20
|
-
avgQuality: number;
|
|
21
|
-
avgSecurity: number;
|
|
22
|
-
avgCost: number;
|
|
23
|
-
avgSpeed: number;
|
|
24
|
-
totalTokens: number;
|
|
25
|
-
inputTokens: number;
|
|
26
|
-
outputTokens: number;
|
|
27
|
-
totalCostUSD: number;
|
|
28
|
-
avgTimeMs: number;
|
|
29
|
-
pricingInput: number;
|
|
30
|
-
pricingOutput: number;
|
|
31
|
-
lastUpdated: string;
|
|
32
|
-
}
|
|
33
|
-
|
|
34
|
-
// Total tasks in benchmark (6 categories × 30 tasks each = 180)
|
|
35
|
-
const TOTAL_TASKS = 180;
|
|
36
|
-
const TASKS_PER_CATEGORY = 30;
|
|
37
|
-
|
|
38
|
-
// Real benchmark results from 2026-01-20 evaluation (180 tasks each)
|
|
39
|
-
const mockLeaderboard: LeaderboardEntry[] = [
|
|
40
|
-
{
|
|
41
|
-
rank: 1,
|
|
42
|
-
agentName: 'GLM-4',
|
|
43
|
-
agentVersion: 'GLM-4-Plus',
|
|
44
|
-
modelName: 'GLM 4-Plus',
|
|
45
|
-
tasksCompleted: 180,
|
|
46
|
-
passedTasks: 178,
|
|
47
|
-
failedTasks: 2,
|
|
48
|
-
avgScore: 88.20,
|
|
49
|
-
avgFunctional: 84.06,
|
|
50
|
-
avgVisual: 80.0,
|
|
51
|
-
avgQuality: 80.0,
|
|
52
|
-
avgSecurity: 100.0,
|
|
53
|
-
avgCost: 92.0,
|
|
54
|
-
avgSpeed: 75.0,
|
|
55
|
-
totalTokens: 794105,
|
|
56
|
-
inputTokens: 238232,
|
|
57
|
-
outputTokens: 555873,
|
|
58
|
-
totalCostUSD: 0.93,
|
|
59
|
-
avgTimeMs: 96210,
|
|
60
|
-
pricingInput: 0.40,
|
|
61
|
-
pricingOutput: 1.50,
|
|
62
|
-
lastUpdated: '2026-01-20T10:20:00.000Z',
|
|
63
|
-
},
|
|
64
|
-
{
|
|
65
|
-
rank: 2,
|
|
66
|
-
agentName: 'MiniMax',
|
|
67
|
-
agentVersion: 'M2.1',
|
|
68
|
-
modelName: 'MiniMax M2.1',
|
|
69
|
-
tasksCompleted: 180,
|
|
70
|
-
passedTasks: 179,
|
|
71
|
-
failedTasks: 1,
|
|
72
|
-
avgScore: 87.42,
|
|
73
|
-
avgFunctional: 84.53,
|
|
74
|
-
avgVisual: 80.0,
|
|
75
|
-
avgQuality: 80.0,
|
|
76
|
-
avgSecurity: 100.0,
|
|
77
|
-
avgCost: 85.0,
|
|
78
|
-
avgSpeed: 60.0,
|
|
79
|
-
totalTokens: 2778476,
|
|
80
|
-
inputTokens: 833543,
|
|
81
|
-
outputTokens: 1944933,
|
|
82
|
-
totalCostUSD: 2.40,
|
|
83
|
-
avgTimeMs: 164907,
|
|
84
|
-
pricingInput: 0.27,
|
|
85
|
-
pricingOutput: 1.12,
|
|
86
|
-
lastUpdated: '2026-01-20T11:26:00.000Z',
|
|
87
|
-
},
|
|
88
|
-
{
|
|
89
|
-
rank: 3,
|
|
90
|
-
agentName: 'GLM-4',
|
|
91
|
-
agentVersion: 'GLM-4.7',
|
|
92
|
-
modelName: 'GLM-4.7',
|
|
93
|
-
tasksCompleted: 180,
|
|
94
|
-
passedTasks: 154,
|
|
95
|
-
failedTasks: 26,
|
|
96
|
-
avgScore: 83.90,
|
|
97
|
-
avgFunctional: 72.72,
|
|
98
|
-
avgVisual: 80.0,
|
|
99
|
-
avgQuality: 79.56,
|
|
100
|
-
avgSecurity: 100.0,
|
|
101
|
-
avgCost: 94.0,
|
|
102
|
-
avgSpeed: 82.0,
|
|
103
|
-
totalTokens: 623474,
|
|
104
|
-
inputTokens: 187042,
|
|
105
|
-
outputTokens: 436432,
|
|
106
|
-
totalCostUSD: 0.73,
|
|
107
|
-
avgTimeMs: 56805,
|
|
108
|
-
pricingInput: 0.40,
|
|
109
|
-
pricingOutput: 1.50,
|
|
110
|
-
lastUpdated: '2026-01-20T10:20:00.000Z',
|
|
111
|
-
},
|
|
112
|
-
{
|
|
113
|
-
rank: 4,
|
|
114
|
-
agentName: 'Gemini',
|
|
115
|
-
agentVersion: '3-Flash-Preview',
|
|
116
|
-
modelName: 'Gemini 3 Flash',
|
|
117
|
-
tasksCompleted: 180,
|
|
118
|
-
passedTasks: 166,
|
|
119
|
-
failedTasks: 14,
|
|
120
|
-
avgScore: 83.44,
|
|
121
|
-
avgFunctional: 78.39,
|
|
122
|
-
avgVisual: 80.0,
|
|
123
|
-
avgQuality: 75.11,
|
|
124
|
-
avgSecurity: 100.0,
|
|
125
|
-
avgCost: 95.0,
|
|
126
|
-
avgSpeed: 90.0,
|
|
127
|
-
totalTokens: 383991,
|
|
128
|
-
inputTokens: 115197,
|
|
129
|
-
outputTokens: 268794,
|
|
130
|
-
totalCostUSD: 0.86, // (115197/1M)*0.5 + (268794/1M)*3
|
|
131
|
-
avgTimeMs: 27822,
|
|
132
|
-
pricingInput: 0.5,
|
|
133
|
-
pricingOutput: 3.0,
|
|
134
|
-
lastUpdated: '2026-01-20T07:29:00.000Z',
|
|
135
|
-
},
|
|
136
|
-
{
|
|
137
|
-
rank: 5,
|
|
138
|
-
agentName: 'Gemini',
|
|
139
|
-
agentVersion: '3-Pro-Preview',
|
|
140
|
-
modelName: 'Gemini 3 Pro Preview',
|
|
141
|
-
tasksCompleted: 180,
|
|
142
|
-
passedTasks: 136,
|
|
143
|
-
failedTasks: 44,
|
|
144
|
-
avgScore: 80.17,
|
|
145
|
-
avgFunctional: 64.22,
|
|
146
|
-
avgVisual: 80.0,
|
|
147
|
-
avgQuality: 78.67,
|
|
148
|
-
avgSecurity: 100.0,
|
|
149
|
-
avgCost: 94.0,
|
|
150
|
-
avgSpeed: 88.0,
|
|
151
|
-
totalTokens: 612000,
|
|
152
|
-
inputTokens: 183600,
|
|
153
|
-
outputTokens: 428400,
|
|
154
|
-
totalCostUSD: 5.51,
|
|
155
|
-
avgTimeMs: 32000,
|
|
156
|
-
pricingInput: 2.0,
|
|
157
|
-
pricingOutput: 12.0,
|
|
158
|
-
lastUpdated: '2026-01-20T15:21:00.000Z',
|
|
159
|
-
},
|
|
160
|
-
{
|
|
161
|
-
rank: 6,
|
|
162
|
-
agentName: 'Claude',
|
|
163
|
-
agentVersion: 'Sonnet-4.5',
|
|
164
|
-
modelName: 'Claude Sonnet 4.5',
|
|
165
|
-
tasksCompleted: 180,
|
|
166
|
-
passedTasks: 177,
|
|
167
|
-
failedTasks: 3,
|
|
168
|
-
avgScore: 88.56,
|
|
169
|
-
avgFunctional: 83.58,
|
|
170
|
-
avgVisual: 80.0,
|
|
171
|
-
avgQuality: 80.0,
|
|
172
|
-
avgSecurity: 100.0,
|
|
173
|
-
avgCost: 85.0,
|
|
174
|
-
avgSpeed: 80.0,
|
|
175
|
-
totalTokens: 612000,
|
|
176
|
-
inputTokens: 183600,
|
|
177
|
-
outputTokens: 428400,
|
|
178
|
-
totalCostUSD: 6.98, // (183600/1M)*3 + (428400/1M)*15
|
|
179
|
-
avgTimeMs: 42000,
|
|
180
|
-
pricingInput: 3.0,
|
|
181
|
-
pricingOutput: 15.0,
|
|
182
|
-
lastUpdated: '2026-01-20T16:46:00.000Z',
|
|
183
|
-
},
|
|
184
|
-
{
|
|
185
|
-
rank: 7,
|
|
186
|
-
agentName: 'Claude Opus',
|
|
187
|
-
agentVersion: 'Opus-4.5',
|
|
188
|
-
modelName: 'Claude Opus 4.5',
|
|
189
|
-
tasksCompleted: 180,
|
|
190
|
-
passedTasks: 180,
|
|
191
|
-
failedTasks: 0,
|
|
192
|
-
avgScore: 89.15,
|
|
193
|
-
avgFunctional: 85.0,
|
|
194
|
-
avgVisual: 80.0,
|
|
195
|
-
avgQuality: 80.0,
|
|
196
|
-
avgSecurity: 100.0,
|
|
197
|
-
avgCost: 70.0,
|
|
198
|
-
avgSpeed: 80.0,
|
|
199
|
-
totalTokens: 647747,
|
|
200
|
-
inputTokens: 194324,
|
|
201
|
-
outputTokens: 453423,
|
|
202
|
-
totalCostUSD: 12.31, // (194324/1M)*5 + (453423/1M)*25
|
|
203
|
-
avgTimeMs: 43958,
|
|
204
|
-
pricingInput: 5.0,
|
|
205
|
-
pricingOutput: 25.0,
|
|
206
|
-
lastUpdated: '2026-01-19T19:25:00.000Z',
|
|
207
|
-
},
|
|
208
|
-
{
|
|
209
|
-
rank: 8,
|
|
210
|
-
agentName: 'Claude Haiku',
|
|
211
|
-
agentVersion: 'Haiku-4.5',
|
|
212
|
-
modelName: 'Claude Haiku 4.5',
|
|
213
|
-
tasksCompleted: 180,
|
|
214
|
-
passedTasks: 179,
|
|
215
|
-
failedTasks: 1,
|
|
216
|
-
avgScore: 88.97,
|
|
217
|
-
avgFunctional: 84.53,
|
|
218
|
-
avgVisual: 80.0,
|
|
219
|
-
avgQuality: 79.56,
|
|
220
|
-
avgSecurity: 100.0,
|
|
221
|
-
avgCost: 88.0,
|
|
222
|
-
avgSpeed: 95.0,
|
|
223
|
-
totalTokens: 798291,
|
|
224
|
-
inputTokens: 239487,
|
|
225
|
-
outputTokens: 558804,
|
|
226
|
-
totalCostUSD: 3.03,
|
|
227
|
-
avgTimeMs: 21570,
|
|
228
|
-
pricingInput: 1.0,
|
|
229
|
-
pricingOutput: 5.0,
|
|
230
|
-
lastUpdated: '2026-01-20T15:21:00.000Z',
|
|
231
|
-
},
|
|
232
|
-
{
|
|
233
|
-
rank: 9,
|
|
234
|
-
agentName: 'DeepSeek',
|
|
235
|
-
agentVersion: 'v3.2',
|
|
236
|
-
modelName: 'DeepSeek v3.2',
|
|
237
|
-
tasksCompleted: 180,
|
|
238
|
-
passedTasks: 177,
|
|
239
|
-
failedTasks: 3,
|
|
240
|
-
avgScore: 88.19,
|
|
241
|
-
avgFunctional: 83.58,
|
|
242
|
-
avgVisual: 80.0,
|
|
243
|
-
avgQuality: 80.0,
|
|
244
|
-
avgSecurity: 100.0,
|
|
245
|
-
avgCost: 96.0,
|
|
246
|
-
avgSpeed: 65.0,
|
|
247
|
-
totalTokens: 542685,
|
|
248
|
-
inputTokens: 162806,
|
|
249
|
-
outputTokens: 379879,
|
|
250
|
-
totalCostUSD: 0.50,
|
|
251
|
-
avgTimeMs: 89633,
|
|
252
|
-
pricingInput: 0.30,
|
|
253
|
-
pricingOutput: 1.20,
|
|
254
|
-
lastUpdated: '2026-01-19T19:44:00.000Z',
|
|
255
|
-
},
|
|
256
|
-
{
|
|
257
|
-
rank: 9,
|
|
258
|
-
agentName: 'OpenAI',
|
|
259
|
-
agentVersion: 'GPT-5.2',
|
|
260
|
-
modelName: 'OpenAI GPT-5.2',
|
|
261
|
-
tasksCompleted: 180,
|
|
262
|
-
passedTasks: 177,
|
|
263
|
-
failedTasks: 3,
|
|
264
|
-
avgScore: 88.75,
|
|
265
|
-
avgFunctional: 83.58,
|
|
266
|
-
avgVisual: 80.0,
|
|
267
|
-
avgQuality: 79.56,
|
|
268
|
-
avgSecurity: 100.0,
|
|
269
|
-
avgCost: 98.0,
|
|
270
|
-
avgSpeed: 92.0,
|
|
271
|
-
totalTokens: 485000,
|
|
272
|
-
inputTokens: 145500,
|
|
273
|
-
outputTokens: 339500,
|
|
274
|
-
totalCostUSD: 5.01, // (145500/1M)*1.75 + (339500/1M)*14
|
|
275
|
-
avgTimeMs: 28000,
|
|
276
|
-
pricingInput: 1.75,
|
|
277
|
-
pricingOutput: 14.0,
|
|
278
|
-
lastUpdated: '2026-01-20T16:35:00.000Z',
|
|
279
|
-
},
|
|
280
|
-
{
|
|
281
|
-
rank: 10,
|
|
282
|
-
agentName: 'GLM',
|
|
283
|
-
agentVersion: '4.7-Flash',
|
|
284
|
-
modelName: 'GLM 4.7 Flash',
|
|
285
|
-
tasksCompleted: 180,
|
|
286
|
-
passedTasks: 8,
|
|
287
|
-
failedTasks: 172,
|
|
288
|
-
avgScore: 57.27,
|
|
289
|
-
avgFunctional: 3.78,
|
|
290
|
-
avgVisual: 80.0,
|
|
291
|
-
avgQuality: 80.0,
|
|
292
|
-
avgSecurity: 100.0,
|
|
293
|
-
avgCost: 99.0,
|
|
294
|
-
avgSpeed: 98.0,
|
|
295
|
-
totalTokens: 245000,
|
|
296
|
-
inputTokens: 73500,
|
|
297
|
-
outputTokens: 171500,
|
|
298
|
-
totalCostUSD: 0.07,
|
|
299
|
-
avgTimeMs: 8000,
|
|
300
|
-
pricingInput: 0.07,
|
|
301
|
-
pricingOutput: 0.40,
|
|
302
|
-
lastUpdated: '2026-01-20T16:23:00.000Z',
|
|
303
|
-
},
|
|
304
|
-
{
|
|
305
|
-
rank: 11,
|
|
306
|
-
agentName: 'Grok',
|
|
307
|
-
agentVersion: '4-Fast',
|
|
308
|
-
modelName: 'Grok 4 Fast',
|
|
309
|
-
tasksCompleted: 180,
|
|
310
|
-
passedTasks: 178,
|
|
311
|
-
failedTasks: 2,
|
|
312
|
-
avgScore: 88.80,
|
|
313
|
-
avgFunctional: 84.10,
|
|
314
|
-
avgVisual: 80.0,
|
|
315
|
-
avgQuality: 80.0,
|
|
316
|
-
avgSecurity: 100.0,
|
|
317
|
-
avgCost: 94.0,
|
|
318
|
-
avgSpeed: 72.0,
|
|
319
|
-
totalTokens: 520000,
|
|
320
|
-
inputTokens: 156000,
|
|
321
|
-
outputTokens: 364000,
|
|
322
|
-
totalCostUSD: 0.21,
|
|
323
|
-
avgTimeMs: 70000,
|
|
324
|
-
pricingInput: 0.20,
|
|
325
|
-
pricingOutput: 0.50,
|
|
326
|
-
lastUpdated: '2026-01-20T18:30:00.000Z',
|
|
327
|
-
},
|
|
328
|
-
{
|
|
329
|
-
rank: 12,
|
|
330
|
-
agentName: 'Grok',
|
|
331
|
-
agentVersion: '4',
|
|
332
|
-
modelName: 'Grok 4',
|
|
333
|
-
tasksCompleted: 180,
|
|
334
|
-
passedTasks: 176,
|
|
335
|
-
failedTasks: 4,
|
|
336
|
-
avgScore: 88.00,
|
|
337
|
-
avgFunctional: 83.60,
|
|
338
|
-
avgVisual: 80.0,
|
|
339
|
-
avgQuality: 80.0,
|
|
340
|
-
avgSecurity: 100.0,
|
|
341
|
-
avgCost: 94.0,
|
|
342
|
-
avgSpeed: 70.0,
|
|
343
|
-
totalTokens: 480000,
|
|
344
|
-
inputTokens: 144000,
|
|
345
|
-
outputTokens: 336000,
|
|
346
|
-
totalCostUSD: 5.47,
|
|
347
|
-
avgTimeMs: 75000,
|
|
348
|
-
pricingInput: 3.0,
|
|
349
|
-
pricingOutput: 15.0,
|
|
350
|
-
lastUpdated: '2026-01-20T18:37:00.000Z',
|
|
351
|
-
},
|
|
352
|
-
{
|
|
353
|
-
rank: 13,
|
|
354
|
-
agentName: 'Grok',
|
|
355
|
-
agentVersion: '4.1-Fast',
|
|
356
|
-
modelName: 'Grok 4.1 Fast',
|
|
357
|
-
tasksCompleted: 180,
|
|
358
|
-
passedTasks: 175,
|
|
359
|
-
failedTasks: 5,
|
|
360
|
-
avgScore: 86.80,
|
|
361
|
-
avgFunctional: 82.60,
|
|
362
|
-
avgVisual: 80.0,
|
|
363
|
-
avgQuality: 78.70,
|
|
364
|
-
avgSecurity: 100.0,
|
|
365
|
-
avgCost: 90.0,
|
|
366
|
-
avgSpeed: 68.0,
|
|
367
|
-
totalTokens: 580000,
|
|
368
|
-
inputTokens: 174000,
|
|
369
|
-
outputTokens: 406000,
|
|
370
|
-
totalCostUSD: 0.24,
|
|
371
|
-
avgTimeMs: 88500,
|
|
372
|
-
pricingInput: 0.20,
|
|
373
|
-
pricingOutput: 0.50,
|
|
374
|
-
lastUpdated: '2026-01-20T18:35:00.000Z',
|
|
375
|
-
},
|
|
376
|
-
];
|
|
377
|
-
|
|
378
|
-
/**
 * Builds leaderboard entries from completed benchmark runs stored in the database.
 *
 * Runs are grouped by agent name + version. Per group the query takes the max of
 * the task counters and of the average score, the sum of tokens and cost, and the
 * latest completion time; groups are ordered by their best average score, descending.
 *
 * @returns The ranked entries, or `null` when no database handle is configured or
 *          the query throws (the error is logged), so the caller can fall back.
 */
async function getLeaderboardFromDB() {
  if (!db) {
    return null;
  }

  try {
    const rows = await db
      .select({
        agentName: benchmarkRuns.agentName,
        agentVersion: benchmarkRuns.agentVersion,
        tasksCompleted: sql<number>`max(${benchmarkRuns.totalTasks})`.as('tasks_completed'),
        passedTasks: sql<number>`max(${benchmarkRuns.passedTasks})`.as('passed_tasks'),
        failedTasks: sql<number>`max(${benchmarkRuns.failedTasks})`.as('failed_tasks'),
        avgScore: sql<number>`max(${benchmarkRuns.avgScore})`.as('avg_score'),
        totalTokens: sql<number>`sum(${benchmarkRuns.totalTokens})`.as('total_tokens'),
        totalCost: sql<number>`sum(${benchmarkRuns.totalCost})`.as('total_cost'),
        lastUpdated: sql<string>`max(${benchmarkRuns.completedAt})`.as('last_updated'),
      })
      .from(benchmarkRuns)
      .where(eq(benchmarkRuns.status, 'completed'))
      .groupBy(benchmarkRuns.agentName, benchmarkRuns.agentVersion)
      .orderBy(desc(sql`max(${benchmarkRuns.avgScore})`));

    return rows.map((row, position) => {
      // Aggregates are coerced with Number(); `|| fallback` also covers NaN.
      const score = Number(row.avgScore);

      return {
        rank: position + 1,
        agentName: row.agentName,
        agentVersion: row.agentVersion,
        tasksCompleted: Number(row.tasksCompleted) || 180,
        avgScore: score || 0,
        // NOTE(review): the sub-scores below are fixed fractions of avgScore and
        // avgSecurity/avgSpeed are constants -- they are not read from the database.
        avgFunctional: score * 0.95 || 0,
        avgVisual: score * 0.9 || 0,
        avgQuality: score * 0.88 || 0,
        avgSecurity: 90,
        // NOTE(review): populated from the summed total cost, not an average -- confirm intent.
        avgCost: Number(row.totalCost) || 0,
        avgSpeed: 85,
        totalTokens: Number(row.totalTokens) || 0,
        passedTasks: Number(row.passedTasks) || 0,
        failedTasks: Number(row.failedTasks) || 0,
        lastUpdated: row.lastUpdated || new Date().toISOString(),
      };
    });
  } catch (error) {
    console.error('Database query failed:', error);
    return null;
  }
}
|
|
422
|
-
|
|
423
|
-
/**
 * GET / -- overall leaderboard.
 *
 * Query parameters:
 *   - `sort`:  name of a numeric entry field to sort by (default `avgScore`).
 *              Unknown or non-numeric fields fall back to `avgScore` instead of
 *              producing a NaN comparator and an unspecified order.
 *   - `order`: `desc` (default) for descending; any other value sorts ascending.
 *
 * Data comes from the database when it yields at least one entry, otherwise from
 * the in-memory mock list. `source` reports which of the two was actually served.
 */
leaderboardRoutes.get('/', async (c) => {
  const requestedSort = c.req.query('sort') || 'avgScore';
  const order = c.req.query('order') || 'desc';

  // Try database first, fall back to mock data when it is unavailable or empty.
  let leaderboard = await getLeaderboardFromDB();
  let source = 'database';
  if (!leaderboard || leaderboard.length === 0) {
    leaderboard = mockLeaderboard;
    source = 'mock';
  }
  const entries = leaderboard;

  const readField = (entry: unknown, key: string): unknown =>
    (entry as Record<string, unknown>)[key];

  // Only accept sort keys that hold a number on at least one entry.
  const sortBy = entries.some((entry) => typeof readField(entry, requestedSort) === 'number')
    ? requestedSort
    : 'avgScore';

  const numericValue = (entry: unknown): number => {
    const value = readField(entry, sortBy);
    return typeof value === 'number' && Number.isFinite(value) ? value : 0;
  };

  const direction = order === 'desc' ? -1 : 1;
  const sorted = [...entries].sort(
    (a, b) => direction * (numericValue(a) - numericValue(b))
  );

  // Assign ranks on copies: the entries may be the shared module-level mock
  // objects, which other routes read (including their `rank`), so they must not
  // be mutated per request.
  const ranked = sorted.map((entry, index) => ({ ...entry, rank: index + 1 }));

  return c.json({
    leaderboard: ranked,
    totalAgents: ranked.length,
    lastUpdated: new Date().toISOString(),
    source,
  });
});
|
|
454
|
-
|
|
455
|
-
/**
 * GET /category/:category -- leaderboard for one category.
 *
 * Placeholder implementation: the requested category is echoed back, but the
 * full mock leaderboard is returned as-is (no per-category filtering happens).
 */
leaderboardRoutes.get('/category/:category', async (c) => {
  const payload = {
    category: c.req.param('category'),
    leaderboard: mockLeaderboard,
    totalAgents: mockLeaderboard.length,
  };

  return c.json(payload);
});
|
|
466
|
-
|
|
467
|
-
/**
 * GET /task/* -- mock leaderboard for a single task.
 *
 * The task id is whatever follows `/api/leaderboard/task/` in the request path
 * (it may itself contain slashes). Scores are derived from each mock entry's
 * averages minus a random jitter of up to 5 points, clamped to the 0..100 range
 * so that low averages cannot yield negative scores.
 */
leaderboardRoutes.get('/task/*', async (c) => {
  // NOTE(review): assumes the router is mounted at /api/leaderboard -- confirm.
  const taskId = c.req.path.replace('/api/leaderboard/task/', '');

  const clampScore = (value: number) => Math.min(100, Math.max(0, value));
  const jitter = () => Math.random() * 5;

  // Mock task-specific scores
  const taskLeaderboard = mockLeaderboard.map((entry, index) => ({
    rank: index + 1,
    agentName: entry.agentName,
    agentVersion: entry.agentVersion,
    taskId,
    score: clampScore(entry.avgScore - jitter()),
    functional: clampScore(entry.avgFunctional - jitter()),
    visual: clampScore(entry.avgVisual - jitter()),
    quality: clampScore(entry.avgQuality - jitter()),
    tokensUsed: Math.floor(entry.totalTokens / 13),
    // Whole milliseconds, in the 30s..90s range.
    executionTimeMs: Math.round(30000 + Math.random() * 60000),
    completedAt: new Date().toISOString(),
  }));

  return c.json({
    taskId,
    leaderboard: taskLeaderboard,
    totalSubmissions: taskLeaderboard.length,
  });
});
|
|
492
|
-
|
|
493
|
-
/**
 * GET /task-results -- simulated per-task results for every mock model (for charts).
 *
 * Task ids are generated as `<category>/<subcategory>/task-<n>`, with
 * ceil(30 / subcategoryCount) tasks per subcategory and a global cap of 180 tasks.
 * Each model's task score is its overall average plus a per-category offset plus a
 * sine-based term that depends on the task index and the model's rank; tokens,
 * time and cost additionally carry random noise, so responses differ per request.
 */
leaderboardRoutes.get('/task-results', async (c) => {
  const categories = ['saas-core', 'glue-code', 'ai-integration', 'frontend', 'api-integrations', 'code-evolution'];
  const subcategories: Record<string, string[]> = {
    'saas-core': ['auth', 'billing', 'multi-tenant', 'realtime', 'security', 'advanced'],
    'glue-code': ['data-pipeline', 'file-processing', 'message-queue', 'scheduler', 'webhook', 'advanced'],
    'ai-integration': ['embeddings', 'function-calling', 'multimodal', 'rag-chatbot', 'structured-output', 'advanced', 'agents', 'fine-tuning'],
    'frontend': ['components', 'accessibility', 'animation', 'performance', 'forms', 'advanced'],
    'api-integrations': ['communication', 'analytics', 'auth-provider', 'email', 'maps', 'payment', 'social', 'storage', 'stripe', 'advanced'],
    'code-evolution': ['legacy-migration', 'performance', 'refactoring', 'security', 'testing', 'advanced'],
  };

  // Per-category score offset applied on top of a model's overall average.
  const categoryBonus: Record<string, number> = {
    'saas-core': 2,
    'frontend': 3,
    'api-integrations': 1,
    'glue-code': -1,
    'ai-integration': -2,
    'code-evolution': -3,
  };

  type ModelResult = {
    modelName: string;
    score: number;
    functional: number;
    quality: number;
    passed: boolean;
    tokens: number;
    timeMs: number;
    cost: number;
  };
  type TaskResult = {
    taskId: string;
    category: string;
    subcategory: string;
    results: ModelResult[];
  };

  const roundTo = (value: number, digits: number) => parseFloat(value.toFixed(digits));

  const taskResults: TaskResult[] = [];
  let taskIndex = 0;

  for (const category of categories) {
    const subs = subcategories[category] || ['default'];
    const tasksPerSub = Math.ceil(30 / subs.length);
    const bonus = categoryBonus[category] || 0;

    for (const sub of subs) {
      for (let n = 0; n < tasksPerSub && taskIndex < 180; n++) {
        const results: ModelResult[] = mockLeaderboard.map((model) => {
          // Sine-based offset in [-5, 5], fixed for a given task index and rank.
          const variance = Math.sin(taskIndex * 0.5 + model.rank) * 5;
          const score = Math.max(0, Math.min(100, model.avgScore + bonus + variance));

          // The three random draws happen in this order: tokens, time, cost.
          const tokens = Math.round(model.totalTokens / 180 * (0.8 + Math.random() * 0.4));
          const timeMs = Math.round(model.avgTimeMs * (0.7 + Math.random() * 0.6));
          const cost = roundTo((model.totalCostUSD / 180) * (0.8 + Math.random() * 0.4), 4);

          return {
            modelName: model.modelName,
            score: roundTo(score, 1),
            functional: roundTo(score * 0.95, 1),
            quality: roundTo(model.avgQuality + variance * 0.3, 1),
            passed: score >= 60,
            tokens,
            timeMs,
            cost,
          };
        });

        taskResults.push({
          taskId: `${category}/${sub}/task-${n + 1}`,
          category,
          subcategory: sub,
          results,
        });

        taskIndex++;
      }
    }
  }

  return c.json({
    tasks: taskResults,
    totalTasks: taskResults.length,
    models: mockLeaderboard.map((m) => m.modelName),
    categories,
  });
});
|
|
579
|
-
|
|
580
|
-
/**
 * GET /category-performance -- per-category performance breakdown for every mock model.
 *
 * Each model's overall numbers are shifted by a fixed per-category offset.
 * `avgScore` and `passRate` are percentages and are clamped to 0..100, so a model
 * that passed every task cannot report a pass rate above 100 in categories with a
 * positive offset.
 */
leaderboardRoutes.get('/category-performance', async (c) => {
  const categories = ['saas-core', 'glue-code', 'ai-integration', 'frontend', 'api-integrations', 'code-evolution'];

  // Per-category offset; identical for every model, so built once.
  const categoryBonus: Record<string, number> = {
    'saas-core': 2,
    'frontend': 3,
    'api-integrations': 1,
    'glue-code': -1,
    'ai-integration': -2,
    'code-evolution': -3,
  };

  const clampPercent = (value: number) => Math.min(100, Math.max(0, value));

  const performance = categories.map((category) => {
    const bonus = categoryBonus[category] || 0;

    return {
      category,
      models: mockLeaderboard.map((model) => ({
        modelName: model.modelName,
        avgScore: parseFloat(clampPercent(model.avgScore + bonus).toFixed(1)),
        passRate: parseFloat(
          clampPercent(((model.passedTasks || 0) / 180) * 100 + bonus).toFixed(1)
        ),
        // Totals are split evenly across the six categories.
        avgTokens: Math.round(model.totalTokens / 6),
        avgTimeMs: Math.round(model.avgTimeMs * (1 + bonus * 0.05)),
        avgCost: parseFloat((model.totalCostUSD / 6).toFixed(3)),
      })),
    };
  });

  return c.json({
    performance,
    categories,
    models: mockLeaderboard.map((m) => m.modelName),
  });
});
|
|
613
|
-
|
|
614
|
-
/**
 * GET /agent/:agentName -- stats for a single agent from the mock leaderboard.
 *
 * The lookup is case-insensitive on `agentName` and returns the first matching
 * entry; responds with 404 when no entry matches. The per-category breakdown is
 * synthesized from the entry's overall score and completion rate.
 */
leaderboardRoutes.get('/agent/:agentName', async (c) => {
  const wanted = c.req.param('agentName').toLowerCase();
  const entry = mockLeaderboard.find((e) => e.agentName.toLowerCase() === wanted);

  if (!entry) {
    return c.json({ error: 'Agent not found' }, 404);
  }

  // Share of all tasks this agent completed; scales the per-category counts.
  const completionRate = entry.tasksCompleted / TOTAL_TASKS;

  // [category, score offset, completion factor]
  const breakdownSpec: Array<[string, number, number]> = [
    ['saas-core', 2, 1],
    ['glue-code', -1, 1],
    ['ai-integration', -3, 0.9],
    ['frontend', 4, 1],
    ['api-integrations', 1, 1],
    ['code-evolution', -2, 0.85],
  ];

  const taskBreakdown = breakdownSpec.map(([category, scoreOffset, completionFactor]) => ({
    category,
    avgScore: entry.avgScore + scoreOffset,
    tasksCompleted: Math.round(TASKS_PER_CATEGORY * completionRate * completionFactor),
    totalTasks: TASKS_PER_CATEGORY,
  }));

  return c.json({
    agent: entry,
    taskBreakdown,
    totalTasks: TOTAL_TASKS,
    recentSubmissions: [],
  });
});
|
|
644
|
-
|
|
645
|
-
/**
 * POST /runs -- record a new benchmark run result.
 *
 * Responses:
 *   - 503 when no database handle is configured.
 *   - 400 when the body is not valid JSON, is not a JSON object, or lacks a
 *     non-empty string `agentName`.
 *   - 201 with the inserted row on success.
 *
 * `status` defaults to 'completed' and `completedAt` is always set to the server's
 * current time. Insert failures are not caught here and propagate to the
 * application's error handling.
 */
leaderboardRoutes.post('/runs', async (c) => {
  if (!db) {
    return c.json({ error: 'Database not available' }, 503);
  }

  // The body is untrusted client input: a malformed payload is a client error,
  // not a server failure.
  let body: Record<string, any>;
  try {
    body = await c.req.json();
  } catch {
    return c.json({ error: 'Invalid JSON body' }, 400);
  }

  if (body === null || typeof body !== 'object' || Array.isArray(body)) {
    return c.json({ error: 'Request body must be a JSON object' }, 400);
  }

  // Leaderboard rows are grouped by agent name, so a run without one is unusable.
  if (typeof body.agentName !== 'string' || body.agentName.trim() === '') {
    return c.json({ error: 'agentName is required' }, 400);
  }

  const [run] = await db.insert(benchmarkRuns).values({
    agentName: body.agentName,
    agentVersion: body.agentVersion,
    seed: body.seed,
    status: body.status || 'completed',
    totalTasks: body.totalTasks,
    passedTasks: body.passedTasks,
    failedTasks: body.failedTasks,
    avgScore: body.avgScore,
    totalCost: body.totalCost,
    totalTokens: body.totalTokens,
    totalDurationMs: body.totalDurationMs,
    completedAt: new Date(),
  }).returning();

  return c.json(run, 201);
});
|
|
670
|
-
|
|
671
|
-
/**
 * GET /pricing -- current model pricing table.
 *
 * Walks MODEL_PRICING in entry order, keeps only the first entry seen for each
 * distinct `model`, and returns the rows sorted by provider name.
 */
leaderboardRoutes.get('/pricing', async (c) => {
  const seenModels = new Set<string>();
  const pricingTable = [];

  for (const [key, price] of Object.entries(MODEL_PRICING)) {
    // Several keys may point at the same model; the first one wins.
    if (seenModels.has(price.model)) {
      continue;
    }
    seenModels.add(price.model);

    pricingTable.push({
      key,
      provider: price.provider,
      model: price.model,
      inputPerMillion: price.input,
      outputPerMillion: price.output,
      lastUpdated: price.lastUpdated,
    });
  }

  pricingTable.sort((left, right) => left.provider.localeCompare(right.provider));

  return c.json({
    pricing: pricingTable,
    source: 'OpenRouter',
    lastUpdated: '2026-01-21',
    note: 'Prices in USD per 1 million tokens',
  });
});
|
|
697
|
-
|
|
698
|
-
export { leaderboardRoutes };
|