codex-linux 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/settings.local.json +10 -0
- package/.eslintrc.json +27 -0
- package/.github/workflows/ci.yml +156 -0
- package/.huskyrc +7 -0
- package/.lintstagedrc +13 -0
- package/.prettierrc +12 -0
- package/CLAUDE.md +163 -0
- package/DESIGN_SUPERIOR.md +73 -0
- package/Dockerfile +64 -0
- package/INSTALLATION.md +152 -0
- package/LICENSE +21 -0
- package/README.md +245 -0
- package/assets/skills/code-review/instructions.md +102 -0
- package/assets/skills/code-review/skill.yaml +15 -0
- package/assets/skills/refactoring/instructions.md +149 -0
- package/assets/skills/refactoring/skill.yaml +15 -0
- package/assets/skills/testing/skill.yaml +15 -0
- package/commitlint.config.js +23 -0
- package/dist/main/DatabaseManager.js +763 -0
- package/dist/main/DatabaseManager.js.map +1 -0
- package/dist/main/SettingsManager.js +61 -0
- package/dist/main/SettingsManager.js.map +1 -0
- package/dist/main/agents/AgentOrchestrator.js +787 -0
- package/dist/main/agents/AgentOrchestrator.js.map +1 -0
- package/dist/main/agents/AgentSDK.js +219 -0
- package/dist/main/agents/AgentSDK.js.map +1 -0
- package/dist/main/agents/AgentTools.js +348 -0
- package/dist/main/agents/AgentTools.js.map +1 -0
- package/dist/main/agents/CodeIndex.js +233 -0
- package/dist/main/agents/CodeIndex.js.map +1 -0
- package/dist/main/agents/EmbeddingService.js +80 -0
- package/dist/main/agents/EmbeddingService.js.map +1 -0
- package/dist/main/agents/NativeToolCalling.js +206 -0
- package/dist/main/agents/NativeToolCalling.js.map +1 -0
- package/dist/main/api/APIServer.js +278 -0
- package/dist/main/api/APIServer.js.map +1 -0
- package/dist/main/api/RateLimiter.js +138 -0
- package/dist/main/api/RateLimiter.js.map +1 -0
- package/dist/main/api/WebSocketManager.js +300 -0
- package/dist/main/api/WebSocketManager.js.map +1 -0
- package/dist/main/assistant/ContextOptimizer.js +192 -0
- package/dist/main/assistant/ContextOptimizer.js.map +1 -0
- package/dist/main/assistant/PredictedOutputManager.js +172 -0
- package/dist/main/assistant/PredictedOutputManager.js.map +1 -0
- package/dist/main/assistant/PromptCacheManager.js +193 -0
- package/dist/main/assistant/PromptCacheManager.js.map +1 -0
- package/dist/main/assistant/PromptOptimizer.js +626 -0
- package/dist/main/assistant/PromptOptimizer.js.map +1 -0
- package/dist/main/assistant/SmartCodeAssistant.js +224 -0
- package/dist/main/assistant/SmartCodeAssistant.js.map +1 -0
- package/dist/main/auth/SessionManager.js +300 -0
- package/dist/main/auth/SessionManager.js.map +1 -0
- package/dist/main/automations/AdvancedWebhookSystem.js +212 -0
- package/dist/main/automations/AdvancedWebhookSystem.js.map +1 -0
- package/dist/main/automations/AutomationScheduler.js +269 -0
- package/dist/main/automations/AutomationScheduler.js.map +1 -0
- package/dist/main/automations/BatchProcessingSystem.js +159 -0
- package/dist/main/automations/BatchProcessingSystem.js.map +1 -0
- package/dist/main/automations/BrowserAutomationManager.js +195 -0
- package/dist/main/automations/BrowserAutomationManager.js.map +1 -0
- package/dist/main/automations/GitHubActionsManager.js +129 -0
- package/dist/main/automations/GitHubActionsManager.js.map +1 -0
- package/dist/main/automations/GitLabCIManager.js +122 -0
- package/dist/main/automations/GitLabCIManager.js.map +1 -0
- package/dist/main/automations/PriorityQueueManager.js +240 -0
- package/dist/main/automations/PriorityQueueManager.js.map +1 -0
- package/dist/main/background/BackgroundModeManager.js +117 -0
- package/dist/main/background/BackgroundModeManager.js.map +1 -0
- package/dist/main/backup/BackupManager.js +254 -0
- package/dist/main/backup/BackupManager.js.map +1 -0
- package/dist/main/backup/MigrationManager.js +114 -0
- package/dist/main/backup/MigrationManager.js.map +1 -0
- package/dist/main/commands/SlashCommandManager.js +399 -0
- package/dist/main/commands/SlashCommandManager.js.map +1 -0
- package/dist/main/config/ClaudeMdParser.js +519 -0
- package/dist/main/config/ClaudeMdParser.js.map +1 -0
- package/dist/main/config/CustomizationManager.js +381 -0
- package/dist/main/config/CustomizationManager.js.map +1 -0
- package/dist/main/config/LaunchConfigManager.js +211 -0
- package/dist/main/config/LaunchConfigManager.js.map +1 -0
- package/dist/main/config/SettingsManager.js +166 -0
- package/dist/main/config/SettingsManager.js.map +1 -0
- package/dist/main/connectors/ConnectorManager.js +151 -0
- package/dist/main/connectors/ConnectorManager.js.map +1 -0
- package/dist/main/connectors/DatabaseConnector.js +222 -0
- package/dist/main/connectors/DatabaseConnector.js.map +1 -0
- package/dist/main/cowork/CoworkManager.js +324 -0
- package/dist/main/cowork/CoworkManager.js.map +1 -0
- package/dist/main/evals/AgentEvalFramework.js +538 -0
- package/dist/main/evals/AgentEvalFramework.js.map +1 -0
- package/dist/main/evals/GraderManager.js +285 -0
- package/dist/main/evals/GraderManager.js.map +1 -0
- package/dist/main/git/GitWorktreeManager.js +214 -0
- package/dist/main/git/GitWorktreeManager.js.map +1 -0
- package/dist/main/github/GitHubPRMonitor.js +244 -0
- package/dist/main/github/GitHubPRMonitor.js.map +1 -0
- package/dist/main/ide/ContinueInManager.js +181 -0
- package/dist/main/ide/ContinueInManager.js.map +1 -0
- package/dist/main/ide/IDEIntegration.js +277 -0
- package/dist/main/ide/IDEIntegration.js.map +1 -0
- package/dist/main/integrations/LinearManager.js +252 -0
- package/dist/main/integrations/LinearManager.js.map +1 -0
- package/dist/main/integrations/SlackBotManager.js +247 -0
- package/dist/main/integrations/SlackBotManager.js.map +1 -0
- package/dist/main/lsp/LSPManager.js +394 -0
- package/dist/main/lsp/LSPManager.js.map +1 -0
- package/dist/main/main.js +1087 -0
- package/dist/main/main.js.map +1 -0
- package/dist/main/mcp/MCPConfigurationManager.js +281 -0
- package/dist/main/mcp/MCPConfigurationManager.js.map +1 -0
- package/dist/main/mcp/MCPManager.js +710 -0
- package/dist/main/mcp/MCPManager.js.map +1 -0
- package/dist/main/mcp/MCPRegistry.js +272 -0
- package/dist/main/mcp/MCPRegistry.js.map +1 -0
- package/dist/main/monitoring/ErrorRecoveryManager.js +268 -0
- package/dist/main/monitoring/ErrorRecoveryManager.js.map +1 -0
- package/dist/main/monitoring/ErrorTracker.js +57 -0
- package/dist/main/monitoring/ErrorTracker.js.map +1 -0
- package/dist/main/monitoring/MetricsCollector.js +155 -0
- package/dist/main/monitoring/MetricsCollector.js.map +1 -0
- package/dist/main/monitoring/TraceGradingSystem.js +148 -0
- package/dist/main/monitoring/TraceGradingSystem.js.map +1 -0
- package/dist/main/notifications/NotificationManager.js +67 -0
- package/dist/main/notifications/NotificationManager.js.map +1 -0
- package/dist/main/pair/AIPairProgramming.js +200 -0
- package/dist/main/pair/AIPairProgramming.js.map +1 -0
- package/dist/main/plugins/PluginManager.js +222 -0
- package/dist/main/plugins/PluginManager.js.map +1 -0
- package/dist/main/plugins/PluginMarketplace.js +237 -0
- package/dist/main/plugins/PluginMarketplace.js.map +1 -0
- package/dist/main/preload.js +189 -0
- package/dist/main/preload.js.map +1 -0
- package/dist/main/preview/PreviewSessionManager.js +170 -0
- package/dist/main/preview/PreviewSessionManager.js.map +1 -0
- package/dist/main/providers/AIProviderManager.js +327 -0
- package/dist/main/providers/AIProviderManager.js.map +1 -0
- package/dist/main/providers/FineTuningManager.js +276 -0
- package/dist/main/providers/FineTuningManager.js.map +1 -0
- package/dist/main/providers/FreeModelsProvider.js +1104 -0
- package/dist/main/providers/FreeModelsProvider.js.map +1 -0
- package/dist/main/realtime/RealtimeManager.js +116 -0
- package/dist/main/realtime/RealtimeManager.js.map +1 -0
- package/dist/main/remote/CloudEnvironmentManager.js +232 -0
- package/dist/main/remote/CloudEnvironmentManager.js.map +1 -0
- package/dist/main/remote/RemoteSessionManager.js +255 -0
- package/dist/main/remote/RemoteSessionManager.js.map +1 -0
- package/dist/main/search/DeepResearchManager.js +335 -0
- package/dist/main/search/DeepResearchManager.js.map +1 -0
- package/dist/main/search/WebSearchIntegration.js +147 -0
- package/dist/main/search/WebSearchIntegration.js.map +1 -0
- package/dist/main/security/AdminConsoleManager.js +223 -0
- package/dist/main/security/AdminConsoleManager.js.map +1 -0
- package/dist/main/security/AuditLogger.js +136 -0
- package/dist/main/security/AuditLogger.js.map +1 -0
- package/dist/main/security/PermissionManager.js +144 -0
- package/dist/main/security/PermissionManager.js.map +1 -0
- package/dist/main/security/SSOManager.js +173 -0
- package/dist/main/security/SSOManager.js.map +1 -0
- package/dist/main/security/SecurityManager.js +152 -0
- package/dist/main/security/SecurityManager.js.map +1 -0
- package/dist/main/skills/SkillsManager.js +223 -0
- package/dist/main/skills/SkillsManager.js.map +1 -0
- package/dist/main/ssh/SSHManager.js +65 -0
- package/dist/main/ssh/SSHManager.js.map +1 -0
- package/dist/main/streaming/StreamingManager.js +225 -0
- package/dist/main/streaming/StreamingManager.js.map +1 -0
- package/dist/main/sync/CloudSyncManager.js +422 -0
- package/dist/main/sync/CloudSyncManager.js.map +1 -0
- package/dist/main/types.js +28 -0
- package/dist/main/types.js.map +1 -0
- package/dist/main/verification/AutoVerifyManager.js +235 -0
- package/dist/main/verification/AutoVerifyManager.js.map +1 -0
- package/dist/main/vision/ComputerUseManager.js +376 -0
- package/dist/main/vision/ComputerUseManager.js.map +1 -0
- package/dist/main/vision/ImageVideoGenerationManager.js +401 -0
- package/dist/main/vision/ImageVideoGenerationManager.js.map +1 -0
- package/dist/main/vision/VisionManager.js +172 -0
- package/dist/main/vision/VisionManager.js.map +1 -0
- package/dist/renderer/assets/main-DJlZQBCA.js +304 -0
- package/dist/renderer/assets/main-N33ZXEr8.css +1 -0
- package/dist/renderer/index.html +21 -0
- package/dist/renderer/manifest.json +42 -0
- package/dist/renderer/sw.ts +109 -0
- package/dist/shared/types.js +35 -0
- package/dist/shared/types.js.map +1 -0
- package/docker-compose.yml +65 -0
- package/docs/API.md +307 -0
- package/docs/USER_GUIDE.md +476 -0
- package/examples/plugins/sample-plugin/package.json +41 -0
- package/examples/plugins/sample-plugin/src/index.ts +75 -0
- package/index.html +20 -0
- package/jest.config.js +39 -0
- package/package.json +180 -0
- package/packages/cli/package.json +29 -0
- package/packages/cli/src/commands/agents.ts +199 -0
- package/packages/cli/src/commands/tasks.ts +61 -0
- package/packages/cli/src/index.ts +91 -0
- package/packages/cli/src/utils/api.ts +45 -0
- package/packages/cli/src/utils/config.ts +61 -0
- package/packages/npm-installer/bin/codex-linux +126 -0
- package/packages/npm-installer/lib/download.js +273 -0
- package/packages/npm-installer/package.json +42 -0
- package/packages/vscode-extension/package.json +167 -0
- package/packages/vscode-extension/src/api.ts +68 -0
- package/packages/vscode-extension/src/extension.ts +161 -0
- package/packages/vscode-extension/src/panels/chatPanel.ts +265 -0
- package/packages/vscode-extension/src/panels/createAgentPanel.ts +227 -0
- package/packages/vscode-extension/src/providers/agentsProvider.ts +80 -0
- package/postcss.config.js +6 -0
- package/public/manifest.json +42 -0
- package/public/sw.ts +109 -0
- package/scripts/install-dev.sh +103 -0
- package/scripts/install.sh +275 -0
- package/src/main/DatabaseManager.ts +950 -0
- package/src/main/SettingsManager.ts +63 -0
- package/src/main/agents/AgentOrchestrator.ts +930 -0
- package/src/main/agents/AgentSDK.ts +269 -0
- package/src/main/agents/AgentTools.ts +380 -0
- package/src/main/agents/CodeIndex.ts +240 -0
- package/src/main/agents/EmbeddingService.ts +88 -0
- package/src/main/agents/NativeToolCalling.ts +245 -0
- package/src/main/api/APIServer.ts +316 -0
- package/src/main/api/RateLimiter.ts +165 -0
- package/src/main/api/WebSocketManager.ts +398 -0
- package/src/main/assistant/ContextOptimizer.ts +214 -0
- package/src/main/assistant/PredictedOutputManager.ts +265 -0
- package/src/main/assistant/PromptCacheManager.ts +280 -0
- package/src/main/assistant/PromptOptimizer.ts +746 -0
- package/src/main/assistant/SmartCodeAssistant.ts +234 -0
- package/src/main/auth/SessionManager.ts +415 -0
- package/src/main/automations/AdvancedWebhookSystem.ts +281 -0
- package/src/main/automations/AutomationScheduler.ts +272 -0
- package/src/main/automations/BatchProcessingSystem.ts +207 -0
- package/src/main/automations/BrowserAutomationManager.ts +203 -0
- package/src/main/automations/GitHubActionsManager.ts +151 -0
- package/src/main/automations/GitLabCIManager.ts +206 -0
- package/src/main/automations/PriorityQueueManager.ts +328 -0
- package/src/main/background/BackgroundModeManager.ts +130 -0
- package/src/main/backup/BackupManager.ts +287 -0
- package/src/main/backup/MigrationManager.ts +132 -0
- package/src/main/commands/SlashCommandManager.ts +407 -0
- package/src/main/config/ClaudeMdParser.ts +539 -0
- package/src/main/config/CustomizationManager.ts +493 -0
- package/src/main/config/LaunchConfigManager.ts +212 -0
- package/src/main/config/SettingsManager.ts +163 -0
- package/src/main/connectors/ConnectorManager.ts +175 -0
- package/src/main/connectors/DatabaseConnector.ts +212 -0
- package/src/main/cowork/CoworkManager.ts +431 -0
- package/src/main/evals/AgentEvalFramework.ts +665 -0
- package/src/main/evals/GraderManager.ts +417 -0
- package/src/main/git/GitWorktreeManager.ts +211 -0
- package/src/main/github/GitHubPRMonitor.ts +317 -0
- package/src/main/ide/ContinueInManager.ts +180 -0
- package/src/main/ide/IDEIntegration.ts +288 -0
- package/src/main/integrations/LinearManager.ts +327 -0
- package/src/main/integrations/SlackBotManager.ts +312 -0
- package/src/main/lsp/LSPManager.ts +445 -0
- package/src/main/main.ts +1221 -0
- package/src/main/mcp/MCPConfigurationManager.ts +281 -0
- package/src/main/mcp/MCPManager.ts +799 -0
- package/src/main/mcp/MCPRegistry.ts +273 -0
- package/src/main/monitoring/ErrorRecoveryManager.ts +359 -0
- package/src/main/monitoring/ErrorTracker.ts +60 -0
- package/src/main/monitoring/MetricsCollector.ts +196 -0
- package/src/main/monitoring/TraceGradingSystem.ts +196 -0
- package/src/main/notifications/NotificationManager.ts +96 -0
- package/src/main/pair/AIPairProgramming.ts +290 -0
- package/src/main/plugins/PluginManager.ts +266 -0
- package/src/main/plugins/PluginMarketplace.ts +318 -0
- package/src/main/preload.ts +215 -0
- package/src/main/preview/PreviewSessionManager.ts +186 -0
- package/src/main/providers/AIProviderManager.ts +394 -0
- package/src/main/providers/FineTuningManager.ts +390 -0
- package/src/main/providers/FreeModelsProvider.ts +1156 -0
- package/src/main/realtime/RealtimeManager.ts +147 -0
- package/src/main/remote/CloudEnvironmentManager.ts +253 -0
- package/src/main/remote/RemoteSessionManager.ts +323 -0
- package/src/main/search/DeepResearchManager.ts +458 -0
- package/src/main/search/WebSearchIntegration.ts +203 -0
- package/src/main/security/AdminConsoleManager.ts +244 -0
- package/src/main/security/AuditLogger.ts +143 -0
- package/src/main/security/PermissionManager.ts +184 -0
- package/src/main/security/SSOManager.ts +241 -0
- package/src/main/security/SecurityManager.ts +139 -0
- package/src/main/skills/SkillsManager.ts +218 -0
- package/src/main/ssh/SSHManager.ts +86 -0
- package/src/main/streaming/StreamingManager.ts +306 -0
- package/src/main/sync/CloudSyncManager.ts +532 -0
- package/src/main/verification/AutoVerifyManager.ts +285 -0
- package/src/main/vision/ComputerUseManager.ts +475 -0
- package/src/main/vision/ImageVideoGenerationManager.ts +526 -0
- package/src/main/vision/VisionManager.ts +186 -0
- package/src/renderer/App.tsx +314 -0
- package/src/renderer/components/AdvancedSettingsPanel.tsx +225 -0
- package/src/renderer/components/AgentPanel.tsx +760 -0
- package/src/renderer/components/AppPreview.tsx +220 -0
- package/src/renderer/components/AuditTrailPanel.tsx +148 -0
- package/src/renderer/components/AutomationPanel.tsx +220 -0
- package/src/renderer/components/ChatInterface.tsx +595 -0
- package/src/renderer/components/ChatTab.tsx +296 -0
- package/src/renderer/components/CodeEditor.tsx +257 -0
- package/src/renderer/components/CodeReviewPanel.tsx +256 -0
- package/src/renderer/components/CodeWorkspace.tsx +192 -0
- package/src/renderer/components/CodebaseDashboard.tsx +295 -0
- package/src/renderer/components/ComputerUsePanel.tsx +262 -0
- package/src/renderer/components/ConnectorsPanel.tsx +471 -0
- package/src/renderer/components/ContextMenu.tsx +155 -0
- package/src/renderer/components/ContextUsageDisplay.tsx +248 -0
- package/src/renderer/components/CoworkPanel.tsx +415 -0
- package/src/renderer/components/DiffViewer.tsx +452 -0
- package/src/renderer/components/ErrorBoundary.tsx +273 -0
- package/src/renderer/components/ExtendedThinkingToggle.tsx +244 -0
- package/src/renderer/components/FileAttachments.tsx +247 -0
- package/src/renderer/components/FileExplorer.tsx +242 -0
- package/src/renderer/components/FileExplorerPanel.tsx +302 -0
- package/src/renderer/components/GitPanel.tsx +154 -0
- package/src/renderer/components/Header.tsx +113 -0
- package/src/renderer/components/MCPPanel.tsx +326 -0
- package/src/renderer/components/MentionAutocomplete.tsx +239 -0
- package/src/renderer/components/PermissionPanel.tsx +159 -0
- package/src/renderer/components/PermissionSelector.tsx +203 -0
- package/src/renderer/components/PluginMarketplace.tsx +325 -0
- package/src/renderer/components/PromptOptimizerPanel.tsx +399 -0
- package/src/renderer/components/SearchPanel.tsx +173 -0
- package/src/renderer/components/SearchReplace.tsx +284 -0
- package/src/renderer/components/SessionSidebar.tsx +367 -0
- package/src/renderer/components/SettingsPanel.tsx +426 -0
- package/src/renderer/components/Sidebar.tsx +100 -0
- package/src/renderer/components/SkillsPanel.tsx +245 -0
- package/src/renderer/components/SplitPane.tsx +173 -0
- package/src/renderer/components/Terminal.tsx +190 -0
- package/src/renderer/components/VoiceCommand.tsx +129 -0
- package/src/renderer/components/WorktreePanel.tsx +163 -0
- package/src/renderer/components/ui/AriaComponents.tsx +193 -0
- package/src/renderer/components/ui/Button.tsx +68 -0
- package/src/renderer/components/ui/Card.tsx +102 -0
- package/src/renderer/components/ui/Input.tsx +44 -0
- package/src/renderer/components/ui/Skeleton.tsx +55 -0
- package/src/renderer/components/ui/VirtualList.tsx +196 -0
- package/src/renderer/i18n/I18nProvider.tsx +101 -0
- package/src/renderer/i18n/de.ts +161 -0
- package/src/renderer/i18n/en.ts +163 -0
- package/src/renderer/i18n/es.ts +161 -0
- package/src/renderer/i18n/fr.ts +161 -0
- package/src/renderer/i18n/index.ts +44 -0
- package/src/renderer/index.css +129 -0
- package/src/renderer/lib/accessibility.tsx +287 -0
- package/src/renderer/lib/hooks.ts +304 -0
- package/src/renderer/lib/utils.ts +6 -0
- package/src/renderer/main.tsx +25 -0
- package/src/renderer/styles/minimalist.css +539 -0
- package/src/renderer/sw.ts +180 -0
- package/src/renderer/types.d.ts +138 -0
- package/src/shared/types.ts +813 -0
- package/supabase/schema.sql +234 -0
- package/tailwind.config.js +78 -0
- package/tests/e2e/package.json +15 -0
- package/tests/e2e/playwright.config.ts +31 -0
- package/tests/e2e/specs/app.spec.ts +194 -0
- package/tests/setup.ts +99 -0
- package/tests/unit/AgentOrchestrator.test.ts +274 -0
- package/tests/unit/DatabaseManager.test.ts +262 -0
- package/tests/unit/GitWorktreeManager.test.ts +150 -0
- package/tests/unit/SecurityManager.test.ts +110 -0
- package/tsconfig.main.json +22 -0
- package/tsconfig.renderer.json +27 -0
- package/vite.config.ts +28 -0
|
@@ -0,0 +1,665 @@
|
|
|
1
|
+
import { EventEmitter } from 'events';
|
|
2
|
+
import { v4 as uuidv4 } from 'uuid';
|
|
3
|
+
import log from 'electron-log';
|
|
4
|
+
import { AgentOrchestrator } from '../agents/AgentOrchestrator';
|
|
5
|
+
import { AIProviderManager } from '../providers/AIProviderManager';
|
|
6
|
+
import { Agent, AgentMessage } from '../../shared/types';
|
|
7
|
+
import * as fs from 'fs/promises';
|
|
8
|
+
import * as path from 'path';
|
|
9
|
+
|
|
10
|
+
/**
 * Categories an evaluation test case can be filed under.
 *
 * Every member is a string enum whose runtime value is the snake_case form
 * of its name.
 */
export enum EvalCategory {
  /** Runtime value `'code_generation'`. */
  CODE_GENERATION = 'code_generation',

  /** Runtime value `'code_review'`. */
  CODE_REVIEW = 'code_review',

  /** Runtime value `'refactoring'`. */
  REFACTORING = 'refactoring',

  /** Runtime value `'debugging'`. */
  DEBUGGING = 'debugging',

  /** Runtime value `'documentation'`. */
  DOCUMENTATION = 'documentation',

  /** Runtime value `'testing'`. */
  TESTING = 'testing',

  /** Runtime value `'security'`. */
  SECURITY = 'security',

  /** Runtime value `'performance'`. */
  PERFORMANCE = 'performance',
}
|
|
20
|
+
|
|
21
|
+
/**
 * A single evaluation scenario: the prompt given to an agent plus the
 * criteria used to judge the outcome.
 */
export interface EvalTestCase {
  /** Identifier of the test case (built-in cases use ids such as `codegen-001`). */
  id: string;

  /** Short display name. */
  name: string;

  /** Category this test case is filed under. */
  category: EvalCategory;

  /** Longer human-readable description. */
  description: string;

  /** Prompt text for the agent under evaluation. */
  prompt: string;

  /** Optional supporting material supplied alongside the prompt. */
  context?: {
    /** Files (path plus full content) made available for the test. */
    files?: {
      path: string;
      content: string;
    }[];
    // NOTE(review): whether this is a path, a name or inline text is not
    // shown in this part of the file — confirm against the consumer.
    codebase?: string;
  };

  /** Optional expected textual output. */
  expectedOutput?: string;

  /** Optional expectations about files after the run. */
  expectedFiles?: {
    path: string;
    /** Whether the file is expected to exist. */
    shouldExist: boolean;
    /** Optional pattern for the file's content. */
    contentPattern?: RegExp;
  }[];

  /** Criteria applied to the result of running this test case. */
  validationCriteria: EvalCriteria[];

  // Built-in cases use 60000 and 90000 — presumably milliseconds; TODO confirm.
  timeout: number;

  /** Difficulty label. */
  difficulty:
    | 'easy'
    | 'medium'
    | 'hard';

  /** Free-form labels (e.g. `'typescript'`, `'security'`). */
  tags: string[];
}
|
|
45
|
+
|
|
46
|
+
/**
 * One weighted check applied to an evaluation result.
 */
export interface EvalCriteria {
  /** Identifier of the criterion (e.g. `'syntax-valid'`). */
  id: string;

  /** Short display name. */
  name: string;

  /** Explanation of what the criterion checks. */
  description: string;

  // The built-in criteria use fractional weights such as 0.2, 0.3 and 0.4.
  // NOTE(review): how the weight enters the final score is not visible in
  // this part of the file — confirm against the scoring code.
  weight: number;

  /**
   * Predicate evaluated against a result. It may answer synchronously or
   * through a promise, so callers should await the returned value.
   */
  validator: (result: EvalResult) => boolean | Promise<boolean>;
}
|
|
53
|
+
|
|
54
|
+
/**
 * Outcome of running one test case against one agent.
 */
export interface EvalResult {
  /** Id of the test case that produced this result. */
  testCaseId: string;

  /** Id of the agent that was evaluated. */
  agentId: string;

  /** Overall pass/fail flag. */
  success: boolean;

  /** Score achieved. */
  score: number;

  /** Highest score that could have been achieved. */
  maxScore: number;

  // Presumably milliseconds — the unit is not established here; TODO confirm.
  duration: number;

  /** Messages recorded for the run; the built-in validators read each message's `content`. */
  messages: AgentMessage[];

  /** Per-criterion outcomes. */
  criteriaResults: {
    criteriaId: string;
    passed: boolean;
    score: number;
    /** Optional free-text detail about the outcome. */
    details?: string;
  }[];

  /** Error messages collected for the run. */
  errors: string[];

  /** Warning messages collected for the run. */
  warnings: string[];

  /** Optional usage information about the run. */
  metadata: {
    tokensUsed?: number;
    modelUsed?: string;
    // NOTE(review): currency/unit is not shown in this part of the file.
    cost?: number;
  };

  /** Timestamp attached to the result. */
  createdAt: Date;
}
|
|
77
|
+
|
|
78
|
+
/**
 * A named, versioned collection of evaluation test cases.
 */
export interface EvalDataset {
  /** Identifier of the dataset (built-in datasets use ids such as `code-generation-v1`). */
  id: string;

  /** Short display name. */
  name: string;

  /** Explanation of what the dataset measures. */
  description: string;

  /** Version string (built-in datasets use `'1.0.0'`). */
  version: string;

  /** Test cases contained in the dataset. */
  testCases: Array<EvalTestCase>;

  /** Creation timestamp. */
  createdAt: Date;

  /** Last-update timestamp. */
  updatedAt: Date;
}
|
|
87
|
+
|
|
88
|
+
/**
 * One execution of a dataset against a particular agent configuration,
 * together with its results and aggregate figures.
 */
export interface EvalRun {
  /** Identifier of the run. */
  id: string;

  /** Id of the dataset being run. */
  datasetId: string;

  /** Provider, model and skills used for the agent in this run. */
  agentConfig: {
    providerId: string;
    model: string;
    skills: string[];
  };

  /** Results recorded for this run. */
  results: Array<EvalResult>;

  /** Aggregate figures for the run. */
  summary: {
    totalTests: number;
    passedTests: number;
    failedTests: number;
    averageScore: number;
    // Presumably the same unit as EvalResult.duration — TODO confirm.
    totalDuration: number;
  };

  /** Lifecycle state of the run. */
  status:
    | 'pending'
    | 'running'
    | 'completed'
    | 'failed';

  /** When the run started. */
  startedAt: Date;

  /** When the run completed; optional, so it may be absent. */
  completedAt?: Date;
}
|
|
108
|
+
|
|
109
|
+
export class AgentEvalFramework extends EventEmitter {
|
|
110
|
+
private agentOrchestrator: AgentOrchestrator;
|
|
111
|
+
private aiProviderManager: AIProviderManager;
|
|
112
|
+
private datasets: Map<string, EvalDataset> = new Map();
|
|
113
|
+
private activeRuns: Map<string, EvalRun> = new Map();
|
|
114
|
+
private runHistory: EvalRun[] = [];
|
|
115
|
+
private datasetsPath: string;
|
|
116
|
+
|
|
117
|
+
/**
 * Creates the framework and stores its collaborators.
 *
 * @param agentOrchestrator Orchestrator instance kept on the framework.
 * @param aiProviderManager Provider manager instance kept on the framework.
 * @param datasetsPath Path kept as the datasets location; defaults to
 *   `'./eval-datasets'`.
 */
constructor(
  agentOrchestrator: AgentOrchestrator,
  aiProviderManager: AIProviderManager,
  datasetsPath: string = './eval-datasets'
) {
  // EventEmitter must be initialised before `this` can be touched.
  super();

  this.datasetsPath = datasetsPath;
  this.aiProviderManager = aiProviderManager;
  this.agentOrchestrator = agentOrchestrator;
}
|
|
127
|
+
|
|
128
|
+
/**
 * Runs the dataset loaders one after another — default datasets first,
 * then custom datasets — and logs once both have finished.
 *
 * A rejection from either loader propagates to the caller and skips the
 * log line.
 */
async initialize(): Promise<void> {
  // Order matters: the default loader must finish before the custom one starts.
  const loadSteps: Array<() => Promise<void>> = [
    () => this.loadDefaultDatasets(),
    () => this.loadCustomDatasets(),
  ];

  for (const runStep of loadSteps) {
    await runStep();
  }

  log.info('Agent Evaluation Framework initialized');
}
|
|
133
|
+
|
|
134
|
+
private async loadDefaultDatasets(): Promise<void> {
|
|
135
|
+
// Code Generation Dataset
|
|
136
|
+
const codeGenDataset: EvalDataset = {
|
|
137
|
+
id: 'code-generation-v1',
|
|
138
|
+
name: 'Code Generation Benchmark',
|
|
139
|
+
description: 'Tests agent ability to generate correct and efficient code',
|
|
140
|
+
version: '1.0.0',
|
|
141
|
+
createdAt: new Date(),
|
|
142
|
+
updatedAt: new Date(),
|
|
143
|
+
testCases: [
|
|
144
|
+
{
|
|
145
|
+
id: 'codegen-001',
|
|
146
|
+
name: 'Generate Fibonacci Function',
|
|
147
|
+
category: EvalCategory.CODE_GENERATION,
|
|
148
|
+
description: 'Generate an efficient Fibonacci function',
|
|
149
|
+
prompt: 'Write a TypeScript function that calculates the nth Fibonacci number efficiently. The function should handle edge cases and large inputs (n up to 1000).',
|
|
150
|
+
expectedFiles: [{
|
|
151
|
+
path: 'fibonacci.ts',
|
|
152
|
+
shouldExist: true,
|
|
153
|
+
contentPattern: /function\s+fibonacci|const\s+fibonacci/,
|
|
154
|
+
}],
|
|
155
|
+
validationCriteria: [
|
|
156
|
+
{
|
|
157
|
+
id: 'syntax-valid',
|
|
158
|
+
name: 'Valid TypeScript Syntax',
|
|
159
|
+
description: 'Generated code must be valid TypeScript',
|
|
160
|
+
weight: 0.2,
|
|
161
|
+
validator: (result) => result.errors.length === 0,
|
|
162
|
+
},
|
|
163
|
+
{
|
|
164
|
+
id: 'efficient',
|
|
165
|
+
name: 'Efficient Implementation',
|
|
166
|
+
description: 'Uses memoization or iterative approach, not naive recursion',
|
|
167
|
+
weight: 0.3,
|
|
168
|
+
validator: (result) => {
|
|
169
|
+
const content = result.messages.map(m => m.content).join('');
|
|
170
|
+
return /memo|cache|iterat|loop|for\s*\(|while\s*\(/.test(content);
|
|
171
|
+
},
|
|
172
|
+
},
|
|
173
|
+
{
|
|
174
|
+
id: 'handles-edge-cases',
|
|
175
|
+
name: 'Edge Case Handling',
|
|
176
|
+
description: 'Handles negative inputs and large numbers',
|
|
177
|
+
weight: 0.3,
|
|
178
|
+
validator: (result) => {
|
|
179
|
+
const content = result.messages.map(m => m.content).join('');
|
|
180
|
+
return /if.*n\s*<\s*0|n\s*===\s*0|n\s*===\s*1|throw|error/i.test(content);
|
|
181
|
+
},
|
|
182
|
+
},
|
|
183
|
+
{
|
|
184
|
+
id: 'includes-tests',
|
|
185
|
+
name: 'Includes Test Cases',
|
|
186
|
+
description: 'Provides example usage or test cases',
|
|
187
|
+
weight: 0.2,
|
|
188
|
+
validator: (result) => {
|
|
189
|
+
const content = result.messages.map(m => m.content).join('');
|
|
190
|
+
return /test|example|console\.log|expect/.test(content);
|
|
191
|
+
},
|
|
192
|
+
},
|
|
193
|
+
],
|
|
194
|
+
timeout: 60000,
|
|
195
|
+
difficulty: 'medium',
|
|
196
|
+
tags: ['typescript', 'algorithms', 'fibonacci'],
|
|
197
|
+
},
|
|
198
|
+
{
|
|
199
|
+
id: 'codegen-002',
|
|
200
|
+
name: 'API Endpoint Implementation',
|
|
201
|
+
category: EvalCategory.CODE_GENERATION,
|
|
202
|
+
description: 'Generate a REST API endpoint with validation',
|
|
203
|
+
prompt: 'Create an Express.js POST endpoint for creating a user. Include input validation, error handling, and proper response formatting.',
|
|
204
|
+
validationCriteria: [
|
|
205
|
+
{
|
|
206
|
+
id: 'express-route',
|
|
207
|
+
name: 'Express Route Definition',
|
|
208
|
+
description: 'Uses app.post() or router.post()',
|
|
209
|
+
weight: 0.2,
|
|
210
|
+
validator: (result) => {
|
|
211
|
+
const content = result.messages.map(m => m.content).join('');
|
|
212
|
+
return /app\.post|router\.post/.test(content);
|
|
213
|
+
},
|
|
214
|
+
},
|
|
215
|
+
{
|
|
216
|
+
id: 'validation',
|
|
217
|
+
name: 'Input Validation',
|
|
218
|
+
description: 'Validates request body',
|
|
219
|
+
weight: 0.3,
|
|
220
|
+
validator: (result) => {
|
|
221
|
+
const content = result.messages.map(m => m.content).join('');
|
|
222
|
+
return /validate|joi|zod|express-validator|req\.body/.test(content);
|
|
223
|
+
},
|
|
224
|
+
},
|
|
225
|
+
{
|
|
226
|
+
id: 'error-handling',
|
|
227
|
+
name: 'Error Handling',
|
|
228
|
+
description: 'Includes try-catch or error middleware',
|
|
229
|
+
weight: 0.3,
|
|
230
|
+
validator: (result) => {
|
|
231
|
+
const content = result.messages.map(m => m.content).join('');
|
|
232
|
+
return /try\s*{|catch|error|Error/.test(content);
|
|
233
|
+
},
|
|
234
|
+
},
|
|
235
|
+
{
|
|
236
|
+
id: 'response-format',
|
|
237
|
+
name: 'Proper Response Format',
|
|
238
|
+
description: 'Returns JSON with status codes',
|
|
239
|
+
weight: 0.2,
|
|
240
|
+
validator: (result) => {
|
|
241
|
+
const content = result.messages.map(m => m.content).join('');
|
|
242
|
+
return /res\.json|res\.status|200|201|400|500/.test(content);
|
|
243
|
+
},
|
|
244
|
+
},
|
|
245
|
+
],
|
|
246
|
+
timeout: 90000,
|
|
247
|
+
difficulty: 'medium',
|
|
248
|
+
tags: ['express', 'api', 'validation', 'javascript'],
|
|
249
|
+
},
|
|
250
|
+
],
|
|
251
|
+
};
|
|
252
|
+
|
|
253
|
+
// Code Review Dataset
|
|
254
|
+
const codeReviewDataset: EvalDataset = {
|
|
255
|
+
id: 'code-review-v1',
|
|
256
|
+
name: 'Code Review Benchmark',
|
|
257
|
+
description: 'Tests agent ability to identify issues and suggest improvements',
|
|
258
|
+
version: '1.0.0',
|
|
259
|
+
createdAt: new Date(),
|
|
260
|
+
updatedAt: new Date(),
|
|
261
|
+
testCases: [
|
|
262
|
+
{
|
|
263
|
+
id: 'review-001',
|
|
264
|
+
name: 'Security Issues Detection',
|
|
265
|
+
category: EvalCategory.CODE_REVIEW,
|
|
266
|
+
description: 'Identify security vulnerabilities in code',
|
|
267
|
+
prompt: 'Review this code for security issues:\n\n```javascript\nfunction authenticateUser(username, password) {\n const query = `SELECT * FROM users WHERE username = \'${username}\' AND password = \'${password}\'`;\n return db.query(query);\n}\n```\n\nIdentify all security issues and suggest fixes.',
|
|
268
|
+
context: {
|
|
269
|
+
files: [{
|
|
270
|
+
path: 'auth.js',
|
|
271
|
+
content: `function authenticateUser(username, password) {
|
|
272
|
+
const query = \`SELECT * FROM users WHERE username = '\${username}' AND password = '\${password}'\`;
|
|
273
|
+
return db.query(query);
|
|
274
|
+
}`,
|
|
275
|
+
}],
|
|
276
|
+
},
|
|
277
|
+
validationCriteria: [
|
|
278
|
+
{
|
|
279
|
+
id: 'sql-injection',
|
|
280
|
+
name: 'SQL Injection Detection',
|
|
281
|
+
description: 'Identifies SQL injection vulnerability',
|
|
282
|
+
weight: 0.4,
|
|
283
|
+
validator: (result) => {
|
|
284
|
+
const content = result.messages.map(m => m.content).join('').toLowerCase();
|
|
285
|
+
return /sql injection|sqli|parameterized|prepared statement/.test(content);
|
|
286
|
+
},
|
|
287
|
+
},
|
|
288
|
+
{
|
|
289
|
+
id: 'plaintext-password',
|
|
290
|
+
name: 'Plaintext Password Detection',
|
|
291
|
+
description: 'Identifies plaintext password storage',
|
|
292
|
+
weight: 0.3,
|
|
293
|
+
validator: (result) => {
|
|
294
|
+
const content = result.messages.map(m => m.content).join('').toLowerCase();
|
|
295
|
+
return /plaintext|hash|bcrypt|scrypt|argon2/.test(content);
|
|
296
|
+
},
|
|
297
|
+
},
|
|
298
|
+
{
|
|
299
|
+
id: 'provides-fix',
|
|
300
|
+
name: 'Provides Fix',
|
|
301
|
+
description: 'Provides corrected code',
|
|
302
|
+
weight: 0.3,
|
|
303
|
+
validator: (result) => {
|
|
304
|
+
const content = result.messages.map(m => m.content).join('');
|
|
305
|
+
return /```[\s\S]*?```/.test(content);
|
|
306
|
+
},
|
|
307
|
+
},
|
|
308
|
+
],
|
|
309
|
+
timeout: 60000,
|
|
310
|
+
difficulty: 'easy',
|
|
311
|
+
tags: ['security', 'sql-injection', 'review'],
|
|
312
|
+
},
|
|
313
|
+
],
|
|
314
|
+
};
|
|
315
|
+
|
|
316
|
+
// Refactoring Dataset
|
|
317
|
+
const refactoringDataset: EvalDataset = {
|
|
318
|
+
id: 'refactoring-v1',
|
|
319
|
+
name: 'Refactoring Benchmark',
|
|
320
|
+
description: 'Tests agent ability to improve code quality',
|
|
321
|
+
version: '1.0.0',
|
|
322
|
+
createdAt: new Date(),
|
|
323
|
+
updatedAt: new Date(),
|
|
324
|
+
testCases: [
|
|
325
|
+
{
|
|
326
|
+
id: 'refactor-001',
|
|
327
|
+
name: 'Extract Function Refactoring',
|
|
328
|
+
category: EvalCategory.REFACTORING,
|
|
329
|
+
description: 'Refactor duplicated code into a function',
|
|
330
|
+
prompt: 'Refactor this code to eliminate duplication:\n\n```typescript\nfunction processOrders(orders: Order[]) {\n for (const order of orders) {\n if (order.status === "pending") {\n const tax = order.amount * 0.1;\n const total = order.amount + tax;\n order.total = total;\n order.status = "processed";\n }\n }\n \n for (const order of orders) {\n if (order.priority === "high") {\n const tax = order.amount * 0.1;\n const total = order.amount + tax;\n order.total = total;\n order.priority = "processed";\n }\n }\n}\n```',
|
|
331
|
+
validationCriteria: [
|
|
332
|
+
{
|
|
333
|
+
id: 'extracts-function',
|
|
334
|
+
name: 'Extracts Helper Function',
|
|
335
|
+
description: 'Creates a separate function for tax calculation',
|
|
336
|
+
weight: 0.4,
|
|
337
|
+
validator: (result) => {
|
|
338
|
+
const content = result.messages.map(m => m.content).join('');
|
|
339
|
+
return /function\s+\w+.*tax|calculateTax|computeTax/.test(content);
|
|
340
|
+
},
|
|
341
|
+
},
|
|
342
|
+
{
|
|
343
|
+
id: 'removes-duplication',
|
|
344
|
+
name: 'Removes Code Duplication',
|
|
345
|
+
description: 'Tax calculation appears only once',
|
|
346
|
+
weight: 0.4,
|
|
347
|
+
validator: (result) => {
|
|
348
|
+
const content = result.messages.map(m => m.content).join('');
|
|
349
|
+
const matches = content.match(/\*\s*0\.1/g);
|
|
350
|
+
return !matches || matches.length <= 1;
|
|
351
|
+
},
|
|
352
|
+
},
|
|
353
|
+
{
|
|
354
|
+
id: 'maintains-functionality',
|
|
355
|
+
name: 'Maintains Functionality',
|
|
356
|
+
description: 'Refactored code produces same results',
|
|
357
|
+
weight: 0.2,
|
|
358
|
+
validator: () => true, // Would require actual execution
|
|
359
|
+
},
|
|
360
|
+
],
|
|
361
|
+
timeout: 90000,
|
|
362
|
+
difficulty: 'medium',
|
|
363
|
+
tags: ['refactoring', 'dry', 'typescript'],
|
|
364
|
+
},
|
|
365
|
+
],
|
|
366
|
+
};
|
|
367
|
+
|
|
368
|
+
this.datasets.set(codeGenDataset.id, codeGenDataset);
|
|
369
|
+
this.datasets.set(codeReviewDataset.id, codeReviewDataset);
|
|
370
|
+
this.datasets.set(refactoringDataset.id, refactoringDataset);
|
|
371
|
+
|
|
372
|
+
log.info(`Loaded ${this.datasets.size} default evaluation datasets`);
|
|
373
|
+
}
|
|
374
|
+
|
|
375
|
+
/**
 * Loads user-supplied datasets from `this.datasetsPath`.
 *
 * The directory is created when missing. Every `*.json` file in it is parsed
 * and, if it looks like a dataset, registered in `this.datasets` under its
 * `id` (replacing any dataset already registered with that id).
 *
 * A file that cannot be read, parsed or validated is logged and skipped so a
 * single bad file does not prevent the remaining ones from loading. Failures
 * of the directory operations themselves are logged and swallowed.
 */
private async loadCustomDatasets(): Promise<void> {
  try {
    await fs.mkdir(this.datasetsPath, { recursive: true });
    const files = await fs.readdir(this.datasetsPath);

    for (const file of files) {
      if (!file.endsWith('.json')) {
        continue;
      }

      try {
        const content = await fs.readFile(
          path.join(this.datasetsPath, file),
          'utf-8'
        );
        const parsed = JSON.parse(content);

        // JSON.parse returns unchecked data. Without an id the dataset would be
        // registered under the key `undefined`, and without a testCases array
        // createRun would throw when it filters the test cases.
        if (
          !parsed ||
          typeof parsed.id !== 'string' ||
          parsed.id.length === 0 ||
          !Array.isArray(parsed.testCases)
        ) {
          throw new Error('Dataset must have a non-empty string "id" and a "testCases" array');
        }

        // JSON has no date type: Date fields written by JSON.stringify come
        // back as ISO strings, so turn them into Date objects again.
        if (typeof parsed.createdAt === 'string') {
          parsed.createdAt = new Date(parsed.createdAt);
        }
        if (typeof parsed.updatedAt === 'string') {
          parsed.updatedAt = new Date(parsed.updatedAt);
        }

        // NOTE(review): validator functions cannot be represented in JSON, so
        // criteria loaded from disk will not carry a callable `validator`.
        const dataset: EvalDataset = parsed;
        this.datasets.set(dataset.id, dataset);
        log.info(`Loaded custom dataset: ${dataset.name}`);
      } catch (error) {
        log.error(`Failed to load dataset ${file}:`, error);
      }
    }
  } catch (error) {
    log.error('Failed to load custom datasets:', error);
  }
}
|
|
399
|
+
|
|
400
|
+
/**
 * Creates an evaluation run for a dataset and starts it in the background.
 *
 * The returned run object is live: it is mutated as the run progresses, and
 * the promise resolves as soon as the run has been registered, not when it
 * finishes. Progress is reported through the `run:*` and `test:completed`
 * events.
 *
 * @param datasetId   Id of a registered dataset.
 * @param agentConfig Provider, model and skills used for the evaluation agents.
 * @param options     `testCaseIds` restricts the run to those test cases
 *                    (ids that do not exist in the dataset are ignored);
 *                    `parallel` / `maxParallel` are forwarded to the executor.
 * @returns The newly registered run, initially in status `pending`.
 * @throws Error when no dataset is registered under `datasetId`.
 */
async createRun(
  datasetId: string,
  agentConfig: EvalRun['agentConfig'],
  options: {
    testCaseIds?: string[];
    parallel?: boolean;
    maxParallel?: number;
  } = {}
): Promise<EvalRun> {
  const dataset = this.datasets.get(datasetId);
  if (!dataset) {
    throw new Error(`Dataset ${datasetId} not found`);
  }

  const runId = uuidv4();

  // A Set keeps the membership test O(1) per test case.
  const requestedIds = options.testCaseIds ? new Set(options.testCaseIds) : undefined;
  const testCases = requestedIds
    ? dataset.testCases.filter(tc => requestedIds.has(tc.id))
    : dataset.testCases;

  const run: EvalRun = {
    id: runId,
    datasetId,
    agentConfig,
    results: [],
    summary: {
      totalTests: testCases.length,
      passedTests: 0,
      failedTests: 0,
      averageScore: 0,
      totalDuration: 0,
    },
    status: 'pending',
    startedAt: new Date(),
  };

  this.activeRuns.set(runId, run);
  this.emit('run:created', { runId, datasetId });

  // Start evaluation without awaiting it; failures are handled here so the
  // promise never goes unobserved.
  this.executeRun(run, testCases, options).catch(error => {
    log.error(`Eval run ${runId} failed:`, error);
    run.status = 'failed';
    run.completedAt = new Date();

    // A failed run is finished: take it out of the active set and keep it in
    // the history so it stays reachable for reports instead of lingering as
    // "active" forever. The guard avoids a duplicate entry when the failure
    // happened after the executor had already archived the run.
    this.activeRuns.delete(runId);
    if (!this.runHistory.includes(run)) {
      this.runHistory.push(run);
    }

    this.emit('run:failed', { runId, error });
  });

  return run;
}
|
|
447
|
+
|
|
448
|
+
/**
 * Runs every test case of an evaluation run and finalizes the run.
 *
 * In parallel mode the test cases are processed in batches of `maxParallel`
 * (default 3); each batch must finish completely before the next one starts.
 * Otherwise the test cases run one after another.
 *
 * When all test cases are done, the summary is computed, the run is marked
 * `completed`, moved from `activeRuns` to `runHistory`, and `run:completed`
 * is emitted.
 *
 * @param run       Run to execute; mutated in place.
 * @param testCases Test cases to execute for this run.
 * @param options   Parallelism settings.
 */
private async executeRun(
  run: EvalRun,
  testCases: EvalTestCase[],
  options: { parallel?: boolean; maxParallel?: number }
): Promise<void> {
  run.status = 'running';
  this.emit('run:started', { runId: run.id });

  const startTime = Date.now();

  if (options.parallel) {
    // Only a batch size of at least 1 makes sense. Anything else (undefined,
    // 0, negative, NaN) falls back to the default, and fractions are rounded
    // down, so the batching loop always advances by a whole positive step.
    const requested = options.maxParallel;
    const limit =
      typeof requested === 'number' && requested >= 1 ? Math.floor(requested) : 3;
    const chunks = this.chunkArray(testCases, limit);

    for (const chunk of chunks) {
      await Promise.all(
        chunk.map(tc => this.executeTestCase(run, tc))
      );
    }
  } else {
    // Execute sequentially
    for (const testCase of testCases) {
      await this.executeTestCase(run, testCase);
    }
  }

  // Calculate summary
  run.summary.totalDuration = Date.now() - startTime;
  run.summary.passedTests = run.results.filter(r => r.success).length;
  run.summary.failedTests = run.results.filter(r => !r.success).length;

  // A test case without weighted criteria has maxScore 0; count it as 0%
  // rather than letting 0 / 0 turn the whole average into NaN.
  const scoreRatioSum = run.results.reduce(
    (sum, r) => sum + (r.maxScore > 0 ? r.score / r.maxScore : 0),
    0
  );
  run.summary.averageScore = run.results.length > 0
    ? scoreRatioSum / run.results.length * 100
    : 0;

  run.status = 'completed';
  run.completedAt = new Date();

  this.runHistory.push(run);
  this.activeRuns.delete(run.id);

  this.emit('run:completed', { runId: run.id, summary: run.summary });
  log.info(`Eval run ${run.id} completed:`, run.summary);
}
|
|
492
|
+
|
|
493
|
+
/**
 * Executes a single test case with a temporary agent and scores the response.
 *
 * Steps: create an agent, send the test prompt (bounded by `testCase.timeout`
 * milliseconds), run every validation criterion against the result, and mark
 * the test as passed when at least 60% of the maximum score was reached.
 *
 * This method does not throw for test failures: any error during execution is
 * recorded in `result.errors` and the result is still appended to
 * `run.results` and announced via `test:completed`.
 *
 * @param run      Run the result is appended to.
 * @param testCase Test case to execute.
 * @returns The result that was appended to `run.results`.
 */
private async executeTestCase(
  run: EvalRun,
  testCase: EvalTestCase
): Promise<EvalResult> {
  const startTime = Date.now();
  const result: EvalResult = {
    testCaseId: testCase.id,
    agentId: '',
    success: false,
    score: 0,
    maxScore: testCase.validationCriteria.reduce((sum, c) => sum + c.weight, 0),
    duration: 0,
    messages: [],
    criteriaResults: [],
    errors: [],
    warnings: [],
    metadata: {},
    createdAt: new Date(),
  };

  // Tracked outside the try block so the finally block can release them on
  // every exit path.
  let createdAgentId: string | undefined;
  let timeoutHandle: ReturnType<typeof setTimeout> | undefined;

  try {
    // Create temporary agent for evaluation
    const agent = await this.agentOrchestrator.createAgent({
      name: `Eval-${testCase.name}`,
      projectPath: `/tmp/eval-${testCase.id}`,
      providerId: run.agentConfig.providerId,
      model: run.agentConfig.model,
      skills: run.agentConfig.skills,
    });

    createdAgentId = agent.id;
    result.agentId = agent.id;

    // Execute prompt, racing the agent against the test timeout.
    const timeoutPromise = new Promise<never>((_, reject) => {
      timeoutHandle = setTimeout(() => reject(new Error('Test timeout')), testCase.timeout);
    });

    const messagePromise = this.agentOrchestrator.sendMessage(
      agent.id,
      testCase.prompt
    );

    const response = await Promise.race([messagePromise, timeoutPromise]) as AgentMessage;
    result.messages = [response];

    // Evaluate criteria. A validator that throws only fails its own criterion.
    for (const criteria of testCase.validationCriteria) {
      try {
        const passed = await criteria.validator(result);
        const criteriaResult = {
          criteriaId: criteria.id,
          passed: !!passed,
          score: passed ? criteria.weight : 0,
          details: passed ? 'Passed' : 'Failed',
        };
        result.criteriaResults.push(criteriaResult);
        result.score += criteriaResult.score;
      } catch (error) {
        result.criteriaResults.push({
          criteriaId: criteria.id,
          passed: false,
          score: 0,
          details: `Validation error: ${error}`,
        });
      }
    }

    // Check success
    result.success = result.score >= result.maxScore * 0.6; // 60% to pass
  } catch (error) {
    result.errors.push(`Test execution failed: ${error}`);
  } finally {
    result.duration = Date.now() - startTime;

    // Without this the timer stays pending for the full timeout after the
    // agent has already answered. clearTimeout(undefined) is a no-op.
    clearTimeout(timeoutHandle);

    // Cleanup must also happen when the test timed out or failed, otherwise
    // every failed test leaves its temporary agent behind.
    if (createdAgentId !== undefined) {
      try {
        await this.agentOrchestrator.deleteAgent(createdAgentId);
      } catch (cleanupError) {
        result.errors.push(`Agent cleanup failed: ${cleanupError}`);
      }
    }
  }

  run.results.push(result);
  this.emit('test:completed', { runId: run.id, testCaseId: testCase.id, result });

  return result;
}
|
|
577
|
+
|
|
578
|
+
/**
 * Splits an array into consecutive chunks of at most `size` elements.
 *
 * The input array is not modified; the last chunk may be shorter. A
 * fractional `size` is rounded down.
 *
 * @param array Items to split.
 * @param size  Maximum chunk length; must be at least 1.
 * @returns The chunks in original order (empty when `array` is empty).
 * @throws RangeError when `size` is below 1 or NaN, because such a step would
 *         never advance through the array.
 */
private chunkArray<T>(array: T[], size: number): T[][] {
  // `!(size >= 1)` also rejects NaN, which a plain `size < 1` would let through.
  if (!(size >= 1)) {
    throw new RangeError(`Chunk size must be at least 1, received ${size}`);
  }

  // A fractional step would produce overlapping slices.
  const step = Math.floor(size);
  const chunks: T[][] = [];
  for (let i = 0; i < array.length; i += step) {
    chunks.push(array.slice(i, i + step));
  }
  return chunks;
}
|
|
585
|
+
|
|
586
|
+
/**
 * Lists every dataset currently held in the registry.
 *
 * @returns A new array; changing it does not affect the registry itself.
 */
getDatasets(): EvalDataset[] {
  const registered = this.datasets.values();
  return Array.from(registered);
}
|
|
589
|
+
|
|
590
|
+
/**
 * Looks up one dataset by id.
 *
 * @param datasetId Id the dataset was registered under.
 * @returns The dataset, or `undefined` when the id is unknown.
 */
getDataset(datasetId: string): EvalDataset | undefined {
  const match = this.datasets.get(datasetId);
  return match;
}
|
|
593
|
+
|
|
594
|
+
/**
 * Lists the runs that are currently tracked as active.
 *
 * @returns A new array holding the live run objects.
 */
getActiveRuns(): EvalRun[] {
  const tracked = this.activeRuns.values();
  return Array.from(tracked);
}
|
|
597
|
+
|
|
598
|
+
/**
 * Lists the runs that have been archived in the history, oldest first.
 *
 * @returns A shallow copy, so callers cannot add, remove or reorder history
 *          entries by accident. The run objects themselves are shared.
 */
getRunHistory(): EvalRun[] {
  return [...this.runHistory];
}
|
|
601
|
+
|
|
602
|
+
/**
 * Persists a dataset as `<id>.json` in `this.datasetsPath` and registers it.
 *
 * The dataset is written to disk first and only registered in memory once
 * the write succeeded, so a failed save does not leave an unsaved dataset in
 * the registry.
 *
 * NOTE(review): JSON.stringify omits function-valued properties, so validator
 * functions on the criteria are not part of the saved file.
 *
 * @param dataset Dataset to store; an existing dataset with the same id is replaced.
 * @throws Error when the id is empty or would resolve to a path outside the
 *         datasets directory; file system errors are propagated.
 */
async saveDataset(dataset: EvalDataset): Promise<void> {
  // The id becomes part of a file name. Reject ids containing path separators
  // (for example "../x") so the file cannot be written outside datasetsPath.
  const fileName = `${dataset.id}.json`;
  if (!dataset.id || path.basename(fileName) !== fileName) {
    throw new Error(`Invalid dataset id: ${dataset.id}`);
  }

  // Save to disk
  const filePath = path.join(this.datasetsPath, fileName);
  await fs.mkdir(this.datasetsPath, { recursive: true });
  await fs.writeFile(filePath, JSON.stringify(dataset, null, 2));

  this.datasets.set(dataset.id, dataset);

  log.info(`Saved dataset: ${dataset.name}`);
}
|
|
612
|
+
|
|
613
|
+
/**
 * Builds a Markdown report for a run.
 *
 * The run is looked up among the active runs first, then in the history. The
 * report contains a summary section followed by one section per test result
 * with its status, score, duration, errors and per-criterion outcome. Names
 * fall back to ids when the dataset is no longer registered.
 *
 * @param runId Id of an active or archived run.
 * @returns The report as a Markdown string.
 * @throws Error when no run with that id exists.
 */
generateReport(runId: string): string {
  const run = this.activeRuns.get(runId) || this.runHistory.find(r => r.id === runId);
  if (!run) {
    throw new Error(`Run ${runId} not found`);
  }

  const dataset = this.datasets.get(run.datasetId);

  // Formats part/whole as a percentage with one decimal. A whole of 0 (run
  // without tests, test case without weighted criteria) is shown as 0.0
  // instead of "NaN".
  const toPercent = (part: number, whole: number): string =>
    (whole > 0 ? (part / whole) * 100 : 0).toFixed(1);

  let report = `# Evaluation Report\n\n`;
  report += `**Dataset:** ${dataset?.name || run.datasetId}\n`;
  report += `**Agent:** ${run.agentConfig.providerId} - ${run.agentConfig.model}\n`;
  report += `**Date:** ${run.startedAt.toISOString()}\n\n`;

  report += `## Summary\n\n`;
  report += `- **Total Tests:** ${run.summary.totalTests}\n`;
  report += `- **Passed:** ${run.summary.passedTests} (${toPercent(run.summary.passedTests, run.summary.totalTests)}%)\n`;
  report += `- **Failed:** ${run.summary.failedTests}\n`;
  report += `- **Average Score:** ${run.summary.averageScore.toFixed(1)}%\n`;
  report += `- **Total Duration:** ${(run.summary.totalDuration / 1000).toFixed(2)}s\n\n`;

  report += `## Detailed Results\n\n`;

  for (const result of run.results) {
    const testCase = dataset?.testCases.find(tc => tc.id === result.testCaseId);
    report += `### ${testCase?.name || result.testCaseId}\n\n`;
    report += `- **Status:** ${result.success ? '✅ PASSED' : '❌ FAILED'}\n`;
    report += `- **Score:** ${toPercent(result.score, result.maxScore)}%\n`;
    report += `- **Duration:** ${(result.duration / 1000).toFixed(2)}s\n\n`;

    if (result.errors.length > 0) {
      report += `**Errors:**\n`;
      for (const message of result.errors) {
        report += `- ${message}\n`;
      }
      report += '\n';
    }

    report += `**Criteria Results:**\n`;
    for (const criteriaResult of result.criteriaResults) {
      const criteria = testCase?.validationCriteria.find(c => c.id === criteriaResult.criteriaId);
      report += `- ${criteriaResult.passed ? '✅' : '❌'} ${criteria?.name || criteriaResult.criteriaId} (${criteriaResult.score.toFixed(2)})\n`;
    }
    report += '\n---\n\n';
  }

  return report;
}
|
|
658
|
+
|
|
659
|
+
/**
 * Releases the framework's in-memory state: detaches every event listener
 * and forgets all runs tracked as active. The run history and the dataset
 * registry are left untouched.
 */
cleanup(): void {
  this.removeAllListeners();
  this.activeRuns.clear();
}
|
|
663
|
+
}
|
|
664
|
+
|
|
665
|
+
export default AgentEvalFramework;
|