webmcp-cli 1.0.0 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent/features/agent-simulator.d.ts +67 -0
- package/dist/agent/features/agent-simulator.js +368 -0
- package/dist/agent/features/agent-simulator.js.map +1 -0
- package/dist/agent/features/index.d.ts +8 -0
- package/dist/agent/features/index.js +9 -0
- package/dist/agent/features/index.js.map +1 -0
- package/dist/agent/features/simulation-judge.d.ts +78 -0
- package/dist/agent/features/simulation-judge.js +276 -0
- package/dist/agent/features/simulation-judge.js.map +1 -0
- package/dist/agent/features/test-case-generator.d.ts +35 -0
- package/dist/agent/features/test-case-generator.js +257 -0
- package/dist/agent/features/test-case-generator.js.map +1 -0
- package/dist/agent/index.d.ts +7 -0
- package/dist/agent/index.js +10 -0
- package/dist/agent/index.js.map +1 -0
- package/dist/agent/llm-client.d.ts +76 -0
- package/dist/agent/llm-client.js +198 -0
- package/dist/agent/llm-client.js.map +1 -0
- package/dist/audit/run-single-page-audit.d.ts +41 -0
- package/dist/audit/run-single-page-audit.js +103 -0
- package/dist/audit/run-single-page-audit.js.map +1 -0
- package/dist/bin/webmcp.d.ts +5 -0
- package/dist/bin/webmcp.js +14 -0
- package/dist/bin/webmcp.js.map +1 -0
- package/dist/browser/audit-runner.d.ts +30 -0
- package/dist/browser/audit-runner.js +77 -0
- package/dist/browser/audit-runner.js.map +1 -0
- package/dist/browser/index.d.ts +6 -0
- package/dist/browser/index.js +7 -0
- package/dist/browser/index.js.map +1 -0
- package/dist/browser/interceptor.d.ts +68 -0
- package/dist/browser/interceptor.js +257 -0
- package/dist/browser/interceptor.js.map +1 -0
- package/dist/browser/playwright.d.ts +98 -0
- package/dist/browser/playwright.js +158 -0
- package/dist/browser/playwright.js.map +1 -0
- package/dist/cli/commands/audit.d.ts +12 -0
- package/dist/cli/commands/audit.js +349 -0
- package/dist/cli/commands/audit.js.map +1 -0
- package/dist/cli/commands/interactive.d.ts +10 -0
- package/dist/cli/commands/interactive.js +34 -0
- package/dist/cli/commands/interactive.js.map +1 -0
- package/dist/cli/index.d.ts +17 -0
- package/dist/cli/index.js +84 -0
- package/dist/cli/index.js.map +1 -0
- package/dist/cli/options/parse-audit-options.d.ts +12 -0
- package/dist/cli/options/parse-audit-options.js +64 -0
- package/dist/cli/options/parse-audit-options.js.map +1 -0
- package/dist/core/constants.d.ts +102 -0
- package/dist/core/constants.js +214 -0
- package/dist/core/constants.js.map +1 -0
- package/dist/core/types/audit.d.ts +260 -0
- package/dist/core/types/audit.js +5 -0
- package/dist/core/types/audit.js.map +1 -0
- package/dist/core/types/index.d.ts +6 -0
- package/dist/core/types/index.js +7 -0
- package/dist/core/types/index.js.map +1 -0
- package/dist/core/types/rule.d.ts +190 -0
- package/dist/core/types/rule.js +26 -0
- package/dist/core/types/rule.js.map +1 -0
- package/dist/core/types/tool.d.ts +312 -0
- package/dist/core/types/tool.js +6 -0
- package/dist/core/types/tool.js.map +1 -0
- package/dist/detection/declarative.d.ts +27 -0
- package/dist/detection/declarative.js +343 -0
- package/dist/detection/declarative.js.map +1 -0
- package/dist/detection/imperative.d.ts +38 -0
- package/dist/detection/imperative.js +99 -0
- package/dist/detection/imperative.js.map +1 -0
- package/dist/detection/index.d.ts +5 -0
- package/dist/detection/index.js +6 -0
- package/dist/detection/index.js.map +1 -0
- package/dist/index.d.ts +12 -0
- package/dist/index.js +19 -0
- package/dist/index.js.map +1 -0
- package/dist/llm/advice-service.d.ts +38 -0
- package/dist/llm/advice-service.js +243 -0
- package/dist/llm/advice-service.js.map +1 -0
- package/dist/llm/evaluator.d.ts +89 -0
- package/dist/llm/evaluator.js +274 -0
- package/dist/llm/evaluator.js.map +1 -0
- package/dist/llm/index.d.ts +11 -0
- package/dist/llm/index.js +15 -0
- package/dist/llm/index.js.map +1 -0
- package/dist/llm/json-response.d.ts +12 -0
- package/dist/llm/json-response.js +67 -0
- package/dist/llm/json-response.js.map +1 -0
- package/dist/llm/providers/mock.d.ts +29 -0
- package/dist/llm/providers/mock.js +324 -0
- package/dist/llm/providers/mock.js.map +1 -0
- package/dist/llm/providers/openrouter.d.ts +53 -0
- package/dist/llm/providers/openrouter.js +321 -0
- package/dist/llm/providers/openrouter.js.map +1 -0
- package/dist/llm/request-cache.d.ts +28 -0
- package/dist/llm/request-cache.js +99 -0
- package/dist/llm/request-cache.js.map +1 -0
- package/dist/llm/types.d.ts +233 -0
- package/dist/llm/types.js +7 -0
- package/dist/llm/types.js.map +1 -0
- package/dist/rules/best-practices/BP-001.d.ts +11 -0
- package/dist/rules/best-practices/BP-001.js +56 -0
- package/dist/rules/best-practices/BP-001.js.map +1 -0
- package/dist/rules/best-practices/BP-002.d.ts +11 -0
- package/dist/rules/best-practices/BP-002.js +63 -0
- package/dist/rules/best-practices/BP-002.js.map +1 -0
- package/dist/rules/best-practices/BP-003.d.ts +11 -0
- package/dist/rules/best-practices/BP-003.js +68 -0
- package/dist/rules/best-practices/BP-003.js.map +1 -0
- package/dist/rules/coverage/COV-001.d.ts +8 -0
- package/dist/rules/coverage/COV-001.js +51 -0
- package/dist/rules/coverage/COV-001.js.map +1 -0
- package/dist/rules/description/DESC-003.d.ts +13 -0
- package/dist/rules/description/DESC-003.js +96 -0
- package/dist/rules/description/DESC-003.js.map +1 -0
- package/dist/rules/description/DESC-004.d.ts +8 -0
- package/dist/rules/description/DESC-004.js +61 -0
- package/dist/rules/description/DESC-004.js.map +1 -0
- package/dist/rules/description/DESC-005.d.ts +12 -0
- package/dist/rules/description/DESC-005.js +70 -0
- package/dist/rules/description/DESC-005.js.map +1 -0
- package/dist/rules/description/index.d.ts +4 -0
- package/dist/rules/description/index.js +5 -0
- package/dist/rules/description/index.js.map +1 -0
- package/dist/rules/implementation/IMP-001.d.ts +10 -0
- package/dist/rules/implementation/IMP-001.js +36 -0
- package/dist/rules/implementation/IMP-001.js.map +1 -0
- package/dist/rules/implementation/IMP-003.d.ts +9 -0
- package/dist/rules/implementation/IMP-003.js +45 -0
- package/dist/rules/implementation/IMP-003.js.map +1 -0
- package/dist/rules/implementation/IMP-004.d.ts +9 -0
- package/dist/rules/implementation/IMP-004.js +48 -0
- package/dist/rules/implementation/IMP-004.js.map +1 -0
- package/dist/rules/implementation/IMP-005.d.ts +9 -0
- package/dist/rules/implementation/IMP-005.js +54 -0
- package/dist/rules/implementation/IMP-005.js.map +1 -0
- package/dist/rules/implementation/IMP-007.d.ts +8 -0
- package/dist/rules/implementation/IMP-007.js +79 -0
- package/dist/rules/implementation/IMP-007.js.map +1 -0
- package/dist/rules/implementation/IMP-013.d.ts +9 -0
- package/dist/rules/implementation/IMP-013.js +55 -0
- package/dist/rules/implementation/IMP-013.js.map +1 -0
- package/dist/rules/implementation/index.d.ts +9 -0
- package/dist/rules/implementation/index.js +10 -0
- package/dist/rules/implementation/index.js.map +1 -0
- package/dist/rules/index.d.ts +51 -0
- package/dist/rules/index.js +100 -0
- package/dist/rules/index.js.map +1 -0
- package/dist/rules/llm/LLM-001.d.ts +14 -0
- package/dist/rules/llm/LLM-001.js +78 -0
- package/dist/rules/llm/LLM-001.js.map +1 -0
- package/dist/rules/llm/LLM-002.d.ts +14 -0
- package/dist/rules/llm/LLM-002.js +77 -0
- package/dist/rules/llm/LLM-002.js.map +1 -0
- package/dist/rules/llm/LLM-003.d.ts +16 -0
- package/dist/rules/llm/LLM-003.js +82 -0
- package/dist/rules/llm/LLM-003.js.map +1 -0
- package/dist/rules/llm/LLM-004.d.ts +14 -0
- package/dist/rules/llm/LLM-004.js +87 -0
- package/dist/rules/llm/LLM-004.js.map +1 -0
- package/dist/rules/llm/LLM-005.d.ts +16 -0
- package/dist/rules/llm/LLM-005.js +105 -0
- package/dist/rules/llm/LLM-005.js.map +1 -0
- package/dist/rules/llm/index.d.ts +10 -0
- package/dist/rules/llm/index.js +11 -0
- package/dist/rules/llm/index.js.map +1 -0
- package/dist/rules/runner.d.ts +54 -0
- package/dist/rules/runner.js +138 -0
- package/dist/rules/runner.js.map +1 -0
- package/dist/rules/schema/SCHEMA-001.d.ts +9 -0
- package/dist/rules/schema/SCHEMA-001.js +57 -0
- package/dist/rules/schema/SCHEMA-001.js.map +1 -0
- package/dist/rules/schema/SCHEMA-002.d.ts +9 -0
- package/dist/rules/schema/SCHEMA-002.js +59 -0
- package/dist/rules/schema/SCHEMA-002.js.map +1 -0
- package/dist/rules/schema/SCHEMA-003.d.ts +10 -0
- package/dist/rules/schema/SCHEMA-003.js +66 -0
- package/dist/rules/schema/SCHEMA-003.js.map +1 -0
- package/dist/rules/schema/SCHEMA-011.d.ts +10 -0
- package/dist/rules/schema/SCHEMA-011.js +62 -0
- package/dist/rules/schema/SCHEMA-011.js.map +1 -0
- package/dist/rules/security/SEC-001.d.ts +12 -0
- package/dist/rules/security/SEC-001.js +66 -0
- package/dist/rules/security/SEC-001.js.map +1 -0
- package/dist/rules/utils/keywords.d.ts +35 -0
- package/dist/rules/utils/keywords.js +100 -0
- package/dist/rules/utils/keywords.js.map +1 -0
- package/dist/scoring/calculator.d.ts +27 -0
- package/dist/scoring/calculator.js +194 -0
- package/dist/scoring/calculator.js.map +1 -0
- package/dist/scoring/grades.d.ts +34 -0
- package/dist/scoring/grades.js +167 -0
- package/dist/scoring/grades.js.map +1 -0
- package/dist/scoring/index.d.ts +5 -0
- package/dist/scoring/index.js +6 -0
- package/dist/scoring/index.js.map +1 -0
- package/dist/ui/banner.d.ts +21 -0
- package/dist/ui/banner.js +60 -0
- package/dist/ui/banner.js.map +1 -0
- package/dist/ui/design-tokens.d.ts +23 -0
- package/dist/ui/design-tokens.js +58 -0
- package/dist/ui/design-tokens.js.map +1 -0
- package/dist/ui/findings.d.ts +23 -0
- package/dist/ui/findings.js +190 -0
- package/dist/ui/findings.js.map +1 -0
- package/dist/ui/index.d.ts +9 -0
- package/dist/ui/index.js +10 -0
- package/dist/ui/index.js.map +1 -0
- package/dist/ui/ink/App.d.ts +14 -0
- package/dist/ui/ink/App.js +113 -0
- package/dist/ui/ink/App.js.map +1 -0
- package/dist/ui/ink/FullScreenLayout.d.ts +16 -0
- package/dist/ui/ink/FullScreenLayout.js +29 -0
- package/dist/ui/ink/FullScreenLayout.js.map +1 -0
- package/dist/ui/ink/InteractiveApp.d.ts +28 -0
- package/dist/ui/ink/InteractiveApp.js +229 -0
- package/dist/ui/ink/InteractiveApp.js.map +1 -0
- package/dist/ui/ink/RealAuditApp.d.ts +19 -0
- package/dist/ui/ink/RealAuditApp.js +170 -0
- package/dist/ui/ink/RealAuditApp.js.map +1 -0
- package/dist/ui/ink/components/AnimatedProgressBar.d.ts +20 -0
- package/dist/ui/ink/components/AnimatedProgressBar.js +46 -0
- package/dist/ui/ink/components/AnimatedProgressBar.js.map +1 -0
- package/dist/ui/ink/components/AsciiLogo.d.ts +12 -0
- package/dist/ui/ink/components/AsciiLogo.js +35 -0
- package/dist/ui/ink/components/AsciiLogo.js.map +1 -0
- package/dist/ui/ink/components/CategoryBars.d.ts +18 -0
- package/dist/ui/ink/components/CategoryBars.js +18 -0
- package/dist/ui/ink/components/CategoryBars.js.map +1 -0
- package/dist/ui/ink/components/FindingsTable.d.ts +18 -0
- package/dist/ui/ink/components/FindingsTable.js +19 -0
- package/dist/ui/ink/components/FindingsTable.js.map +1 -0
- package/dist/ui/ink/components/Footer.d.ts +15 -0
- package/dist/ui/ink/components/Footer.js +20 -0
- package/dist/ui/ink/components/Footer.js.map +1 -0
- package/dist/ui/ink/components/Header.d.ts +11 -0
- package/dist/ui/ink/components/Header.js +12 -0
- package/dist/ui/ink/components/Header.js.map +1 -0
- package/dist/ui/ink/components/LinkList.d.ts +17 -0
- package/dist/ui/ink/components/LinkList.js +44 -0
- package/dist/ui/ink/components/LinkList.js.map +1 -0
- package/dist/ui/ink/components/Navigation.d.ts +26 -0
- package/dist/ui/ink/components/Navigation.js +62 -0
- package/dist/ui/ink/components/Navigation.js.map +1 -0
- package/dist/ui/ink/components/ProgressBar.d.ts +15 -0
- package/dist/ui/ink/components/ProgressBar.js +14 -0
- package/dist/ui/ink/components/ProgressBar.js.map +1 -0
- package/dist/ui/ink/components/ScoreCard.d.ts +30 -0
- package/dist/ui/ink/components/ScoreCard.js +26 -0
- package/dist/ui/ink/components/ScoreCard.js.map +1 -0
- package/dist/ui/ink/components/SimulationResults.d.ts +33 -0
- package/dist/ui/ink/components/SimulationResults.js +23 -0
- package/dist/ui/ink/components/SimulationResults.js.map +1 -0
- package/dist/ui/ink/components/Spinner.d.ts +11 -0
- package/dist/ui/ink/components/Spinner.js +12 -0
- package/dist/ui/ink/components/Spinner.js.map +1 -0
- package/dist/ui/ink/components/ToolCard.d.ts +23 -0
- package/dist/ui/ink/components/ToolCard.js +20 -0
- package/dist/ui/ink/components/ToolCard.js.map +1 -0
- package/dist/ui/ink/components/shared/Badge.d.ts +21 -0
- package/dist/ui/ink/components/shared/Badge.js +39 -0
- package/dist/ui/ink/components/shared/Badge.js.map +1 -0
- package/dist/ui/ink/components/shared/Card.d.ts +18 -0
- package/dist/ui/ink/components/shared/Card.js +11 -0
- package/dist/ui/ink/components/shared/Card.js.map +1 -0
- package/dist/ui/ink/components/shared/HelpOverlay.d.ts +10 -0
- package/dist/ui/ink/components/shared/HelpOverlay.js +28 -0
- package/dist/ui/ink/components/shared/HelpOverlay.js.map +1 -0
- package/dist/ui/ink/components/shared/LoadingWithTimeout.d.ts +11 -0
- package/dist/ui/ink/components/shared/LoadingWithTimeout.js +21 -0
- package/dist/ui/ink/components/shared/LoadingWithTimeout.js.map +1 -0
- package/dist/ui/ink/components/shared/Menu.d.ts +23 -0
- package/dist/ui/ink/components/shared/Menu.js +43 -0
- package/dist/ui/ink/components/shared/Menu.js.map +1 -0
- package/dist/ui/ink/components/shared/Table.d.ts +23 -0
- package/dist/ui/ink/components/shared/Table.js +40 -0
- package/dist/ui/ink/components/shared/Table.js.map +1 -0
- package/dist/ui/ink/components/views/CrawlingView.d.ts +12 -0
- package/dist/ui/ink/components/views/CrawlingView.js +34 -0
- package/dist/ui/ink/components/views/CrawlingView.js.map +1 -0
- package/dist/ui/ink/components/views/DashboardView.d.ts +21 -0
- package/dist/ui/ink/components/views/DashboardView.js +51 -0
- package/dist/ui/ink/components/views/DashboardView.js.map +1 -0
- package/dist/ui/ink/components/views/FindingDetailView.d.ts +16 -0
- package/dist/ui/ink/components/views/FindingDetailView.js +34 -0
- package/dist/ui/ink/components/views/FindingDetailView.js.map +1 -0
- package/dist/ui/ink/components/views/FindingsView.d.ts +16 -0
- package/dist/ui/ink/components/views/FindingsView.js +79 -0
- package/dist/ui/ink/components/views/FindingsView.js.map +1 -0
- package/dist/ui/ink/components/views/OnboardingView.d.ts +12 -0
- package/dist/ui/ink/components/views/OnboardingView.js +40 -0
- package/dist/ui/ink/components/views/OnboardingView.js.map +1 -0
- package/dist/ui/ink/components/views/SimulationView.d.ts +17 -0
- package/dist/ui/ink/components/views/SimulationView.js +53 -0
- package/dist/ui/ink/components/views/SimulationView.js.map +1 -0
- package/dist/ui/ink/components/views/TestCaseDetailView.d.ts +11 -0
- package/dist/ui/ink/components/views/TestCaseDetailView.js +53 -0
- package/dist/ui/ink/components/views/TestCaseDetailView.js.map +1 -0
- package/dist/ui/ink/components/views/ToolDetailView.d.ts +15 -0
- package/dist/ui/ink/components/views/ToolDetailView.js +25 -0
- package/dist/ui/ink/components/views/ToolDetailView.js.map +1 -0
- package/dist/ui/ink/components/views/ToolsView.d.ts +15 -0
- package/dist/ui/ink/components/views/ToolsView.js +43 -0
- package/dist/ui/ink/components/views/ToolsView.js.map +1 -0
- package/dist/ui/ink/demo.d.ts +6 -0
- package/dist/ui/ink/demo.js +254 -0
- package/dist/ui/ink/demo.js.map +1 -0
- package/dist/ui/ink/hooks/useAnimation.d.ts +29 -0
- package/dist/ui/ink/hooks/useAnimation.js +89 -0
- package/dist/ui/ink/hooks/useAnimation.js.map +1 -0
- package/dist/ui/ink/hooks/useAudit.d.ts +69 -0
- package/dist/ui/ink/hooks/useAudit.js +99 -0
- package/dist/ui/ink/hooks/useAudit.js.map +1 -0
- package/dist/ui/ink/hooks/useCrawlAnimation.d.ts +19 -0
- package/dist/ui/ink/hooks/useCrawlAnimation.js +204 -0
- package/dist/ui/ink/hooks/useCrawlAnimation.js.map +1 -0
- package/dist/ui/ink/hooks/useKeyboardNav.d.ts +23 -0
- package/dist/ui/ink/hooks/useKeyboardNav.js +81 -0
- package/dist/ui/ink/hooks/useKeyboardNav.js.map +1 -0
- package/dist/ui/ink/hooks/useNavigation.d.ts +16 -0
- package/dist/ui/ink/hooks/useNavigation.js +42 -0
- package/dist/ui/ink/hooks/useNavigation.js.map +1 -0
- package/dist/ui/ink/hooks/useTerminalSize.d.ts +10 -0
- package/dist/ui/ink/hooks/useTerminalSize.js +29 -0
- package/dist/ui/ink/hooks/useTerminalSize.js.map +1 -0
- package/dist/ui/ink/index.d.ts +43 -0
- package/dist/ui/ink/index.js +50 -0
- package/dist/ui/ink/index.js.map +1 -0
- package/dist/ui/ink/render.d.ts +24 -0
- package/dist/ui/ink/render.js +14 -0
- package/dist/ui/ink/render.js.map +1 -0
- package/dist/ui/ink/theme.d.ts +37 -0
- package/dist/ui/ink/theme.js +38 -0
- package/dist/ui/ink/theme.js.map +1 -0
- package/dist/ui/ink/types.d.ts +77 -0
- package/dist/ui/ink/types.js +5 -0
- package/dist/ui/ink/types.js.map +1 -0
- package/dist/ui/score-display.d.ts +16 -0
- package/dist/ui/score-display.js +201 -0
- package/dist/ui/score-display.js.map +1 -0
- package/dist/ui/spinner.d.ts +45 -0
- package/dist/ui/spinner.js +112 -0
- package/dist/ui/spinner.js.map +1 -0
- package/dist/ui/utils.d.ts +13 -0
- package/dist/ui/utils.js +25 -0
- package/dist/ui/utils.js.map +1 -0
- package/package.json +61 -9
- package/index.js +0 -105
|
@@ -0,0 +1,276 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Simulation Judge
|
|
3
|
+
*
|
|
4
|
+
* Uses LLM to evaluate simulation results and provide structured scoring
|
|
5
|
+
* with actionable improvement suggestions.
|
|
6
|
+
*/
|
|
7
|
+
import chalk from 'chalk';
|
|
8
|
+
import { LLMClient, createLLMClient } from '../llm-client.js';
|
|
9
|
+
import { parseJsonObject } from '../../llm/json-response.js';
|
|
10
|
+
// ═══════════════════════════════════════════════════════
|
|
11
|
+
// JUDGE PROMPTS
|
|
12
|
+
// ═══════════════════════════════════════════════════════
|
|
13
|
+
const JUDGE_SYSTEM_PROMPT = `You are an expert judge evaluating an AI agent's performance when interacting with website tools via the WebMCP protocol.
|
|
14
|
+
|
|
15
|
+
You evaluate with strict criteria but fair judgment. Your evaluation should be:
|
|
16
|
+
1. SPECIFIC — Point to exact moments in the conversation
|
|
17
|
+
2. ACTIONABLE — Every criticism comes with a fix
|
|
18
|
+
3. BALANCED — Acknowledge what worked well, not just failures
|
|
19
|
+
4. QUANTIFIED — Use numbers, percentages, and scores`;
|
|
20
|
+
/**
 * Build the user-role prompt that asks the judge model to score one simulation.
 *
 * The prompt lists the available tools, the test case under evaluation, a
 * transcript of every round, the final outcome, the six scoring criteria and
 * the JSON shape the reply must follow.
 *
 * @param {object} simulation - Simulation result. Reads `rounds` (each with
 *   `userMessage`, `agentResponse.text` and `agentResponse.toolCalls` whose
 *   items carry `toolName` / `toolInput`), `testCase` (`prompt`,
 *   `expectedBehavior.toolName`, `expectedBehavior.expectedParams`,
 *   `category`, `difficulty`), `finalSuccess`, `toolInvocations`,
 *   `totalRounds` and `evaluation.failureReason`.
 * @param {Array<object>} tools - Tool definitions; `name` and `description`
 *   of each are listed in the prompt.
 * @returns {string} The complete prompt text.
 */
function buildJudgmentPrompt(simulation, tools) {
    // One transcript entry per round: the user message, the agent's text (or the
    // placeholder '(tool call)' when the text is empty), and, when the round made
    // tool calls, a "→ called:" suffix listing each call with its JSON input.
    // Entries are separated by a blank line.
    const conversationLog = simulation.rounds.map(r => `[USER] ${r.userMessage}\n[AGENT] ${r.agentResponse.text || '(tool call)'}${r.agentResponse.toolCalls.length > 0
        ? ` → called: ${r.agentResponse.toolCalls.map(tc => `${tc.toolName}(${JSON.stringify(tc.toolInput)})`).join(', ')}`
        : ''}`).join('\n\n');
    // The "Failure reason" line is only emitted when evaluation.failureReason is
    // truthy; otherwise that line of the prompt is left empty.
    return `TOOLS AVAILABLE:
${tools.map(t => ` ${t.name}: ${t.description}`).join('\n')}

TEST CASE:
Prompt: "${simulation.testCase.prompt}"
Expected tool: ${simulation.testCase.expectedBehavior.toolName || 'none'}
Expected params: ${JSON.stringify(simulation.testCase.expectedBehavior.expectedParams)}
Category: ${simulation.testCase.category}
Difficulty: ${simulation.testCase.difficulty}/5

CONVERSATION TRANSCRIPT:
${conversationLog}

FINAL OUTCOME:
Success: ${simulation.finalSuccess}
Tool invocations: ${simulation.toolInvocations}
Rounds used: ${simulation.totalRounds}
${simulation.evaluation.failureReason ? ` Failure reason: ${simulation.evaluation.failureReason}` : ''}

Judge this interaction on the following criteria:

1. TOOL SELECTION ACCURACY (0-100)
Did the agent select the correct tool(s) for the user's request?
Did it avoid selecting wrong or unnecessary tools?

2. PARAMETER ACCURACY (0-100)
Did the agent correctly extract and format parameters from the user's input?
Did it hallucinate any values not mentioned by the user?

3. CONVERSATION EFFICIENCY (0-100)
Did the agent complete the task in the minimum number of exchanges?
Were there unnecessary clarifying questions or redundant tool calls?

4. ERROR HANDLING (0-100)
When errors occurred, did the agent recover gracefully?
Did it provide helpful context to the user about what went wrong?

5. USER EXPERIENCE (0-100)
Was the agent's communication clear and natural?
Did it keep the user informed about what it was doing?

6. TOOL DEFINITION QUALITY (0-100)
Based on the agent's behavior, how well are the tools defined?
Did the tool descriptions/schemas cause any agent confusion?
This score reflects the TOOL QUALITY, not the agent's ability.

OUTPUT FORMAT (strict JSON):
{
"scores": {
"toolSelection": { "score": 85, "reasoning": "..." },
"parameterAccuracy": { "score": 72, "reasoning": "..." },
"conversationEfficiency": { "score": 60, "reasoning": "..." },
"errorHandling": { "score": 90, "reasoning": "..." },
"userExperience": { "score": 78, "reasoning": "..." },
"toolDefinitionQuality": { "score": 65, "reasoning": "..." }
},
"overallScore": 75,
"verdict": "PASS",
"keyMoments": [
{
"moment": "Round 1: Agent correctly identified search intent",
"impact": "positive",
"severity": "minor"
}
],
"toolImprovements": [
{
"tool": "search-flights",
"issue": "Description doesn't mention airport code format",
"suggestedFix": "Add 'Use 3-letter IATA airport codes' to description",
"estimatedImpact": "Would improve parameter accuracy"
}
],
"summary": "Overall assessment in 2-3 sentences"
}`;
}
|
|
100
|
+
// ═══════════════════════════════════════════════════════
|
|
101
|
+
// JUDGE FUNCTIONS
|
|
102
|
+
// ═══════════════════════════════════════════════════════
|
|
103
|
+
/**
 * Score one simulation with the LLM judge.
 *
 * When the client reports itself unavailable, or when the request or the
 * parsing of its reply fails, a warning is logged (failure case only) and the
 * heuristic judgment from createFallbackJudgment() is returned instead.
 *
 * @param {object} simulation - Simulation result to evaluate.
 * @param {Array<object>} tools - Tool definitions included in the judge prompt.
 * @param {object} [llm] - LLM client; created with createLLMClient() when omitted.
 * @returns {Promise<object>} The parsed LLM judgment or the fallback judgment.
 */
export async function judgeSimulation(simulation, tools, llm) {
    const judge = llm || createLLMClient();
    if (judge.isAvailable()) {
        try {
            const messages = [
                { role: 'system', content: JUDGE_SYSTEM_PROMPT },
                { role: 'user', content: buildJudgmentPrompt(simulation, tools) },
            ];
            const reply = await judge.chat(messages);
            if (reply.text) {
                return parseJudgmentResponse(reply.text);
            }
            // An empty reply is treated like any other failure and handled below.
            throw new Error('No response from LLM');
        }
        catch (failure) {
            const detail = failure instanceof Error ? failure.message : failure;
            console.warn(chalk.yellow(`Judgment failed: ${detail}`));
        }
    }
    // Reached when the client is unavailable or the LLM path failed.
    return createFallbackJudgment(simulation);
}
|
|
126
|
+
/**
 * Judge multiple simulations and provide aggregate analysis.
 *
 * Only the first 10 simulations are judged, one after another; every aggregate
 * figure (pass rate, average score, failure patterns) is computed over that
 * judged sample. An empty input yields zeroed aggregates instead of NaN and
 * does not create an LLM client.
 *
 * @param {Array<object>} simulations - Simulation results to evaluate.
 * @param {Array<object>} tools - Tool definitions forwarded to judgeSimulation().
 * @param {object} [llm] - LLM client; created with createLLMClient() when
 *   omitted and there is at least one simulation to judge.
 * @returns {Promise<object>} Report with `totalSimulations` (input length),
 *   `passRate` (PASS counts 1, PARTIAL counts 0.5, as a 0-1 fraction),
 *   `avgScore`, `failurePatterns` (top 5 issues by frequency),
 *   `topImprovements` (first 5 distinct tool/issue pairs) and `summary`.
 */
export async function judgeSimulations(simulations, tools, llm) {
    const MAX_JUDGED = 10;
    const MAX_REPORTED = 5;
    // Get individual judgments (sample if too many)
    const sampled = simulations.slice(0, MAX_JUDGED);
    const judgments = [];
    if (sampled.length > 0) {
        const client = llm || createLLMClient();
        for (const sim of sampled) {
            judgments.push(await judgeSimulation(sim, tools, client));
        }
    }
    // Aggregate results; guard every division so an empty sample gives 0, not NaN.
    const judgedCount = judgments.length;
    const passCount = judgments.filter(j => j.verdict === 'PASS').length;
    const partialCount = judgments.filter(j => j.verdict === 'PARTIAL').length;
    const scoreTotal = judgments.reduce((sum, j) => sum + j.overallScore, 0);
    const avgScore = judgedCount > 0 ? scoreTotal / judgedCount : 0;
    const passRate = judgedCount > 0 ? (passCount + partialCount * 0.5) / judgedCount : 0;
    const strictPassPercent = judgedCount > 0 ? Math.round((passCount / judgedCount) * 100) : 0;
    // Single pass over all improvements: count issues (remembering the first
    // suggested fix seen for each) and collect distinct tool/issue pairs.
    const failureMap = new Map();
    const improvementMap = new Map();
    for (const judgment of judgments) {
        const improvements = Array.isArray(judgment.toolImprovements) ? judgment.toolImprovements : [];
        for (const improvement of improvements) {
            let entry = failureMap.get(improvement.issue);
            if (!entry) {
                entry = { count: 0, tools: new Set(), suggestedFix: improvement.suggestedFix };
                failureMap.set(improvement.issue, entry);
            }
            entry.count++;
            entry.tools.add(improvement.tool);
            const pairKey = `${improvement.tool}:${improvement.issue}`;
            if (!improvementMap.has(pairKey)) {
                improvementMap.set(pairKey, improvement);
            }
        }
    }
    const failurePatterns = Array.from(failureMap.entries())
        .sort((a, b) => b[1].count - a[1].count)
        .slice(0, MAX_REPORTED)
        .map(([pattern, data]) => ({
            pattern,
            frequency: data.count,
            affectedTools: Array.from(data.tools),
            suggestedFix: data.suggestedFix ?? 'Review tool definition',
        }));
    const topImprovements = Array.from(improvementMap.values()).slice(0, MAX_REPORTED);
    // Be explicit when only a sample was judged so the summary is not misleading.
    const evaluatedText = judgedCount < simulations.length
        ? `Evaluated ${judgedCount} of ${simulations.length} simulations. `
        : `Evaluated ${simulations.length} simulations. `;
    const topIssueText = failurePatterns.length > 0
        ? `Top issue: ${failurePatterns[0].pattern}`
        : 'No major issues found.';
    return {
        totalSimulations: simulations.length,
        passRate,
        avgScore,
        failurePatterns,
        topImprovements,
        summary: evaluatedText +
            `Pass rate: ${strictPassPercent}%. ` +
            `Average score: ${Math.round(avgScore)}/100. ` +
            topIssueText,
    };
}
|
|
192
|
+
/**
 * Parse the LLM judgment response and normalise it into a complete judgment.
 *
 * Missing or malformed fields are replaced by neutral defaults. A score of 0
 * is a valid value and is preserved (it is not treated as "missing").
 *
 * @param {string} response - Raw text returned by the judge model.
 * @returns {object} Judgment with `scores`, `overallScore` (0-100), `verdict`
 *   ('PASS' | 'PARTIAL' | 'FAIL'), `keyMoments`, `toolImprovements`, `summary`.
 * @throws {Error} When the response does not parse to a JSON object (including
 *   whatever parseJsonObject() itself throws); judgeSimulation() catches this
 *   and falls back to the heuristic judgment.
 */
function parseJudgmentResponse(response) {
    const parsed = parseJsonObject(response);
    if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
        throw new Error('LLM judgment response is not a JSON object');
    }
    // Fill in any criterion the model left out so consumers can rely on all six.
    const rawScores = parsed.scores !== null && typeof parsed.scores === 'object' && !Array.isArray(parsed.scores)
        ? parsed.scores
        : {};
    const scores = { ...rawScores };
    for (const [criterion, fallback] of Object.entries(createDefaultScores())) {
        const entry = rawScores[criterion];
        scores[criterion] = entry !== null && typeof entry === 'object' ? entry : fallback;
    }
    // `||` would turn a legitimate 0 into 50; only fall back for non-numbers.
    const overallScore = typeof parsed.overallScore === 'number' && Number.isFinite(parsed.overallScore)
        ? Math.min(100, Math.max(0, parsed.overallScore))
        : 50;
    const verdictText = typeof parsed.verdict === 'string' ? parsed.verdict.trim().toUpperCase() : '';
    const verdict = ['PASS', 'PARTIAL', 'FAIL'].includes(verdictText) ? verdictText : 'PARTIAL';
    return {
        scores,
        overallScore,
        verdict,
        // Aggregation iterates these, so anything that is not an array becomes [].
        keyMoments: Array.isArray(parsed.keyMoments) ? parsed.keyMoments : [],
        toolImprovements: Array.isArray(parsed.toolImprovements) ? parsed.toolImprovements : [],
        summary: typeof parsed.summary === 'string' && parsed.summary !== ''
            ? parsed.summary
            : 'Evaluation complete.',
    };
}
|
|
207
|
+
/**
 * Create a heuristic judgment when the LLM is unavailable or its reply failed.
 *
 * Scores are derived from the simulation's own evaluation metrics
 * (`routingAccuracy`, `parameterAccuracy`, `roundEfficiency`, read as 0-1
 * ratios and scaled to 0-100). Missing or non-numeric metrics score 0 rather
 * than NaN so that downstream averages stay numeric.
 *
 * @param {object} simulation - Simulation result. Reads `evaluation`,
 *   `finalSuccess`, `totalRounds` and `testCase.expectedBehavior.toolName`.
 * @returns {object} Judgment with the same shape as a parsed LLM judgment:
 *   `scores`, `overallScore`, `verdict`, `keyMoments`, `toolImprovements`,
 *   `summary`.
 */
function createFallbackJudgment(simulation) {
    const eval_ = simulation.evaluation ?? {};
    const clampScore = (value) => Math.min(100, Math.max(0, Math.round(value)));
    const ratio = (value) => (Number.isFinite(value) ? value : 0);
    const routing = ratio(eval_.routingAccuracy);
    const params = ratio(eval_.parameterAccuracy);
    const efficiency = ratio(eval_.roundEfficiency);
    const overallScore = Number.isFinite(eval_.overallScore) ? eval_.overallScore : 0;
    // A test case may legitimately expect no tool call at all.
    const expectedTool = simulation.testCase.expectedBehavior.toolName;
    const passedSummary = expectedTool
        ? `Simulation passed. Tool ${expectedTool} invoked correctly.`
        : 'Simulation passed. No tool invocation was expected.';
    return {
        scores: {
            toolSelection: {
                score: clampScore(routing * 100),
                reasoning: eval_.routingAccuracy === 1
                    ? 'Correct tool selected on first try'
                    : 'Tool selection issues detected',
            },
            parameterAccuracy: {
                score: clampScore(params * 100),
                reasoning: eval_.parameterAccuracy >= 0.8
                    ? 'Most parameters extracted correctly'
                    : 'Some parameters were incorrect or missing',
            },
            conversationEfficiency: {
                score: clampScore(efficiency * 100),
                reasoning: simulation.totalRounds === 1
                    ? 'Completed in single round'
                    : `Took ${simulation.totalRounds} rounds`,
            },
            errorHandling: {
                score: simulation.finalSuccess ? 80 : 40,
                reasoning: simulation.finalSuccess
                    ? 'Task completed successfully'
                    : 'Task did not complete successfully',
            },
            userExperience: {
                score: 70,
                reasoning: 'Heuristic evaluation - LLM not available for detailed analysis',
            },
            toolDefinitionQuality: {
                // Mean of routing and parameter accuracy, scaled to 0-100.
                score: clampScore((routing + params) * 50),
                reasoning: 'Based on agent performance metrics',
            },
        },
        overallScore,
        verdict: simulation.finalSuccess ? 'PASS' : overallScore >= 50 ? 'PARTIAL' : 'FAIL',
        keyMoments: eval_.failureReason ? [{
                moment: eval_.failureReason,
                impact: 'negative',
                severity: 'major',
            }] : [],
        toolImprovements: eval_.hallucinationRate > 0 ? [{
                tool: expectedTool || 'unknown',
                issue: 'Agent hallucinated parameters',
                suggestedFix: 'Add clearer parameter descriptions to schema',
                estimatedImpact: `Would reduce hallucination rate of ${Math.round(eval_.hallucinationRate * 100)}%`,
            }] : [],
        summary: simulation.finalSuccess
            ? passedSummary
            : `Simulation failed: ${eval_.failureReason || 'Unknown reason'}`,
    };
}
|
|
265
|
+
/**
 * Build the neutral score sheet used when the judge reply carries no scores.
 *
 * @returns {object} A fresh object with one `{ score: 50, reasoning }` entry
 *   for each of the six judging criteria.
 */
function createDefaultScores() {
    const criteria = [
        'toolSelection',
        'parameterAccuracy',
        'conversationEfficiency',
        'errorHandling',
        'userExperience',
        'toolDefinitionQuality',
    ];
    // Each criterion gets its own entry object, so callers may mutate one safely.
    return Object.fromEntries(criteria.map((criterion) => [criterion, { score: 50, reasoning: 'Unable to evaluate' }]));
}
|
|
275
|
+
export default judgeSimulation;
|
|
276
|
+
//# sourceMappingURL=simulation-judge.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"simulation-judge.js","sourceRoot":"","sources":["../../../src/agent/features/simulation-judge.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,MAAM,OAAO,CAAC;AAC1B,OAAO,EAAE,SAAS,EAAE,eAAe,EAAE,MAAM,kBAAkB,CAAC;AAG9D,OAAO,EAAE,eAAe,EAAE,MAAM,4BAA4B,CAAC;AAsD7D,0DAA0D;AAC1D,gBAAgB;AAChB,0DAA0D;AAE1D,MAAM,mBAAmB,GAAG;;;;;;qDAMyB,CAAC;AAEtD,SAAS,mBAAmB,CAC1B,UAA4B,EAC5B,KAAqB;IAErB,MAAM,eAAe,GAAG,UAAU,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAChD,UAAU,CAAC,CAAC,WAAW,aAAa,CAAC,CAAC,aAAa,CAAC,IAAI,IAAI,aAAa,GACvE,CAAC,CAAC,aAAa,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC;QAClC,CAAC,CAAC,cAAc,CAAC,CAAC,aAAa,CAAC,SAAS,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,CAC/C,GAAG,EAAE,CAAC,QAAQ,IAAI,IAAI,CAAC,SAAS,CAAC,EAAE,CAAC,SAAS,CAAC,GAAG,CAClD,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE;QAChB,CAAC,CAAC,EACN,EAAE,CACH,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IAEf,OAAO;EACP,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,KAAK,CAAC,CAAC,IAAI,KAAK,CAAC,CAAC,WAAW,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC;;;aAG/C,UAAU,CAAC,QAAQ,CAAC,MAAM;mBACpB,UAAU,CAAC,QAAQ,CAAC,gBAAgB,CAAC,QAAQ,IAAI,MAAM;qBACrD,IAAI,CAAC,SAAS,CAAC,UAAU,CAAC,QAAQ,CAAC,gBAAgB,CAAC,cAAc,CAAC;cAC1E,UAAU,CAAC,QAAQ,CAAC,QAAQ;gBAC1B,UAAU,CAAC,QAAQ,CAAC,UAAU;;;EAG5C,eAAe;;;aAGJ,UAAU,CAAC,YAAY;sBACd,UAAU,CAAC,eAAe;iBAC/B,UAAU,CAAC,WAAW;EACrC,UAAU,CAAC,UAAU,CAAC,aAAa,CAAC,CAAC,CAAC,qBAAqB,UAAU,CAAC,UAAU,CAAC,aAAa,EAAE,CAAC,CAAC,CAAC,EAAE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAyDrG,CAAC;AACH,CAAC;AAED,0DAA0D;AAC1D,kBAAkB;AAClB,0DAA0D;AAE1D;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,eAAe,CACnC,UAA4B,EAC5B,KAAqB,EACrB,GAAe;IAEf,MAAM,MAAM,GAAG,GAAG,IAAI,eAAe,EAAE,CAAC;IAExC,IAAI,CAAC,MAAM,CAAC,WAAW,EAAE,EAAE,CAAC;QAC1B,OAAO,sBAAsB,CAAC,UAAU,CAAC,CAAC;IAC5C,CAAC;IAED,IAAI,CAAC;QACH,MAAM,QAAQ,GAAG,MAAM,MAAM,CAAC,IAAI,CAAC;YACjC,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,mBAAmB,EAAE;YAChD,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,mBAAmB,CAAC,UAAU,EAAE,KAAK,CAAC,EAAE;SAClE,CAAC,CAAC;QAEH,IAAI,CAAC,QAAQ,CAAC,IAAI,EAAE,CAAC;YACnB,MAAM,IAAI,KAAK,CAAC,sBAAsB,CAAC,CAAC;QAC1C,CAAC
;QAED,OAAO,qBAAqB,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;IAC9C,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,oBAAoB,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;QACjG,OAAO,sBAAsB,CAAC,UAAU,CAAC,CAAC;IAC5C,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,gBAAgB,CACpC,WAA+B,EAC/B,KAAqB,EACrB,GAAe;IAEf,MAAM,MAAM,GAAG,GAAG,IAAI,eAAe,EAAE,CAAC;IAExC,gDAAgD;IAChD,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,CAAC,WAAW,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC;IACpD,MAAM,OAAO,GAAG,WAAW,CAAC,KAAK,CAAC,CAAC,EAAE,UAAU,CAAC,CAAC;IAEjD,MAAM,SAAS,GAAyB,EAAE,CAAC;IAE3C,KAAK,MAAM,GAAG,IAAI,OAAO,EAAE,CAAC;QAC1B,MAAM,QAAQ,GAAG,MAAM,eAAe,CAAC,GAAG,EAAE,KAAK,EAAE,MAAM,CAAC,CAAC;QAC3D,SAAS,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;IAC3B,CAAC;IAED,oBAAoB;IACpB,MAAM,SAAS,GAAG,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,OAAO,KAAK,MAAM,CAAC,CAAC,MAAM,CAAC;IACrE,MAAM,YAAY,GAAG,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,OAAO,KAAK,SAAS,CAAC,CAAC,MAAM,CAAC;IAC3E,MAAM,QAAQ,GAAG,SAAS,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,YAAY,EAAE,CAAC,CAAC,GAAG,SAAS,CAAC,MAAM,CAAC;IAE1F,2BAA2B;IAC3B,MAAM,UAAU,GAAG,IAAI,GAAG,EAAiD,CAAC;IAE5E,KAAK,MAAM,QAAQ,IAAI,SAAS,EAAE,CAAC;QACjC,KAAK,MAAM,WAAW,IAAI,QAAQ,CAAC,gBAAgB,EAAE,CAAC;YACpD,MAAM,GAAG,GAAG,WAAW,CAAC,KAAK,CAAC;YAC9B,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC;gBACzB,UAAU,CAAC,GAAG,CAAC,GAAG,EAAE,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,IAAI,GAAG,EAAE,EAAE,CAAC,CAAC;YACtD,CAAC;YACD,MAAM,KAAK,GAAG,UAAU,CAAC,GAAG,CAAC,GAAG,CAAE,CAAC;YACnC,KAAK,CAAC,KAAK,EAAE,CAAC;YACd,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;QACpC,CAAC;IACH,CAAC;IAED,MAAM,eAAe,GAAqB,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,OAAO,EAAE,CAAC;SACvE,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC;SACvC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC;SACX,GAAG,CAAC,CAAC,CAAC,OAAO,EAAE,IAAI,CAAC,EAAE,EAAE;QACvB,MAAM,gBAAgB,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAC1C,CAAC,CAAC,gBA
AgB,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,KAAK,KAAK,OAAO,CAAC,CAClD,CAAC;QACF,MAAM,mBAAmB,GAAG,gBAAgB,EAAE,gBAAgB,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,KAAK,KAAK,OAAO,CAAC,CAAC;QAC9F,OAAO;YACL,OAAO;YACP,SAAS,EAAE,IAAI,CAAC,KAAK;YACrB,aAAa,EAAE,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC;YACrC,YAAY,EAAE,mBAAmB,EAAE,YAAY,IAAI,wBAAwB;SAC5E,CAAC;IACJ,CAAC,CAAC,CAAC;IAEL,8BAA8B;IAC9B,MAAM,cAAc,GAAG,IAAI,GAAG,EAA2B,CAAC;IAC1D,KAAK,MAAM,QAAQ,IAAI,SAAS,EAAE,CAAC;QACjC,KAAK,MAAM,WAAW,IAAI,QAAQ,CAAC,gBAAgB,EAAE,CAAC;YACpD,MAAM,GAAG,GAAG,GAAG,WAAW,CAAC,IAAI,IAAI,WAAW,CAAC,KAAK,EAAE,CAAC;YACvD,IAAI,CAAC,cAAc,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC;gBAC7B,cAAc,CAAC,GAAG,CAAC,GAAG,EAAE,WAAW,CAAC,CAAC;YACvC,CAAC;QACH,CAAC;IACH,CAAC;IAED,MAAM,eAAe,GAAG,KAAK,CAAC,IAAI,CAAC,cAAc,CAAC,MAAM,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;IAExE,OAAO;QACL,gBAAgB,EAAE,WAAW,CAAC,MAAM;QACpC,QAAQ,EAAE,CAAC,SAAS,GAAG,YAAY,GAAG,GAAG,CAAC,GAAG,SAAS,CAAC,MAAM;QAC7D,QAAQ;QACR,eAAe;QACf,eAAe;QACf,OAAO,EAAE,aAAa,WAAW,CAAC,MAAM,gBAAgB;YACtD,cAAc,IAAI,CAAC,KAAK,CAAC,CAAC,SAAS,GAAG,SAAS,CAAC,MAAM,CAAC,GAAG,GAAG,CAAC,KAAK;YACnE,kBAAkB,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,QAAQ;YAC9C,GAAG,eAAe,CAAC,MAAM,GAAG,CAAC,IAAI,eAAe,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,cAAc,eAAe,CAAC,CAAC,CAAC,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC,wBAAwB,EAAE;KAChI,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,SAAS,qBAAqB,CAAC,QAAgB;IAC7C,MAAM,MAAM,GAAG,eAAe,CAAqB,QAAQ,CAAC,CAAC;IAE7D,6BAA6B;IAC7B,OAAO;QACL,MAAM,EAAE,MAAM,CAAC,MAAM,IAAI,mBAAmB,EAAE;QAC9C,YAAY,EAAE,MAAM,CAAC,YAAY,IAAI,EAAE;QACvC,OAAO,EAAE,MAAM,CAAC,OAAO,IAAI,SAAS;QACpC,UAAU,EAAE,MAAM,CAAC,UAAU,IAAI,EAAE;QACnC,gBAAgB,EAAE,MAAM,CAAC,gBAAgB,IAAI,EAAE;QAC/C,OAAO,EAAE,MAAM,CAAC,OAAO,IAAI,sBAAsB;KAClD,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,SAAS,sBAAsB,CAAC,UAA4B;IAC1D,MAAM,KAAK,GAAG,UAAU,CAAC,UAAU,CAAC;IAEpC,OAAO;QACL,MAAM,EAAE;YACN,aAAa,EAAE;gBACb,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,eAAe,GAAG,GAAG,CAAC;gBAC9C,SAAS,EAAE,KAAK,CAAC,eAAe,KAAK,CAAC;oBACpC,CAAC,CAAC,oCAAoC;oBACtC,CAAC,CAAC,gCAAgC;aACrC;YACD,i
BAAiB,EAAE;gBACjB,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,iBAAiB,GAAG,GAAG,CAAC;gBAChD,SAAS,EAAE,KAAK,CAAC,iBAAiB,IAAI,GAAG;oBACvC,CAAC,CAAC,qCAAqC;oBACvC,CAAC,CAAC,2CAA2C;aAChD;YACD,sBAAsB,EAAE;gBACtB,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,eAAe,GAAG,GAAG,CAAC;gBAC9C,SAAS,EAAE,UAAU,CAAC,WAAW,KAAK,CAAC;oBACrC,CAAC,CAAC,2BAA2B;oBAC7B,CAAC,CAAC,QAAQ,UAAU,CAAC,WAAW,SAAS;aAC5C;YACD,aAAa,EAAE;gBACb,KAAK,EAAE,UAAU,CAAC,YAAY,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE;gBACxC,SAAS,EAAE,UAAU,CAAC,YAAY;oBAChC,CAAC,CAAC,6BAA6B;oBAC/B,CAAC,CAAC,oCAAoC;aACzC;YACD,cAAc,EAAE;gBACd,KAAK,EAAE,EAAE;gBACT,SAAS,EAAE,gEAAgE;aAC5E;YACD,qBAAqB,EAAE;gBACrB,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,KAAK,CAAC,eAAe,GAAG,KAAK,CAAC,iBAAiB,CAAC,GAAG,EAAE,CAAC;gBACzE,SAAS,EAAE,oCAAoC;aAChD;SACF;QACD,YAAY,EAAE,KAAK,CAAC,YAAY;QAChC,OAAO,EAAE,UAAU,CAAC,YAAY,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,KAAK,CAAC,YAAY,IAAI,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,MAAM;QACzF,UAAU,EAAE,UAAU,CAAC,UAAU,CAAC,aAAa,CAAC,CAAC,CAAC,CAAC;gBACjD,MAAM,EAAE,UAAU,CAAC,UAAU,CAAC,aAAa;gBAC3C,MAAM,EAAE,UAAU;gBAClB,QAAQ,EAAE,OAAO;aAClB,CAAC,CAAC,CAAC,CAAC,EAAE;QACP,gBAAgB,EAAE,KAAK,CAAC,iBAAiB,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;gBAC/C,IAAI,EAAE,UAAU,CAAC,QAAQ,CAAC,gBAAgB,CAAC,QAAQ,IAAI,SAAS;gBAChE,KAAK,EAAE,+BAA+B;gBACtC,YAAY,EAAE,8CAA8C;gBAC5D,eAAe,EAAE,sCAAsC,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,iBAAiB,GAAG,GAAG,CAAC,GAAG;aACpG,CAAC,CAAC,CAAC,CAAC,EAAE;QACP,OAAO,EAAE,UAAU,CAAC,YAAY;YAC9B,CAAC,CAAC,2BAA2B,UAAU,CAAC,QAAQ,CAAC,gBAAgB,CAAC,QAAQ,qBAAqB;YAC/F,CAAC,CAAC,sBAAsB,UAAU,CAAC,UAAU,CAAC,aAAa,IAAI,gBAAgB,EAAE;KACpF,CAAC;AACJ,CAAC;AAED,SAAS,mBAAmB;IAC1B,OAAO;QACL,aAAa,EAAE,EAAE,KAAK,EAAE,EAAE,EAAE,SAAS,EAAE,oBAAoB,EAAE;QAC7D,iBAAiB,EAAE,EAAE,KAAK,EAAE,EAAE,EAAE,SAAS,EAAE,oBAAoB,EAAE;QACjE,sBAAsB,EAAE,EAAE,KAAK,EAAE,EAAE,EAAE,SAAS,EAAE,oBAAoB,EAAE;QACtE,aAAa,EAAE,EAAE,KAAK,EAAE,EAAE,EAAE,SAAS,EAAE,oBAAoB,EAAE;QAC7D,cAAc,EAAE,EAAE,KAAK,EAAE,EAAE,EAAE,SAAS,EAAE,oBAAoB,EAAE;QAC9D,qBAAqB,EAAE,EAAE,KAAK,EAAE,EAAE,EAAE,SAAS,EAAE,oBAAoB,EAAE;KACtE,CAAC;AACJ
,CAAC;AAED,eAAe,eAAe,CAAC"}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Test Case Generator
|
|
3
|
+
*
|
|
4
|
+
* Uses LLM to generate realistic user prompts for testing WebMCP tools.
|
|
5
|
+
* Generates diverse test cases covering happy paths, edge cases, and adversarial inputs.
|
|
6
|
+
*/
|
|
7
|
+
import { LLMClient } from '../llm-client.js';
|
|
8
|
+
import type { DetectedTool } from '../../core/types/tool.js';
|
|
9
|
+
/**
 * Kinds of test prompt the generator can be asked for. The generation prompt
 * defines them as: clear complete requests (happy_path), requests missing
 * required info (partial_info), informal phrasing (casual), over-detailed
 * requests (verbose), requests that could match several tools (ambiguous),
 * typos / misleading phrasing (adversarial), and requests no tool should
 * handle (out_of_scope).
 */
export type TestCaseCategory = 'happy_path' | 'partial_info' | 'casual' | 'verbose' | 'ambiguous' | 'adversarial' | 'out_of_scope';
|
|
10
|
+
/**
 * A single generated test scenario: a user prompt plus the behaviour a
 * correct agent is expected to show for it.
 */
export interface TestCase {
|
|
11
|
+
/** Identifier; the generator emits values of the form "TC-001". */
id: string;
|
|
12
|
+
/** The natural-language text a user would say to the agent. */
prompt: string;
|
|
13
|
+
/** Category this test case belongs to. */
category: TestCaseCategory;
|
|
14
|
+
/** Difficulty for the agent: 1 = trivial, 5 = very hard. */
difficulty: 1 | 2 | 3 | 4 | 5;
|
|
15
|
+
/** What a correct agent should do in response to `prompt`. */
expectedBehavior: {
|
|
16
|
+
/** False for prompts that no available tool should handle. */
shouldInvokeTool: boolean;
|
|
17
|
+
/** Name of the tool to invoke; null when no tool should be invoked. */
toolName: string | null;
|
|
18
|
+
/** Parameter values the agent is expected to extract from the prompt. */
expectedParams: Record<string, unknown>;
|
|
19
|
+
/** Per-parameter note on where the expected value comes from. */
paramsExplanation: Record<string, string>;
|
|
20
|
+
/** Parameter names the prompt does not supply. */
missingParams: string[];
|
|
21
|
+
/** Whether the agent is expected to ask the user for `missingParams`. */
shouldAskForMissing: boolean;
|
|
22
|
+
/** Free-text descriptions of other interpretations that are also acceptable. */
acceptableAlternatives: string[];
|
|
23
|
+
};
|
|
24
|
+
/** Free-text descriptions of ways the agent might get this case wrong. */
failureModes: string[];
|
|
25
|
+
}
|
|
26
|
+
/**
 * Options controlling test case generation.
 */
export interface TestCaseGenerationConfig {
|
|
27
|
+
/** Number of test cases to request; the fallback generator truncates its output to this many. */
count: number;
|
|
28
|
+
/** Categories to generate test cases for. */
categories: TestCaseCategory[];
|
|
29
|
+
/** Model identifier handed to createLLMClient when no client is supplied by the caller. */
model?: string;
|
|
30
|
+
}
|
|
31
|
+
/**
|
|
32
|
+
* Generate test cases for the given tools
*
* Asks the LLM for test cases and returns the `testCases` array of its parsed
* JSON reply. When the client reports it is unavailable, or the request or
* parsing fails, schema-derived fallback test cases are returned instead (a
* failure is logged as a warning rather than rejecting the promise).
*
* @param tools - Tools to generate test prompts for.
* @param siteContext - Short description of the site, interpolated into the prompt.
* @param config - Requested count, categories and optional model.
* @param llm - Client to use; when omitted one is created from `config.model`.
* @returns The generated (or fallback) test cases.
|
|
33
|
+
*/
|
|
34
|
+
export declare function generateTestCases(tools: DetectedTool[], siteContext: string, config: TestCaseGenerationConfig, llm?: LLMClient): Promise<TestCase[]>;
|
|
35
|
+
export default generateTestCases;
|
|
@@ -0,0 +1,257 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Test Case Generator
|
|
3
|
+
*
|
|
4
|
+
* Uses LLM to generate realistic user prompts for testing WebMCP tools.
|
|
5
|
+
* Generates diverse test cases covering happy paths, edge cases, and adversarial inputs.
|
|
6
|
+
*/
|
|
7
|
+
import chalk from 'chalk';
|
|
8
|
+
import { LLMClient, createLLMClient } from '../llm-client.js';
|
|
9
|
+
import { parseJsonObject } from '../../llm/json-response.js';
|
|
10
|
+
// ═══════════════════════════════════════════════════════
|
|
11
|
+
// PROMPTS
|
|
12
|
+
// ═══════════════════════════════════════════════════════
|
|
13
|
+
// System-role message sent ahead of the per-request user prompt in
// generateTestCases. It fixes the model's persona and requires JSON-only
// output, which the response parser relies on.
const SYSTEM_PROMPT = `You are a QA engineer specialized in testing AI agent tool interactions. Your job is to generate realistic test cases — natural language prompts that real users would say to an AI assistant, expecting specific tools to be invoked.
|
|
14
|
+
|
|
15
|
+
IMPORTANT RULES:
|
|
16
|
+
1. Generate diverse phrasings — formal, casual, abbreviated, verbose
|
|
17
|
+
2. Include edge cases — partial information, ambiguous requests
|
|
18
|
+
3. Include realistic mistakes — wrong terminology, vague descriptions
|
|
19
|
+
4. For each prompt, specify the EXPECTED correct behavior
|
|
20
|
+
5. Be creative — real users are unpredictable
|
|
21
|
+
6. Output valid JSON only — no markdown, no commentary outside the JSON`;
|
|
22
|
+
/**
 * Build the user-role prompt that asks the LLM for test cases.
 *
 * The prompt lists every tool (name, description, JSON-serialised input
 * schema), names the requested categories, defines each category, and ends
 * with the strict JSON output format the response parser expects.
 *
 * @param {Array<object>} tools - Tools to describe; `name`, `description` and
 *   `inputSchema` are read from each.
 * @param {string} siteContext - Short site description interpolated into the
 *   opening sentence.
 * @param {{count: number, categories: string[]}} config - Number of test
 *   cases to request and the categories to cover.
 * @returns {string} The complete prompt text.
 */
function buildGenerationPrompt(tools, siteContext, config) {
  const { count, categories } = config;
  // JSON.stringify(undefined) returns undefined, which a template literal
  // renders as the text "undefined". Substitute an empty schema so a tool
  // without an inputSchema is described as taking no parameters.
  const toolSections = tools.map((t) => `
Tool: "${t.name}"
Description: "${t.description}"
Parameters: ${JSON.stringify(t.inputSchema ?? {}, null, 2)}
`).join('\n---\n');
  return `Given the following WebMCP tools registered on a ${siteContext} website, generate exactly ${count} test case prompts.

TOOLS AVAILABLE:
${toolSections}

Generate test cases across these categories: ${categories.join(', ')}

Category definitions:
- "happy_path": Clear, complete requests that should invoke the tool correctly with all required parameters
- "partial_info": Requests missing some required info — agent should ask for clarification OR use defaults
- "casual": Very informal, shorthand, or slang phrasings ("yo find me flights to nyc asap")
- "verbose": Overly detailed requests with extra context the agent must filter
- "ambiguous": Requests that COULD match multiple tools — tests disambiguation
- "adversarial": Typos, wrong terminology, misleading phrasing, or requests just outside the tool's scope
- "out_of_scope": Requests that NONE of the available tools should handle — agent should decline

For each test case, provide:
- The natural language prompt a user would say
- Which tool(s) should be invoked (or "none" if out of scope)
- The expected parameters the agent should extract
- What a correct agent response looks like
- What category this test case belongs to
- Difficulty rating for the agent (1=trivial, 5=very hard)

OUTPUT FORMAT (strict JSON):
{
"testCases": [
{
"id": "TC-001",
"prompt": "the exact user prompt",
"category": "happy_path",
"difficulty": 2,
"expectedBehavior": {
"shouldInvokeTool": true,
"toolName": "tool-name",
"expectedParams": {
"param1": "value1",
"param2": "value2"
},
"paramsExplanation": {
"param1": "Explicitly stated as X",
"param2": "Inferred from Y"
},
"missingParams": ["param3"],
"shouldAskForMissing": true,
"acceptableAlternatives": [
"Agent could interpret 'X' as 'Y'"
]
},
"failureModes": [
"Agent might not recognize X as Y",
"Agent might pick wrong tool"
]
}
]
}`;
}
|
|
85
|
+
// ═══════════════════════════════════════════════════════
|
|
86
|
+
// GENERATOR
|
|
87
|
+
// ═══════════════════════════════════════════════════════
|
|
88
|
+
/**
 * Generate test cases for the given tools.
 *
 * Sends the system prompt and a generated user prompt to the LLM and returns
 * the `testCases` array from its parsed JSON reply. Schema-derived fallback
 * test cases are returned instead when the client is unavailable, the reply
 * is empty or cannot be parsed, or the parsed reply has no `testCases` array;
 * failures are logged as a warning rather than thrown.
 *
 * @param {Array<object>} tools - Tools to generate test prompts for.
 * @param {string} siteContext - Short site description used in the prompt.
 * @param {{count: number, categories: string[], model?: string}} config -
 *   Generation options.
 * @param {object} [llm] - Client to use; when omitted one is created from
 *   `config.model`.
 * @returns {Promise<Array<object>>} Generated or fallback test cases.
 */
export async function generateTestCases(tools, siteContext, config, llm) {
  const client = llm || createLLMClient({ model: config.model });
  if (!client.isAvailable()) {
    // Return fallback test cases when no API key
    return generateFallbackTestCases(tools, config);
  }
  try {
    const response = await client.chat([
      { role: 'system', content: SYSTEM_PROMPT },
      { role: 'user', content: buildGenerationPrompt(tools, siteContext, config) },
    ]);
    if (!response?.text) {
      throw new Error('No response from LLM');
    }
    // Extract JSON from response
    const parsed = parseTestCaseResponse(response.text);
    // A syntactically valid reply can still lack the expected shape; without
    // this check the function would resolve to undefined instead of an array.
    if (!Array.isArray(parsed?.testCases)) {
      throw new Error('LLM response did not contain a "testCases" array');
    }
    return parsed.testCases;
  }
  catch (error) {
    console.warn(chalk.yellow(`Test case generation failed: ${error instanceof Error ? error.message : error}`));
    return generateFallbackTestCases(tools, config);
  }
}
|
|
114
|
+
/**
 * Turn the raw LLM reply into a parsed object.
 *
 * All parsing is delegated to parseJsonObject; NOTE(review): tolerance for
 * fenced code blocks is presumably implemented there — confirm.
 *
 * @param {string} response - Raw text returned by the LLM.
 * @returns {object} Whatever parseJsonObject produces for the text.
 */
function parseTestCaseResponse(response) {
  const payload = parseJsonObject(response);
  return payload;
}
|
|
120
|
+
/**
 * Generate basic fallback test cases when the LLM is unavailable.
 *
 * For every tool, up to three templated cases are produced depending on the
 * requested categories: `happy_path` (all declared parameters filled with
 * example values), `partial_info` (only the first required parameter filled;
 * skipped when the tool has no required parameters) and `casual`. A single
 * `out_of_scope` case is appended after all tools. Ids are sequential
 * ("TC-001", "TC-002", ...) and the result is truncated to `config.count`.
 *
 * @param {Array<object>} tools - Tools to derive cases from; `name` and the
 *   optional `inputSchema.properties` / `inputSchema.required` are read.
 * @param {{count: number, categories: string[]}} config - Requested
 *   categories and maximum number of cases.
 * @returns {Array<object>} Test cases in generation order.
 */
function generateFallbackTestCases(tools, config) {
  const categories = config.categories ?? [];
  const testCases = [];
  let sequence = 1;
  const nextId = () => `TC-${String(sequence++).padStart(3, '0')}`;

  for (const tool of tools) {
    // Extract parameters from schema, tolerating missing or malformed parts.
    const schema = tool.inputSchema ?? {};
    const properties = schema.properties !== null && typeof schema.properties === 'object'
      ? schema.properties
      : {};
    const required = Array.isArray(schema.required) ? schema.required : [];
    const paramNames = Object.keys(properties);
    const spokenName = tool.name.replace(/[-_]/g, ' ');

    // Happy path test
    if (categories.includes('happy_path')) {
      const params = {};
      const explanations = {};
      for (const param of paramNames) {
        // A property may be declared as null; pass an empty schema instead.
        params[param] = getExampleValue(param, properties[param] ?? {});
        explanations[param] = 'Example value for testing';
      }
      testCases.push({
        id: nextId(),
        // Avoid a dangling "with " when the tool declares no parameters.
        prompt: paramNames.length > 0
          ? `Use ${tool.name} with ${paramNames.join(', ')}`
          : `Use ${tool.name}`,
        category: 'happy_path',
        difficulty: 1,
        expectedBehavior: {
          shouldInvokeTool: true,
          toolName: tool.name,
          expectedParams: params,
          paramsExplanation: explanations,
          missingParams: [],
          shouldAskForMissing: false,
          acceptableAlternatives: [],
        },
        failureModes: ['Agent might not recognize the tool name'],
      });
    }

    // Partial info test
    if (categories.includes('partial_info') && required.length > 0) {
      const params = {};
      const explanations = {};
      const [firstRequired] = required;
      if (firstRequired) {
        // `required` may name a parameter that `properties` does not declare;
        // fall back to an empty schema rather than passing undefined on.
        params[firstRequired] = getExampleValue(firstRequired, properties[firstRequired] ?? {});
        explanations[firstRequired] = 'Only provided this parameter';
      }
      testCases.push({
        id: nextId(),
        prompt: `I want to ${spokenName}`,
        category: 'partial_info',
        difficulty: 3,
        expectedBehavior: {
          shouldInvokeTool: true,
          toolName: tool.name,
          expectedParams: params,
          paramsExplanation: explanations,
          missingParams: required.slice(1),
          shouldAskForMissing: true,
          acceptableAlternatives: [],
        },
        failureModes: ['Agent might guess missing values'],
      });
    }

    // Casual test
    if (categories.includes('casual')) {
      testCases.push({
        id: nextId(),
        prompt: `hey can u ${spokenName} for me`,
        category: 'casual',
        difficulty: 2,
        expectedBehavior: {
          shouldInvokeTool: true,
          toolName: tool.name,
          expectedParams: {},
          paramsExplanation: {},
          // Copy so callers mutating the test case cannot alter the tool schema.
          missingParams: [...required],
          shouldAskForMissing: true,
          acceptableAlternatives: [],
        },
        failureModes: ['Agent might not understand casual phrasing'],
      });
    }
  }

  // Out of scope test (one for the whole tool set, not one per tool)
  if (categories.includes('out_of_scope')) {
    testCases.push({
      id: nextId(),
      prompt: 'What is the meaning of life?',
      category: 'out_of_scope',
      difficulty: 1,
      expectedBehavior: {
        shouldInvokeTool: false,
        toolName: null,
        expectedParams: {},
        paramsExplanation: {},
        missingParams: [],
        shouldAskForMissing: false,
        acceptableAlternatives: [],
      },
      failureModes: ['Agent might try to use a tool anyway'],
    });
  }

  return testCases.slice(0, config.count);
}
|
|
225
|
+
/**
 * Get an example value for a parameter from its JSON-schema fragment.
 *
 * Resolution order:
 *   1. the first entry of a non-empty `enum` array;
 *   2. for `type: 'string'`, a hint taken from the parameter name
 *      (contains "date", "email" or "url", case-insensitive), otherwise
 *      'example_value';
 *   3. a fixed literal for the other known types (1, true, [], {});
 *   4. 'example' when the type is missing or not recognised.
 *
 * @param {string} name - Parameter name, used only for the string hints.
 * @param {object} [prop] - Schema fragment for the parameter. May be absent
 *   when a schema lists a name under `required` without declaring it.
 * @returns {*} An example value matching the declared type where possible.
 */
function getExampleValue(name, prop) {
  // Callers look the fragment up by name, so it can be undefined or null;
  // treat anything that is not an object as an empty schema instead of throwing.
  const schema = prop !== null && typeof prop === 'object' ? prop : {};
  const enumValues = schema['enum'];
  if (Array.isArray(enumValues) && enumValues.length > 0) {
    return enumValues[0];
  }
  switch (schema['type']) {
    case 'string': {
      const lowered = String(name).toLowerCase();
      if (lowered.includes('date')) {
        return '2026-06-15';
      }
      if (lowered.includes('email')) {
        return 'test@example.com';
      }
      if (lowered.includes('url')) {
        return 'https://example.com';
      }
      return 'example_value';
    }
    case 'number':
    case 'integer':
      return 1;
    case 'boolean':
      return true;
    case 'array':
      return [];
    case 'object':
      return {};
    default:
      return 'example';
  }
}
|
|
256
|
+
export default generateTestCases;
|
|
257
|
+
//# sourceMappingURL=test-case-generator.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"test-case-generator.js","sourceRoot":"","sources":["../../../src/agent/features/test-case-generator.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,MAAM,OAAO,CAAC;AAC1B,OAAO,EAAE,SAAS,EAAE,eAAe,EAAE,MAAM,kBAAkB,CAAC;AAE9D,OAAO,EAAE,eAAe,EAAE,MAAM,4BAA4B,CAAC;AAsC7D,0DAA0D;AAC1D,UAAU;AACV,0DAA0D;AAE1D,MAAM,aAAa,GAAG;;;;;;;;wEAQkD,CAAC;AAEzE,SAAS,qBAAqB,CAC5B,KAAqB,EACrB,WAAmB,EACnB,MAAgC;IAEhC,MAAM,EAAE,KAAK,EAAE,UAAU,EAAE,GAAG,MAAM,CAAC;IAErC,OAAO,oDAAoD,WAAW,8BAA8B,KAAK;;;EAGzG,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC;SACR,CAAC,CAAC,IAAI;gBACC,CAAC,CAAC,WAAW;cACf,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,WAAW,EAAE,IAAI,EAAE,CAAC,CAAC;CACnD,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC;;+CAE6B,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAkDlE,CAAC;AACH,CAAC;AAED,0DAA0D;AAC1D,YAAY;AACZ,0DAA0D;AAE1D;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,iBAAiB,CACrC,KAAqB,EACrB,WAAmB,EACnB,MAAgC,EAChC,GAAe;IAEf,MAAM,MAAM,GAAG,GAAG,IAAI,eAAe,CAAC,EAAE,KAAK,EAAE,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC;IAE/D,IAAI,CAAC,MAAM,CAAC,WAAW,EAAE,EAAE,CAAC;QAC1B,6CAA6C;QAC7C,OAAO,yBAAyB,CAAC,KAAK,EAAE,MAAM,CAAC,CAAC;IAClD,CAAC;IAED,IAAI,CAAC;QACH,MAAM,QAAQ,GAAG,MAAM,MAAM,CAAC,IAAI,CAAC;YACjC,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,aAAa,EAAE;YAC1C,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,qBAAqB,CAAC,KAAK,EAAE,WAAW,EAAE,MAAM,CAAC,EAAE;SAC7E,CAAC,CAAC;QAEH,IAAI,CAAC,QAAQ,CAAC,IAAI,EAAE,CAAC;YACnB,MAAM,IAAI,KAAK,CAAC,sBAAsB,CAAC,CAAC;QAC1C,CAAC;QAED,6BAA6B;QAC7B,MAAM,MAAM,GAAG,qBAAqB,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;QACpD,OAAO,MAAM,CAAC,SAAS,CAAC;IAC1B,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,gCAAgC,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;QAC7G,OAAO,yBAAyB,CAAC,KAAK,EAAE,MAAM,CAAC,CAAC;IAClD,CAAC;AACH,CAAC;AAED;;GAEG;AACH,SAAS,qBAAqB,CAAC,QAAgB;IAC7C,OAAO,eAAe,CAA4B,QAAQ,CAAC,CAAC;AAC9D,CAAC;AAED;;GAEG;AACH,SAAS,yBAAyB,CAChC,KAAqB,EACrB,MAAgC;IAEhC,MAAM,SAAS,GAAe,EAAE,CAAC;IACjC,IAAI,EAAE,GAAG,CAAC,CAAC;IAEX,KAAK,MAA
M,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,iCAAiC;QACjC,MAAM,UAAU,GAAG,IAAI,CAAC,WAAW,EAAE,UAAU,IAAI,EAAE,CAAC;QACtD,MAAM,QAAQ,GAAG,CAAC,IAAI,CAAC,WAAW,EAAE,QAAQ,IAAI,EAAE,CAAa,CAAC;QAChE,MAAM,UAAU,GAAG,MAAM,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;QAE3C,kBAAkB;QAClB,IAAI,MAAM,CAAC,UAAU,CAAC,QAAQ,CAAC,YAAY,CAAC,EAAE,CAAC;YAC7C,MAAM,MAAM,GAA4B,EAAE,CAAC;YAC3C,MAAM,YAAY,GAA2B,EAAE,CAAC;YAEhD,KAAK,MAAM,KAAK,IAAI,UAAU,EAAE,CAAC;gBAC/B,MAAM,IAAI,GAAG,UAAU,CAAC,KAAK,CAA4B,CAAC;gBAC1D,MAAM,CAAC,KAAK,CAAC,GAAG,eAAe,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;gBAC7C,YAAY,CAAC,KAAK,CAAC,GAAG,2BAA2B,CAAC;YACpD,CAAC;YAED,SAAS,CAAC,IAAI,CAAC;gBACb,EAAE,EAAE,MAAM,MAAM,CAAC,EAAE,EAAE,CAAC,CAAC,QAAQ,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE;gBACzC,MAAM,EAAE,OAAO,IAAI,CAAC,IAAI,SAAS,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE;gBACxD,QAAQ,EAAE,YAAY;gBACtB,UAAU,EAAE,CAAC;gBACb,gBAAgB,EAAE;oBAChB,gBAAgB,EAAE,IAAI;oBACtB,QAAQ,EAAE,IAAI,CAAC,IAAI;oBACnB,cAAc,EAAE,MAAM;oBACtB,iBAAiB,EAAE,YAAY;oBAC/B,aAAa,EAAE,EAAE;oBACjB,mBAAmB,EAAE,KAAK;oBAC1B,sBAAsB,EAAE,EAAE;iBAC3B;gBACD,YAAY,EAAE,CAAC,yCAAyC,CAAC;aAC1D,CAAC,CAAC;QACL,CAAC;QAED,oBAAoB;QACpB,IAAI,MAAM,CAAC,UAAU,CAAC,QAAQ,CAAC,cAAc,CAAC,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACtE,MAAM,MAAM,GAA4B,EAAE,CAAC;YAC3C,MAAM,YAAY,GAA2B,EAAE,CAAC;YAChD,MAAM,aAAa,GAAG,QAAQ,CAAC,CAAC,CAAC,CAAC;YAClC,IAAI,aAAa,EAAE,CAAC;gBAClB,MAAM,IAAI,GAAG,UAAU,CAAC,aAAa,CAA4B,CAAC;gBAClE,MAAM,CAAC,aAAa,CAAC,GAAG,eAAe,CAAC,aAAa,EAAE,IAAI,CAAC,CAAC;gBAC7D,YAAY,CAAC,aAAa,CAAC,GAAG,8BAA8B,CAAC;YAC/D,CAAC;YAED,SAAS,CAAC,IAAI,CAAC;gBACb,EAAE,EAAE,MAAM,MAAM,CAAC,EAAE,EAAE,CAAC,CAAC,QAAQ,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE;gBACzC,MAAM,EAAE,aAAa,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,GAAG,CAAC,EAAE;gBACtD,QAAQ,EAAE,cAAc;gBACxB,UAAU,EAAE,CAAC;gBACb,gBAAgB,EAAE;oBAChB,gBAAgB,EAAE,IAAI;oBACtB,QAAQ,EAAE,IAAI,CAAC,IAAI;oBACnB,cAAc,EAAE,MAAM;oBACtB,iBAAiB,EAAE,YAAY;oBAC/B,aAAa,EAAE,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC;oBAChC,mBAAmB,EAAE,IAAI;oBACzB,sBAAsB,EAAE,EAAE;iBAC3B;gBACD,YAAY,EAAE,CAAC,kCAAkC,CAAC;aACnD,CAAC,CAAC;QACL,CAAC;QAED,cAAc;QAC
d,IAAI,MAAM,CAAC,UAAU,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE,CAAC;YACzC,SAAS,CAAC,IAAI,CAAC;gBACb,EAAE,EAAE,MAAM,MAAM,CAAC,EAAE,EAAE,CAAC,CAAC,QAAQ,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE;gBACzC,MAAM,EAAE,aAAa,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,GAAG,CAAC,SAAS;gBAC7D,QAAQ,EAAE,QAAQ;gBAClB,UAAU,EAAE,CAAC;gBACb,gBAAgB,EAAE;oBAChB,gBAAgB,EAAE,IAAI;oBACtB,QAAQ,EAAE,IAAI,CAAC,IAAI;oBACnB,cAAc,EAAE,EAAE;oBAClB,iBAAiB,EAAE,EAAE;oBACrB,aAAa,EAAE,QAAQ;oBACvB,mBAAmB,EAAE,IAAI;oBACzB,sBAAsB,EAAE,EAAE;iBAC3B;gBACD,YAAY,EAAE,CAAC,4CAA4C,CAAC;aAC7D,CAAC,CAAC;QACL,CAAC;IACH,CAAC;IAED,oBAAoB;IACpB,IAAI,MAAM,CAAC,UAAU,CAAC,QAAQ,CAAC,cAAc,CAAC,EAAE,CAAC;QAC/C,SAAS,CAAC,IAAI,CAAC;YACb,EAAE,EAAE,MAAM,MAAM,CAAC,EAAE,EAAE,CAAC,CAAC,QAAQ,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE;YACzC,MAAM,EAAE,8BAA8B;YACtC,QAAQ,EAAE,cAAc;YACxB,UAAU,EAAE,CAAC;YACb,gBAAgB,EAAE;gBAChB,gBAAgB,EAAE,KAAK;gBACvB,QAAQ,EAAE,IAAI;gBACd,cAAc,EAAE,EAAE;gBAClB,iBAAiB,EAAE,EAAE;gBACrB,aAAa,EAAE,EAAE;gBACjB,mBAAmB,EAAE,KAAK;gBAC1B,sBAAsB,EAAE,EAAE;aAC3B;YACD,YAAY,EAAE,CAAC,sCAAsC,CAAC;SACvD,CAAC,CAAC;IACL,CAAC;IAED,OAAO,SAAS,CAAC,KAAK,CAAC,CAAC,EAAE,MAAM,CAAC,KAAK,CAAC,CAAC;AAC1C,CAAC;AAED;;GAEG;AACH,SAAS,eAAe,CAAC,IAAY,EAAE,IAA6B;IAClE,MAAM,IAAI,GAAG,IAAI,CAAC,MAAM,CAAW,CAAC;IACpC,MAAM,UAAU,GAAG,IAAI,CAAC,MAAM,CAAc,CAAC;IAE7C,IAAI,UAAU,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACxC,OAAO,UAAU,CAAC,CAAC,CAAC,CAAC;IACvB,CAAC;IAED,QAAQ,IAAI,EAAE,CAAC;QACb,KAAK,QAAQ;YACX,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,MAAM,CAAC;gBAAE,OAAO,YAAY,CAAC;YAC7D,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,OAAO,CAAC;gBAAE,OAAO,kBAAkB,CAAC;YACpE,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,KAAK,CAAC;gBAAE,OAAO,qBAAqB,CAAC;YACrE,OAAO,eAAe,CAAC;QACzB,KAAK,QAAQ,CAAC;QACd,KAAK,SAAS;YACZ,OAAO,CAAC,CAAC;QACX,KAAK,SAAS;YACZ,OAAO,IAAI,CAAC;QACd,KAAK,OAAO;YACV,OAAO,EAAE,CAAC;QACZ,KAAK,QAAQ;YACX,OAAO,EAAE,CAAC;QACZ;YACE,OAAO,SAAS,CAAC;IACrB,CAAC;AACH,CAAC;AAED,eAAe,iBAAiB,CAAC"}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Agent Module
|
|
3
|
+
*
|
|
4
|
+
* LLM-powered agent simulation for testing WebMCP tools.
|
|
5
|
+
*/
|
|
6
|
+
export { LLMClient, createLLMClient, type LLMClientConfig, type LLMMessage, type LLMTool, type LLMToolCall, type LLMResponse, } from './llm-client.js';
|
|
7
|
+
export { generateTestCases, runSimulation, runSimulations, judgeSimulation, judgeSimulations, type TestCase, type TestCaseCategory, type TestCaseGenerationConfig, type SimulationConfig, type SimulationResult, type SimulationRound, type ToolResult, type RoundEvaluation, type SimulationJudgment, type JudgmentResult, type KeyMoment, type ToolImprovement, type AggregateJudgment, type FailurePattern, } from './features/index.js';
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Agent Module
|
|
3
|
+
*
|
|
4
|
+
* LLM-powered agent simulation for testing WebMCP tools.
|
|
5
|
+
*/
|
|
6
|
+
// LLM Client
|
|
7
|
+
export { LLMClient, createLLMClient, } from './llm-client.js';
|
|
8
|
+
// Features
|
|
9
|
+
export { generateTestCases, runSimulation, runSimulations, judgeSimulation, judgeSimulations, } from './features/index.js';
|
|
10
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/agent/index.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,aAAa;AACb,OAAO,EACL,SAAS,EACT,eAAe,GAMhB,MAAM,iBAAiB,CAAC;AAEzB,WAAW;AACX,OAAO,EACL,iBAAiB,EACjB,aAAa,EACb,cAAc,EACd,eAAe,EACf,gBAAgB,GAejB,MAAM,qBAAqB,CAAC"}
|