@inspectr/mcplab 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136) hide show
  1. package/README.md +762 -0
  2. package/dist/app/android-chrome-192x192.png +0 -0
  3. package/dist/app/android-chrome-512x512.png +0 -0
  4. package/dist/app/apple-touch-icon.png +0 -0
  5. package/dist/app/assets/index-DT-Z4AVG.js +249 -0
  6. package/dist/app/assets/index-EP4WAY8u.css +1 -0
  7. package/dist/app/favicon-16x16.png +0 -0
  8. package/dist/app/favicon-32x32.png +0 -0
  9. package/dist/app/favicon.svg +13 -0
  10. package/dist/app/index.html +26 -0
  11. package/dist/app/inspectr_logo_color.svg +9 -0
  12. package/dist/app/mcp.png +0 -0
  13. package/dist/app/mcp.svg +1 -0
  14. package/dist/app/robots.txt +14 -0
  15. package/dist/app/site.webmanifest +1 -0
  16. package/dist/app-server/app-context.d.ts +164 -0
  17. package/dist/app-server/app-context.d.ts.map +1 -0
  18. package/dist/app-server/app-context.js +2 -0
  19. package/dist/app-server/app-context.js.map +1 -0
  20. package/dist/app-server/assistant-common.d.ts +41 -0
  21. package/dist/app-server/assistant-common.d.ts.map +1 -0
  22. package/dist/app-server/assistant-common.js +104 -0
  23. package/dist/app-server/assistant-common.js.map +1 -0
  24. package/dist/app-server/config-store.d.ts +15 -0
  25. package/dist/app-server/config-store.d.ts.map +1 -0
  26. package/dist/app-server/config-store.js +67 -0
  27. package/dist/app-server/config-store.js.map +1 -0
  28. package/dist/app-server/dev-mcp.d.ts +6 -0
  29. package/dist/app-server/dev-mcp.d.ts.map +1 -0
  30. package/dist/app-server/dev-mcp.js +71 -0
  31. package/dist/app-server/dev-mcp.js.map +1 -0
  32. package/dist/app-server/evals-routes.d.ts +22 -0
  33. package/dist/app-server/evals-routes.d.ts.map +1 -0
  34. package/dist/app-server/evals-routes.js +135 -0
  35. package/dist/app-server/evals-routes.js.map +1 -0
  36. package/dist/app-server/http.d.ts +5 -0
  37. package/dist/app-server/http.d.ts.map +1 -0
  38. package/dist/app-server/http.js +31 -0
  39. package/dist/app-server/http.js.map +1 -0
  40. package/dist/app-server/index.d.ts +3 -0
  41. package/dist/app-server/index.d.ts.map +1 -0
  42. package/dist/app-server/index.js +2 -0
  43. package/dist/app-server/index.js.map +1 -0
  44. package/dist/app-server/jobs.d.ts +15 -0
  45. package/dist/app-server/jobs.d.ts.map +1 -0
  46. package/dist/app-server/jobs.js +11 -0
  47. package/dist/app-server/jobs.js.map +1 -0
  48. package/dist/app-server/libraries-store.d.ts +15 -0
  49. package/dist/app-server/libraries-store.d.ts.map +1 -0
  50. package/dist/app-server/libraries-store.js +61 -0
  51. package/dist/app-server/libraries-store.js.map +1 -0
  52. package/dist/app-server/markdown-reports.d.ts +12 -0
  53. package/dist/app-server/markdown-reports.d.ts.map +1 -0
  54. package/dist/app-server/markdown-reports.js +145 -0
  55. package/dist/app-server/markdown-reports.js.map +1 -0
  56. package/dist/app-server/oauth-debugger-domain.d.ts +230 -0
  57. package/dist/app-server/oauth-debugger-domain.d.ts.map +1 -0
  58. package/dist/app-server/oauth-debugger-domain.js +1098 -0
  59. package/dist/app-server/oauth-debugger-domain.js.map +1 -0
  60. package/dist/app-server/oauth-debugger.d.ts +20 -0
  61. package/dist/app-server/oauth-debugger.d.ts.map +1 -0
  62. package/dist/app-server/oauth-debugger.js +193 -0
  63. package/dist/app-server/oauth-debugger.js.map +1 -0
  64. package/dist/app-server/provider-models.d.ts +8 -0
  65. package/dist/app-server/provider-models.d.ts.map +1 -0
  66. package/dist/app-server/provider-models.js +60 -0
  67. package/dist/app-server/provider-models.js.map +1 -0
  68. package/dist/app-server/result-assistant-domain.d.ts +87 -0
  69. package/dist/app-server/result-assistant-domain.d.ts.map +1 -0
  70. package/dist/app-server/result-assistant-domain.js +212 -0
  71. package/dist/app-server/result-assistant-domain.js.map +1 -0
  72. package/dist/app-server/result-assistant.d.ts +22 -0
  73. package/dist/app-server/result-assistant.d.ts.map +1 -0
  74. package/dist/app-server/result-assistant.js +328 -0
  75. package/dist/app-server/result-assistant.js.map +1 -0
  76. package/dist/app-server/router.d.ts +4 -0
  77. package/dist/app-server/router.d.ts.map +1 -0
  78. package/dist/app-server/router.js +374 -0
  79. package/dist/app-server/router.js.map +1 -0
  80. package/dist/app-server/runs-routes.d.ts +44 -0
  81. package/dist/app-server/runs-routes.d.ts.map +1 -0
  82. package/dist/app-server/runs-routes.js +555 -0
  83. package/dist/app-server/runs-routes.js.map +1 -0
  84. package/dist/app-server/runs-store.d.ts +23 -0
  85. package/dist/app-server/runs-store.d.ts.map +1 -0
  86. package/dist/app-server/runs-store.js +84 -0
  87. package/dist/app-server/runs-store.js.map +1 -0
  88. package/dist/app-server/scenario-assistant-domain.d.ts +162 -0
  89. package/dist/app-server/scenario-assistant-domain.d.ts.map +1 -0
  90. package/dist/app-server/scenario-assistant-domain.js +269 -0
  91. package/dist/app-server/scenario-assistant-domain.js.map +1 -0
  92. package/dist/app-server/scenario-assistant.d.ts +29 -0
  93. package/dist/app-server/scenario-assistant.d.ts.map +1 -0
  94. package/dist/app-server/scenario-assistant.js +246 -0
  95. package/dist/app-server/scenario-assistant.js.map +1 -0
  96. package/dist/app-server/settings-store.d.ts +4 -0
  97. package/dist/app-server/settings-store.d.ts.map +1 -0
  98. package/dist/app-server/settings-store.js +32 -0
  99. package/dist/app-server/settings-store.js.map +1 -0
  100. package/dist/app-server/snapshots-routes.d.ts +24 -0
  101. package/dist/app-server/snapshots-routes.d.ts.map +1 -0
  102. package/dist/app-server/snapshots-routes.js +82 -0
  103. package/dist/app-server/snapshots-routes.js.map +1 -0
  104. package/dist/app-server/static-serving.d.ts +17 -0
  105. package/dist/app-server/static-serving.d.ts.map +1 -0
  106. package/dist/app-server/static-serving.js +64 -0
  107. package/dist/app-server/static-serving.js.map +1 -0
  108. package/dist/app-server/store-utils.d.ts +5 -0
  109. package/dist/app-server/store-utils.d.ts.map +1 -0
  110. package/dist/app-server/store-utils.js +26 -0
  111. package/dist/app-server/store-utils.js.map +1 -0
  112. package/dist/app-server/tool-analysis-domain.d.ts +146 -0
  113. package/dist/app-server/tool-analysis-domain.d.ts.map +1 -0
  114. package/dist/app-server/tool-analysis-domain.js +556 -0
  115. package/dist/app-server/tool-analysis-domain.js.map +1 -0
  116. package/dist/app-server/tool-analysis-storage.d.ts +41 -0
  117. package/dist/app-server/tool-analysis-storage.d.ts.map +1 -0
  118. package/dist/app-server/tool-analysis-storage.js +110 -0
  119. package/dist/app-server/tool-analysis-storage.js.map +1 -0
  120. package/dist/app-server/tool-analysis.d.ts +22 -0
  121. package/dist/app-server/tool-analysis.d.ts.map +1 -0
  122. package/dist/app-server/tool-analysis.js +271 -0
  123. package/dist/app-server/tool-analysis.js.map +1 -0
  124. package/dist/app-server/types.d.ts +28 -0
  125. package/dist/app-server/types.d.ts.map +1 -0
  126. package/dist/app-server/types.js +2 -0
  127. package/dist/app-server/types.js.map +1 -0
  128. package/dist/cli.d.ts +3 -0
  129. package/dist/cli.d.ts.map +1 -0
  130. package/dist/cli.js +544 -0
  131. package/dist/cli.js.map +1 -0
  132. package/dist/snapshot.d.ts +80 -0
  133. package/dist/snapshot.d.ts.map +1 -0
  134. package/dist/snapshot.js +401 -0
  135. package/dist/snapshot.js.map +1 -0
  136. package/package.json +55 -0
package/README.md ADDED
@@ -0,0 +1,762 @@
1
+ # MCPLab 🧪
2
+
3
+ > **Lab for testing Model Context Protocol servers with LLMs**
4
+
5
+ Test how well LLM agents use your MCP tools, compare different models, and track quality over time with automated testing and detailed reports.
6
+
7
+ ---
8
+
9
+ ## What is MCPLab?
10
+
11
+ MCPLab is a testing and evaluation framework for [MCP servers](https://modelcontextprotocol.io). It helps you:
12
+
13
+ - **Validate** that LLM agents correctly use your MCP tools
14
+ - **Compare** different LLMs (Claude, GPT-4, etc.) on the same tasks
15
+ - **Track** tool usage patterns, success rates, and performance metrics
16
+ - **Automate** quality assurance in CI/CD pipelines
17
+ - **Debug** agent behavior with detailed execution traces
18
+
19
+ Perfect for MCP server developers who want to ensure their tools work reliably across different AI models.
20
+
21
+ ---
22
+
23
+ ## ✨ Features
24
+
25
+ ### Core Capabilities
26
+ - **Streamable HTTP Transport** - Test MCP servers over Streamable HTTP (HTTP with SSE streaming)
27
+ - **Multi-LLM Support** - OpenAI, Anthropic Claude, Azure OpenAI
28
+ - **Rich Assertions** - Validate tool usage, sequences, and response content
29
+ - **Variance Testing** - Run multiple iterations to measure stability
30
+ - **Detailed Traces** - JSONL logs of every tool call and LLM response
31
+
32
+ ### Analysis & Reporting
33
+ - **Trend Analysis** - Track pass rates and performance over time
34
+ - **LLM Comparison** - Built-in tools to compare agent behavior
35
+ - **Multiple Outputs** - HTML report, JSON results, Markdown summary, JSONL trace
36
+ - **Custom Metrics** - Extract values and track domain-specific KPIs
37
+ - **Markdown Reports** - Store and browse custom analysis notes alongside runs
38
+
39
+ ### AI-Powered Tools (App Mode)
40
+ - **Scenario Assistant** - AI chat to help design and refine eval scenarios
41
+ - **Result Assistant** - AI chat to analyze and explain completed run results
42
+ - **MCP Tool Analysis** - Automated review of MCP tool quality and safety
43
+
44
+ ### Developer Experience
45
+ - **Watch Mode** - Auto-rerun tests when configs change
46
+ - **YAML Configuration** - Declarative, version-controllable eval specs
47
+ - **Interactive Reports** - Self-contained HTML with filtering and drill-down
48
+ - **Multi-Agent Testing** - Compare LLMs with a single CLI flag
49
+ - **Scenario Isolation** - Run specific tests or full suites
50
+
51
+ ---
52
+
53
+ ## 🚀 Quick Start
54
+
55
+ ### 1. Install
56
+
57
+ ```bash
58
+ npx @inspectr/mcplab --help
59
+ ```
60
+
61
+ Or install globally:
62
+
63
+ ```bash
64
+ npm install -g @inspectr/mcplab
65
+ ```
66
+
67
+ ### 2. Set up environment
68
+
69
+ ```bash
70
+ cp .env.example .env
71
+ # Edit .env and add your API keys:
72
+ # ANTHROPIC_API_KEY=sk-ant-...
73
+ # OPENAI_API_KEY=sk-...
74
+ ```
75
+
76
+ ### 3. Run your first evaluation
77
+
78
+ ```bash
79
+ # Run the app (frontend + local API bridge)
80
+ npx @inspectr/mcplab app --open
81
+
82
+ # Run an evaluation from a config file
83
+ npx @inspectr/mcplab run -c mcplab/evals/eval.yaml
84
+
85
+ # View the results
86
+ open mcplab/results/evaluation-runs/$(ls -t mcplab/results/evaluation-runs | head -1)/report.html
87
+ ```
88
+
89
+ ### 4. Create your own test
90
+
91
+ Create `my-eval.yaml`:
92
+
93
+ ```yaml
94
+ servers:
95
+ - id: my-server
96
+ transport: "http"
97
+ url: "http://localhost:3000/mcp"
98
+
99
+ agents:
100
+ - id: claude
101
+ provider: "anthropic"
102
+ model: "claude-haiku-4-5-20251001"
103
+ temperature: 0
104
+ max_tokens: 2048
105
+
106
+ scenarios:
107
+ - id: "basic-test"
108
+ servers: ["my-server"]
109
+ prompt: "Use the available tools to complete this task..."
110
+ eval:
111
+ tool_constraints:
112
+ required_tools: ["my_tool"]
113
+ response_assertions:
114
+ - type: "regex"
115
+ pattern: "success|completed"
116
+ ```
117
+
118
+ Run it:
119
+
120
+ ```bash
121
+ mcplab run -c my-eval.yaml
122
+ ```
123
+
124
+ ---
125
+
126
+ ## 📖 Configuration Guide
127
+
128
+ ### Structure Overview
129
+
130
+ Add this at the top of your eval file for editor validation/autocomplete:
131
+
132
+ ```yaml
133
+ # yaml-language-server: $schema=./config-schema.json
134
+ ```
135
+
136
+ ```yaml
137
+ servers: # MCP servers to test against
138
+ - id: local-server
139
+ transport: "http"
140
+ url: "http://localhost:3000/mcp"
141
+ - ref: "shared-server"
142
+
143
+ agents: # LLM agents to use for testing
144
+ - id: local-agent
145
+ provider: "anthropic"
146
+ model: "claude-sonnet-4-6"
147
+ - ref: "claude-sonnet-46"
148
+
149
+ scenarios: # Test scenarios to run
150
+ - id: "basic-test"
151
+ servers: ["local-server"]
152
+ prompt: "..."
153
+ - ref: "scn-shared-basic"
154
+ ```
155
+
156
+ ### Servers
157
+
158
+ Define MCP servers with connection details and authentication:
159
+
160
+ ```yaml
161
+ servers:
162
+ - id: my-server
163
+ transport: "http"
164
+ url: "https://api.example.com/mcp"
165
+ auth:
166
+ type: "bearer" # or "oauth_client_credentials"
167
+ env: "MCP_TOKEN" # Environment variable name
168
+ ```
169
+
170
+ **Authentication types:**
171
+
172
+ **Bearer Token:**
173
+ ```yaml
174
+ auth:
175
+ type: "bearer"
176
+ env: "MCP_TOKEN" # Reads from process.env.MCP_TOKEN
177
+ ```
178
+
179
+ **OAuth Client Credentials:**
180
+ ```yaml
181
+ auth:
182
+ type: "oauth_client_credentials"
183
+ token_url: "https://auth.example.com/token"
184
+ client_id_env: "CLIENT_ID"
185
+ client_secret_env: "CLIENT_SECRET"
186
+ scope: "read:data" # Optional
187
+ audience: "https://api.example.com" # Optional
188
+ ```
189
+
190
+ ### Agents
191
+
192
+ Configure LLM agents with provider-specific settings:
193
+
194
+ **Anthropic (Claude):**
195
+ ```yaml
196
+ agents:
197
+ - id: claude-sonnet
198
+ provider: "anthropic"
199
+ model: "claude-sonnet-4-6"
200
+ temperature: 0
201
+ max_tokens: 2048
202
+ system: "You are a helpful assistant."
203
+ ```
204
+
205
+ **OpenAI (ChatGPT):**
206
+ ```yaml
207
+ agents:
208
+ - id: gpt-4
209
+ provider: "openai"
210
+ model: "gpt-4o-mini"
211
+ temperature: 0
212
+ max_tokens: 2048
213
+ system: "You are a helpful assistant."
214
+ ```
215
+
216
+ **Azure OpenAI:**
217
+ ```yaml
218
+ agents:
219
+ - id: azure-gpt
220
+ provider: "azure_openai"
221
+ model: "gpt-4o" # Deployment name
222
+ temperature: 0
223
+ max_tokens: 2048
224
+ system: "You are a helpful assistant."
225
+ ```
226
+
227
+ **Required environment variables:**
228
+ - Anthropic: `ANTHROPIC_API_KEY`
229
+ - OpenAI: `OPENAI_API_KEY`
230
+ - Azure: `AZURE_OPENAI_ENDPOINT`, `AZURE_OPENAI_API_KEY`, `AZURE_OPENAI_DEPLOYMENT`
231
+
232
+ ### Scenarios
233
+
234
+ Define test scenarios with prompts and evaluation criteria:
235
+
236
+ ```yaml
237
+ scenarios:
238
+ - id: "search-and-analyze"
239
+ servers: ["my-server"]
240
+ prompt: |
241
+ Search for items matching criteria X,
242
+ then analyze the results and provide insights.
243
+
244
+ eval:
245
+ # Validate tool usage
246
+ tool_constraints:
247
+ required_tools: ["search", "analyze"]
248
+ forbidden_tools: ["delete"]
249
+
250
+ # Validate tool sequence
251
+ tool_sequence:
252
+ allow:
253
+ - ["search", "analyze"]
254
+ - ["search", "search", "analyze"]
255
+
256
+ # Validate response content
257
+ response_assertions:
258
+ - type: "regex"
259
+ pattern: "found \\d+ items"
260
+ - type: "jsonpath"
261
+ path: "$.summary.count"
262
+ equals: 10
263
+
264
+ # Extract metrics
265
+ extract:
266
+ - name: "item_count"
267
+ from: "final_text"
268
+ regex: "found (?<value>\\d+) items"
269
+
270
+ run_defaults:
271
+ selected_agents:
272
+ - claude-sonnet
273
+ ```
274
+
275
+ **Evaluation options:**
276
+
277
+ - **`tool_constraints`** - Which tools must/must not be used
278
+ - `required_tools`: Tools that must be called
279
+ - `forbidden_tools`: Tools that must not be called
280
+
281
+ - **`tool_sequence`** - Valid sequences of tool calls
282
+ - `allow`: List of allowed sequences (e.g., `[["search", "analyze"]]`)
283
+
284
+ - **`response_assertions`** - Validate the final response
285
+ - `regex`: Pattern matching on response text
286
+ - `jsonpath`: Query and validate JSON responses
287
+
288
+ - **`extract`** - Extract metrics from responses
289
+ - Capture values using regex named groups: `(?<value>...)`
290
+
291
+ ---
292
+
293
+ ## 💡 Usage Examples
294
+
295
+ ### Basic Usage
296
+
297
+ ```bash
298
+ # Run all scenarios
299
+ mcplab run -c mcplab/evals/eval.yaml
300
+
301
+ # Run specific scenario
302
+ mcplab run -c mcplab/evals/eval.yaml -s basic-test
303
+
304
+ # Run with variance testing (5 iterations)
305
+ mcplab run -c mcplab/evals/eval.yaml -n 5
306
+ ```
307
+
308
+ ### App Mode
309
+
310
+ Serve the web app and local API in one process:
311
+
312
+ ```bash
313
+ mcplab app --open
314
+ ```
315
+
316
+ Optional custom paths:
317
+
318
+ ```bash
319
+ mcplab app --evals-dir mcplab/evals --runs-dir mcplab/results/evaluation-runs --port 8787 --open
320
+ ```
321
+
322
+ Optional development mode (proxy frontend to Vite, keep API local):
323
+
324
+ ```bash
325
+ mcplab app --dev
326
+ ```
327
+
328
+ ### Multi-LLM Testing
329
+
330
+ Compare how different LLMs perform on the same tasks:
331
+
332
+ ```bash
333
+ # Test with multiple agents
334
+ mcplab run -c examples/eval.yaml \
335
+ --agents claude-haiku,gpt-4o-mini,gpt-4o
336
+
337
+ # This runs each scenario with each agent automatically
338
+ # 3 scenarios × 3 agents = 9 tests
339
+
340
+ # Compare results
341
+ node scripts/compare-llm-results.mjs mcplab/results/evaluation-runs/LATEST/results.json
342
+ ```
343
+
344
+ Output:
345
+ ```
346
+ 📊 LLM Performance Comparison
347
+
348
+ LLM | Pass Rate | Avg Tools/Run | Avg Duration (ms)
349
+ -----------------|-----------|---------------|------------------
350
+ claude-haiku | 100.0% | 2.5 | 850
351
+ gpt-4o-mini | 88.9% | 2.8 | 950
352
+ gpt-4o | 88.9% | 3.2 | 1200
353
+
354
+ 💡 Key Insights
355
+ • Highest Pass Rate: claude-haiku (100.0%)
356
+ • Fastest: claude-haiku (850ms avg)
357
+ • Most Efficient: claude-haiku (2.5 tools/run)
358
+ ```
359
+
360
+ ### Watch Mode
361
+
362
+ Auto-rerun tests when config changes:
363
+
364
+ ```bash
365
+ mcplab watch -c examples/eval.yaml
366
+
367
+ # With multi-agent testing
368
+ mcplab watch -c examples/eval.yaml \
369
+ --agents claude-haiku,gpt-4o-mini
370
+ ```
371
+
372
+ ### Snapshot Baselines
373
+
374
+ Create a smart baseline from a fully passing run, then compare later runs against it:
375
+
376
+ ```bash
377
+ # Create a snapshot (source run must be fully passing)
378
+ mcplab snapshot create --run 20260208-140213 --name "weather-api-baseline-v1"
379
+
380
+ # List snapshots
381
+ mcplab snapshot list
382
+
383
+ # Compare run against snapshot
384
+ mcplab snapshot compare --id <snapshotId> --run 20260208-150045
385
+ ```
386
+
387
+ Optional: compare immediately after a run:
388
+
389
+ ```bash
390
+ mcplab run -c mcplab/evals/eval.yaml --compare-snapshot <snapshotId>
391
+ ```
392
+
393
+ Config-first snapshot eval workflow:
394
+
395
+ ```bash
396
+ # Initialize snapshot eval policy in a config from a fully passing run
397
+ mcplab snapshot eval-init --config mcplab/evals/eval.yaml --run 20260208-140213 --name "baseline-v1"
398
+
399
+ # Update snapshot eval policy mode
400
+ mcplab snapshot eval-policy --config mcplab/evals/eval.yaml --enabled true --mode fail_on_drift
401
+
402
+ # Apply config snapshot policy during run (warn or fail_on_drift)
403
+ mcplab run -c mcplab/evals/eval.yaml --snapshot-eval
404
+ ```
405
+
406
+ ### Generate Reports
407
+
408
+ ```bash
409
+ # Regenerate HTML report from previous run
410
+ mcplab report --input mcplab/results/evaluation-runs/20260206-212239
411
+ ```
412
+
413
+ ---
414
+
415
+ ## 🤖 AI-Powered Features
416
+
417
+ These features are available through the web app (`mcplab app`).
418
+
419
+ ### Scenario Assistant
420
+
421
+ An interactive AI chat that helps you design and refine evaluation scenarios. Given a scenario, it can suggest improvements to the prompt, evaluation rules, and extraction patterns — and can call your MCP server's tools directly to demonstrate expected behavior.
422
+
423
+ Open the app, navigate to an eval, and open the **Scenario Assistant** panel on any scenario.
424
+
425
+ ### Result Assistant
426
+
427
+ An AI chat that analyzes completed evaluation runs. Ask it to explain failures, identify patterns across scenarios, or summarize what went wrong in a specific run. It has read-only access to run artifacts, traces, and results.
428
+
429
+ Open a run in the app and click **Result Assistant**.
430
+
431
+ ### MCP Tool Analysis
432
+
433
+ Automated quality review of your MCP server's tools. Connects to your server, discovers all tools, and produces a report covering:
434
+
435
+ - Name and description quality
436
+ - Schema completeness
437
+ - Safety classification (read-like vs. potentially destructive)
438
+ - Sample call behavior (optional — runs real calls against your server)
439
+
440
+ Reports are saved to `mcplab/results/tool-analysis/` and viewable in the app.
441
+
442
+ Navigate to **Tool Analysis** in the app sidebar to start an analysis job.
443
+
444
+ ### Markdown Reports
445
+
446
+ Store and browse custom analysis notes, comparison docs, or generated reports alongside your eval runs. Place `.md` files in `mcplab/reports/` and they become accessible in the app under **Reports**.
447
+
448
+ ---
449
+
450
+ ## 📚 Reusable Configurations
451
+
452
+ Define servers, agents, and scenarios once and reuse them across multiple eval files.
453
+
454
+ ```
455
+ mcplab/
456
+ ├── servers.yaml # Shared MCP server definitions
457
+ ├── agents.yaml # Shared LLM agent definitions
458
+ └── scenarios/
459
+ ├── scenario-a.yaml
460
+ └── scenario-b.yaml
461
+ ```
462
+
463
+ Reference library items in eval configs:
464
+
465
+ ```yaml
466
+ servers:
467
+ - ref: "my-server" # from servers.yaml
468
+ agents:
469
+ - ref: "claude-sonnet" # from agents.yaml
470
+ scenarios:
471
+ - ref: "scenario-a" # from scenarios/scenario-a.yaml
472
+ ```
473
+
474
+ Libraries can be managed through the app's **Libraries** page.
475
+
476
+ ---
477
+
478
+ ## 📂 Output Structure
479
+
480
+ Each evaluation run creates a timestamped directory:
481
+
482
+ ```
483
+ mcplab/results/evaluation-runs/20260206-212239/
484
+ ├── trace.jsonl # Detailed execution log (every tool call, LLM response)
485
+ ├── results.json # Structured results (pass/fail, metrics, aggregates)
486
+ ├── summary.md # Human-readable summary table
487
+ └── report.html # Interactive HTML report (self-contained)
488
+ ```
489
+
490
+ Other output directories:
491
+
492
+ ```
493
+ mcplab/
494
+ ├── evals/ # Eval definition YAML files
495
+ ├── results/
496
+ │ ├── evaluation-runs/ # Run artifacts
497
+ │ └── tool-analysis/ # Saved tool analysis reports
498
+ ├── snapshots/ # Snapshot baselines
499
+ ├── reports/ # Custom markdown reports
500
+ ├── servers.yaml # Library: shared server definitions
501
+ ├── agents.yaml # Library: shared agent definitions
502
+ └── scenarios/ # Library: shared scenario files
503
+ ```
504
+
505
+ ### Trace Format (JSONL)
506
+
507
+ ```jsonl
508
+ {"type":"run_started","run_id":"...","ts":"2026-02-06T20:03:54.585Z"}
509
+ {"type":"scenario_started","scenario_id":"search-tags","agent":"claude-haiku","ts":"..."}
510
+ {"type":"llm_request","messages_summary":"user:Search for tags...","ts":"..."}
511
+ {"type":"llm_response","raw_or_summary":"tool_calls:search_tags","ts":"..."}
512
+ {"type":"tool_call","server":"demo","tool":"search_tags","args":{...},"ts_start":"..."}
513
+ {"type":"tool_result","server":"demo","tool":"search_tags","ok":true,"result_summary":"...","ts_end":"...","duration_ms":1114}
514
+ {"type":"final_answer","text":"Found 42 tags matching...","ts":"..."}
515
+ {"type":"scenario_finished","scenario_id":"search-tags","pass":true,"metrics":{...},"ts":"..."}
516
+ ```
517
+
518
+ ### Results Format (JSON)
519
+
520
+ ```json
521
+ {
522
+ "metadata": {
523
+ "run_id": "20260206-212239",
524
+ "timestamp": "2026-02-06T20:22:39.000Z",
525
+ "config_hash": "abc123...",
526
+ "git_commit": "def456..."
527
+ },
528
+ "summary": {
529
+ "total_scenarios": 8,
530
+ "total_runs": 8,
531
+ "pass_rate": 1.0,
532
+ "avg_tool_calls_per_run": 2.5,
533
+ "avg_tool_latency_ms": 950
534
+ },
535
+ "scenarios": [...]
536
+ }
537
+ ```
538
+
539
+ ---
540
+
541
+ ## 🎓 Real-World Examples
542
+
543
+ ### Example 1: Weather MCP Server
544
+
545
+ Test a weather data MCP server:
546
+
547
+ ```bash
548
+ # Run comprehensive test suite (9 scenarios)
549
+ mcplab run -c examples/eval-weather-comprehensive.yaml
550
+
551
+ # Test a specific scenario
552
+ mcplab run -c examples/eval-weather-comprehensive.yaml \
553
+ -s forecast-accuracy
554
+
555
+ # Compare Claude vs GPT-4 on all scenarios
556
+ mcplab run -c examples/eval-weather-simple.yaml \
557
+ --agents claude-haiku,gpt-4o-mini
558
+ ```
559
+
560
+ **Included scenarios:**
561
+ - Current conditions lookup
562
+ - Multi-day forecast retrieval
563
+ - Location search and resolution
564
+ - Severe weather alerts
565
+ - Historical data queries
566
+ - Unit conversion (metric/imperial)
567
+
568
+ ### Example 2: Multi-Agent Comparison
569
+
570
+ Create `multi-agent-eval.yaml` with several agents defined, but only one selected by default in `run_defaults`:
571
+
572
+ ```yaml
573
+ agents:
574
+ - id: claude-haiku
575
+ provider: anthropic
576
+ model: claude-haiku-4-5-20251001
577
+ - id: gpt-4o-mini
578
+ provider: openai
579
+ model: gpt-4o-mini
580
+ - id: gpt-4o
581
+ provider: openai
582
+ model: gpt-4o
583
+
584
+ scenarios:
585
+ - id: "complex-task"
586
+ prompt: "..."
587
+
588
+ run_defaults:
589
+ selected_agents:
590
+ - claude-haiku
591
+ ```
592
+
593
+ Run with all agents:
594
+
595
+ ```bash
596
+ mcplab run -c multi-agent-eval.yaml \
597
+ --agents claude-haiku,gpt-4o-mini,gpt-4o \
598
+ -n 5
599
+
600
+ # 1 scenario × 3 agents × 5 runs = 15 tests
601
+ ```
602
+
603
+ ### Example 3: CI/CD Integration
604
+
605
+ Add to `.github/workflows/mcp-eval.yml`:
606
+
607
+ ```yaml
608
+ name: MCP Evaluation
609
+
610
+ on: [push, pull_request]
611
+
612
+ jobs:
613
+ evaluate:
614
+ runs-on: ubuntu-latest
615
+ steps:
616
+ - uses: actions/checkout@v4
617
+ - uses: actions/setup-node@v4
618
+ with:
619
+ node-version: '22'
620
+
621
+ - run: npm install
622
+ - run: npm run build
623
+
624
+ - name: Run evaluations
625
+ run: mcplab run -c examples/eval.yaml -n 3
626
+ env:
627
+ ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
628
+ OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
629
+
630
+ - name: Upload results
631
+ uses: actions/upload-artifact@v4
632
+ with:
633
+ name: evaluation-results
634
+ path: mcplab/results/evaluation-runs/
635
+ ```
636
+
637
+ ---
638
+
639
+ ## 🛠️ Advanced Features
640
+
641
+ ### Custom Analysis Scripts
642
+
643
+ Analyze results with custom logic:
644
+
645
+ ```javascript
646
+ // my-analysis.mjs
647
+ import { readFileSync } from 'fs';
648
+
649
+ const results = JSON.parse(readFileSync('mcplab/results/evaluation-runs/LATEST/results.json'));
650
+
651
+ // Calculate custom metrics
652
+ for (const scenario of results.scenarios) {
653
+ const efficiency = scenario.pass_rate / scenario.runs[0].tool_call_count;
654
+ console.log(`${scenario.scenario_id}: ${efficiency.toFixed(2)} success/tool`);
655
+ }
656
+ ```
657
+
658
+ ### Generate Multi-LLM Configs
659
+
660
+ Auto-generate multi-agent configs:
661
+
662
+ ```bash
663
+ # Creates eval-weather-multi-llm.yaml
664
+ node scripts/generate-multi-llm-config.mjs examples/eval-weather.yaml
665
+ ```
666
+
667
+ ### Compare LLM Performance
668
+
669
+ Built-in comparison script:
670
+
671
+ ```bash
672
+ node scripts/compare-llm-results.mjs mcplab/results/evaluation-runs/20260206-212239/results.json
673
+ ```
674
+
675
+ Shows:
676
+ - Pass rates by LLM
677
+ - Tool usage efficiency
678
+ - Response times
679
+ - Scenario-by-scenario breakdown
680
+
681
+ ---
682
+
683
+ ## 🔧 Development
684
+
685
+ ### Project Structure
686
+
687
+ ```
688
+ mcp-evaluation/
689
+ ├── packages/
690
+ │ ├── cli/ # CLI tool (run, watch, report, app commands)
691
+ │ ├── app/ # Web frontend (React)
692
+ │ ├── core/ # Evaluation engine, agent adapters, MCP client
693
+ │ └── reporting/ # HTML report generation
694
+ ├── examples/ # Example evaluation configs
695
+ ├── scripts/ # Utility scripts (multi-LLM, comparison)
696
+ ├── mcplab/results/ # Evaluation results + analysis (gitignored)
697
+ └── .claude/ # Claude Code skills (optional)
698
+ ```
699
+
700
+ ### Run in Development Mode
701
+
702
+ ```bash
703
+ # Build all packages
704
+ npm run build
705
+
706
+ # Run CLI directly with tsx (no build needed)
707
+ npm run dev -- app --dev
708
+
709
+ # Or run just the frontend in watch mode
710
+ npm run app:dev:ui
711
+ ```
712
+
713
+ ### Run Tests
714
+
715
+ ```bash
716
+ npm test
717
+ ```
718
+
719
+ ---
720
+
721
+ ## 🤝 Contributing
722
+
723
+ Contributions welcome! Please:
724
+
725
+ 1. Fork the repository
726
+ 2. Create a feature branch (`git checkout -b feature/amazing-feature`)
727
+ 3. Commit your changes (`git commit -m 'Add amazing feature'`)
728
+ 4. Push to the branch (`git push origin feature/amazing-feature`)
729
+ 5. Open a Pull Request
730
+
731
+ ---
732
+
733
+ ## License
734
+
735
+ MIT License - see [LICENSE](LICENSE) for details.
736
+
737
+ ---
738
+
739
+ ## Acknowledgments
740
+
741
+ Built with:
742
+ - [Model Context Protocol SDK](https://github.com/modelcontextprotocol/typescript-sdk)
743
+ - [Anthropic SDK](https://github.com/anthropics/anthropic-sdk-typescript)
744
+ - [OpenAI SDK](https://github.com/openai/openai-node)
745
+
746
+ ---
747
+
748
+ ## Support
749
+
750
+ - [Issue Tracker](https://github.com/inspectr-hq/mcplab/issues)
751
+ - [Discussions](https://github.com/inspectr-hq/mcplab/discussions)
752
+ - [MCP Protocol](https://modelcontextprotocol.io)
753
+
754
+ ---
755
+
756
+ <div align="center">
757
+
758
+ **⭐ Star this repo if you find it useful!**
759
+
760
+ Made with ❤️ by [Inspectr](https://inspectr.dev) for the MCP community
761
+
762
+ </div>