mcp-wordpress 1.5.2 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (190) hide show
  1. package/README.md +332 -61
  2. package/dist/cache/CacheInvalidation.d.ts.map +1 -1
  3. package/dist/cache/CacheInvalidation.js +4 -4
  4. package/dist/cache/CacheInvalidation.js.map +1 -1
  5. package/dist/client/MockWordPressClient.d.ts +55 -0
  6. package/dist/client/MockWordPressClient.d.ts.map +1 -0
  7. package/dist/client/MockWordPressClient.js +369 -0
  8. package/dist/client/MockWordPressClient.js.map +1 -0
  9. package/dist/client/api.d.ts +1 -0
  10. package/dist/client/api.d.ts.map +1 -1
  11. package/dist/client/api.js +26 -60
  12. package/dist/client/api.js.map +1 -1
  13. package/dist/client/managers/AuthenticationManager.d.ts.map +1 -1
  14. package/dist/client/managers/AuthenticationManager.js +4 -3
  15. package/dist/client/managers/AuthenticationManager.js.map +1 -1
  16. package/dist/config/ConfigurationSchema.d.ts +3 -3
  17. package/dist/config/ConfigurationSchema.d.ts.map +1 -1
  18. package/dist/config/ConfigurationSchema.js +7 -24
  19. package/dist/config/ConfigurationSchema.js.map +1 -1
  20. package/dist/config/ServerConfiguration.d.ts +8 -0
  21. package/dist/config/ServerConfiguration.d.ts.map +1 -1
  22. package/dist/config/ServerConfiguration.js +80 -31
  23. package/dist/config/ServerConfiguration.js.map +1 -1
  24. package/dist/docs/DocumentationGenerator.d.ts.map +1 -1
  25. package/dist/docs/DocumentationGenerator.js +5 -7
  26. package/dist/docs/DocumentationGenerator.js.map +1 -1
  27. package/dist/index.d.ts.map +1 -1
  28. package/dist/index.js +33 -29
  29. package/dist/index.js.map +1 -1
  30. package/dist/security/InputValidator.d.ts.map +1 -1
  31. package/dist/security/InputValidator.js +3 -11
  32. package/dist/security/InputValidator.js.map +1 -1
  33. package/dist/server/ToolRegistry.d.ts +4 -0
  34. package/dist/server/ToolRegistry.d.ts.map +1 -1
  35. package/dist/server/ToolRegistry.js +71 -8
  36. package/dist/server/ToolRegistry.js.map +1 -1
  37. package/dist/tools/auth.d.ts.map +1 -1
  38. package/dist/tools/auth.js +8 -3
  39. package/dist/tools/auth.js.map +1 -1
  40. package/dist/tools/posts.d.ts.map +1 -1
  41. package/dist/tools/posts.js +287 -20
  42. package/dist/tools/posts.js.map +1 -1
  43. package/dist/tools/site.d.ts.map +1 -1
  44. package/dist/tools/site.js +47 -9
  45. package/dist/tools/site.js.map +1 -1
  46. package/dist/tools/users.d.ts.map +1 -1
  47. package/dist/tools/users.js +113 -10
  48. package/dist/tools/users.js.map +1 -1
  49. package/dist/utils/enhancedError.d.ts +61 -0
  50. package/dist/utils/enhancedError.d.ts.map +1 -0
  51. package/dist/utils/enhancedError.js +221 -0
  52. package/dist/utils/enhancedError.js.map +1 -0
  53. package/dist/utils/streaming.d.ts +104 -0
  54. package/dist/utils/streaming.d.ts.map +1 -0
  55. package/dist/utils/streaming.js +312 -0
  56. package/dist/utils/streaming.js.map +1 -0
  57. package/dist/utils/validation.d.ts +19 -3
  58. package/dist/utils/validation.d.ts.map +1 -1
  59. package/dist/utils/validation.js +174 -24
  60. package/dist/utils/validation.js.map +1 -1
  61. package/docs/ARCHITECTURE.md +850 -0
  62. package/docs/CACHING.md +20 -17
  63. package/docs/CONFIGURATION.md +660 -0
  64. package/docs/DOCKER.md +61 -60
  65. package/docs/EVALUATION.md +397 -0
  66. package/docs/INSTALLATION.md +423 -0
  67. package/docs/PERFORMANCE_MONITORING.md +17 -15
  68. package/docs/SECURITY.md +621 -0
  69. package/docs/SECURITY_TESTING.md +22 -26
  70. package/docs/TEST_SITE_SETUP.md +136 -0
  71. package/docs/TROUBLESHOOTING.md +578 -0
  72. package/docs/api/README.md +76 -91
  73. package/docs/api/categories/auth.md +0 -2
  74. package/docs/api/categories/cache.md +0 -2
  75. package/docs/api/categories/comment.md +0 -2
  76. package/docs/api/categories/media.md +0 -2
  77. package/docs/api/categories/page.md +0 -2
  78. package/docs/api/categories/performance.md +0 -2
  79. package/docs/api/categories/post.md +0 -2
  80. package/docs/api/categories/site.md +0 -2
  81. package/docs/api/categories/taxonomy.md +0 -2
  82. package/docs/api/categories/user.md +0 -2
  83. package/docs/api/summary.json +1 -1
  84. package/docs/api/tools/wp_approve_comment.md +11 -3
  85. package/docs/api/tools/wp_cache_clear.md +14 -5
  86. package/docs/api/tools/wp_cache_info.md +14 -5
  87. package/docs/api/tools/wp_cache_stats.md +14 -5
  88. package/docs/api/tools/wp_cache_warm.md +14 -5
  89. package/docs/api/tools/wp_create_application_password.md +11 -3
  90. package/docs/api/tools/wp_create_category.md +11 -3
  91. package/docs/api/tools/wp_create_comment.md +14 -5
  92. package/docs/api/tools/wp_create_page.md +13 -5
  93. package/docs/api/tools/wp_create_post.md +14 -7
  94. package/docs/api/tools/wp_create_tag.md +11 -3
  95. package/docs/api/tools/wp_create_user.md +13 -5
  96. package/docs/api/tools/wp_delete_application_password.md +11 -3
  97. package/docs/api/tools/wp_delete_category.md +11 -3
  98. package/docs/api/tools/wp_delete_comment.md +11 -3
  99. package/docs/api/tools/wp_delete_media.md +10 -3
  100. package/docs/api/tools/wp_delete_page.md +10 -3
  101. package/docs/api/tools/wp_delete_post.md +11 -5
  102. package/docs/api/tools/wp_delete_tag.md +11 -3
  103. package/docs/api/tools/wp_delete_user.md +10 -3
  104. package/docs/api/tools/wp_get_application_passwords.md +11 -3
  105. package/docs/api/tools/wp_get_auth_status.md +11 -3
  106. package/docs/api/tools/wp_get_category.md +11 -3
  107. package/docs/api/tools/wp_get_comment.md +11 -3
  108. package/docs/api/tools/wp_get_current_user.md +11 -3
  109. package/docs/api/tools/wp_get_media.md +11 -3
  110. package/docs/api/tools/wp_get_page.md +11 -3
  111. package/docs/api/tools/wp_get_page_revisions.md +11 -3
  112. package/docs/api/tools/wp_get_post.md +12 -5
  113. package/docs/api/tools/wp_get_post_revisions.md +11 -3
  114. package/docs/api/tools/wp_get_site_settings.md +10 -3
  115. package/docs/api/tools/wp_get_tag.md +11 -3
  116. package/docs/api/tools/wp_get_user.md +11 -3
  117. package/docs/api/tools/wp_list_categories.md +11 -3
  118. package/docs/api/tools/wp_list_comments.md +11 -3
  119. package/docs/api/tools/wp_list_media.md +14 -5
  120. package/docs/api/tools/wp_list_pages.md +14 -5
  121. package/docs/api/tools/wp_list_posts.md +15 -7
  122. package/docs/api/tools/wp_list_tags.md +11 -3
  123. package/docs/api/tools/wp_list_users.md +11 -3
  124. package/docs/api/tools/wp_performance_alerts.md +17 -7
  125. package/docs/api/tools/wp_performance_benchmark.md +17 -7
  126. package/docs/api/tools/wp_performance_export.md +17 -7
  127. package/docs/api/tools/wp_performance_history.md +17 -7
  128. package/docs/api/tools/wp_performance_optimize.md +17 -7
  129. package/docs/api/tools/wp_performance_stats.md +17 -7
  130. package/docs/api/tools/wp_search_site.md +11 -3
  131. package/docs/api/tools/wp_spam_comment.md +11 -3
  132. package/docs/api/tools/wp_switch_auth_method.md +14 -5
  133. package/docs/api/tools/wp_test_auth.md +11 -3
  134. package/docs/api/tools/wp_update_category.md +11 -3
  135. package/docs/api/tools/wp_update_comment.md +14 -5
  136. package/docs/api/tools/wp_update_media.md +14 -5
  137. package/docs/api/tools/wp_update_page.md +13 -5
  138. package/docs/api/tools/wp_update_post.md +14 -7
  139. package/docs/api/tools/wp_update_site_settings.md +14 -5
  140. package/docs/api/tools/wp_update_tag.md +11 -3
  141. package/docs/api/tools/wp_update_user.md +13 -5
  142. package/docs/api/tools/wp_upload_media.md +13 -5
  143. package/docs/api/types/WordPressPost.md +2 -0
  144. package/docs/code-improvements.md +40 -0
  145. package/docs/contract-testing.md +1 -1
  146. package/docs/developer/API_REFERENCE.md +19 -59
  147. package/docs/developer/ARCHITECTURE.md +8 -11
  148. package/docs/developer/BUILD_SYSTEM.md +2 -2
  149. package/docs/developer/CONTRIBUTING.md +3 -5
  150. package/docs/developer/GITHUB_ACTIONS_SETUP.md +2 -2
  151. package/docs/developer/MIGRATION_GUIDE.md +5 -6
  152. package/docs/developer/README.md +2 -1
  153. package/docs/developer/REFACTORING.md +9 -15
  154. package/docs/developer/RELEASE_PROCESS.md +4 -3
  155. package/docs/developer/TESTING.md +2 -2
  156. package/docs/examples/claude-desktop-config.md +8 -0
  157. package/docs/integrations/claude-desktop.md +426 -0
  158. package/docs/integrations/cline.md +537 -0
  159. package/docs/integrations/vs-code.md +515 -0
  160. package/docs/releases/COMMUNITY_ANNOUNCEMENT_v1.1.2.md +30 -23
  161. package/docs/releases/RELEASE_NOTES_v1.1.2.md +7 -6
  162. package/docs/testing-configurations.md +11 -0
  163. package/docs/user-guides/DOCKER_NPM_DTX_SETUP.md +3 -2
  164. package/docs/user-guides/DOCKER_SETUP.md +3 -2
  165. package/docs/user-guides/DTX_SETUP.md +6 -5
  166. package/docs/user-guides/DXT_INSTALLATION.md +4 -4
  167. package/docs/user-guides/NPM_SETUP.md +4 -2
  168. package/docs/user-guides/NPX_SETUP.md +4 -2
  169. package/docs/user-guides/SMITHERY_SETUP.md +402 -0
  170. package/docs/wordpress-rest-api-authentication-troubleshooting.md +45 -42
  171. package/package.json +12 -2
  172. package/src/cache/CacheInvalidation.ts +7 -18
  173. package/src/client/MockWordPressClient.ts +398 -0
  174. package/src/client/api.ts +77 -237
  175. package/src/client/managers/AuthenticationManager.ts +19 -56
  176. package/src/config/ConfigurationSchema.ts +14 -45
  177. package/src/config/ServerConfiguration.ts +98 -71
  178. package/src/docs/DocumentationGenerator.ts +39 -123
  179. package/src/dxt-entry.cjs +4 -1
  180. package/src/index.ts +35 -54
  181. package/src/security/InputValidator.ts +15 -57
  182. package/src/server/ToolRegistry.ts +88 -17
  183. package/src/tools/auth.ts +15 -22
  184. package/src/tools/posts.ts +347 -64
  185. package/src/tools/site.ts +69 -46
  186. package/src/tools/users.ts +142 -44
  187. package/src/utils/enhancedError.ts +248 -0
  188. package/src/utils/streaming.ts +428 -0
  189. package/src/utils/validation.ts +253 -92
  190. package/dist/mcp-wordpress-1.5.2.tgz +0 -0
package/docs/DOCKER.md CHANGED
@@ -1,10 +1,10 @@
1
1
  # Docker Deployment Guide (Legacy)
2
2
 
3
- ![Docker](https://img.shields.io/badge/Docker-ready-blue)
4
- ![Version](https://img.shields.io/badge/version-1.2.0-green)
3
+ ![Docker](https://img.shields.io/badge/Docker-ready-blue) ![Version](https://img.shields.io/badge/version-1.2.0-green)
5
4
  ![Security](https://img.shields.io/badge/security-hardened-brightgreen)
6
5
 
7
- > **📖 New Users**: For Claude Desktop MCP integration, see the **[Docker Setup Guide](user-guides/DOCKER_SETUP.md)** instead.
6
+ > **📖 New Users**: For Claude Desktop MCP integration, see the **[Docker Setup Guide](user-guides/DOCKER_SETUP.md)**
7
+ > instead.
8
8
 
9
9
  This guide covers advanced Docker deployment scenarios for production and development environments.
10
10
 
@@ -23,7 +23,8 @@ docker run -d \
23
23
  docdyhr/mcp-wordpress:latest
24
24
  ```
25
25
 
26
- **⚠️ Claude Desktop Users**: Do NOT use `-d` flag with Claude Desktop. See [Docker Setup Guide](user-guides/DOCKER_SETUP.md) for MCP integration.
26
+ **⚠️ Claude Desktop Users**: Do NOT use the `-d` flag with Claude Desktop. See
27
+ [Docker Setup Guide](user-guides/DOCKER_SETUP.md) for MCP integration.
27
28
 
28
29
  ### Option 2: Docker Compose
29
30
 
@@ -54,15 +55,15 @@ docker run -d --name mcp-wordpress mcp-wordpress
54
55
 
55
56
  ### Environment Variables
56
57
 
57
- | Variable | Required | Description | Example |
58
- |----------|----------|-------------|---------|
59
- | `WORDPRESS_SITE_URL` | ✅ | WordPress site URL | `https://example.com` |
60
- | `WORDPRESS_USERNAME` | ✅ | WordPress username | `admin` |
61
- | `WORDPRESS_APP_PASSWORD` | ✅ | Application password | `xxxx xxxx xxxx xxxx xxxx xxxx` |
62
- | `WORDPRESS_AUTH_METHOD` | ❌ | Authentication method | `app-password` (default) |
63
- | `NODE_ENV` | ❌ | Environment mode | `production` |
64
- | `DEBUG` | ❌ | Enable debug logging | `false` |
65
- | `DISABLE_CACHE` | ❌ | Disable caching system | `false` |
58
+ | Variable | Required | Description | Example |
59
+ | ------------------------ | -------- | ---------------------- | ------------------------------- |
60
+ | `WORDPRESS_SITE_URL` | ✅ | WordPress site URL | `https://example.com` |
61
+ | `WORDPRESS_USERNAME` | ✅ | WordPress username | `admin` |
62
+ | `WORDPRESS_APP_PASSWORD` | ✅ | Application password | `xxxx xxxx xxxx xxxx xxxx xxxx` |
63
+ | `WORDPRESS_AUTH_METHOD` | ❌ | Authentication method | `app-password` (default) |
64
+ | `NODE_ENV` | ❌ | Environment mode | `production` |
65
+ | `DEBUG` | ❌ | Enable debug logging | `false` |
66
+ | `DISABLE_CACHE` | ❌ | Disable caching system | `false` |
66
67
 
67
68
  ### Multi-Site Configuration
68
69
 
@@ -111,7 +112,7 @@ docker run -d \
111
112
  **docker-compose.yml:**
112
113
 
113
114
  ```yaml
114
- version: '3.8'
115
+ version: "3.8"
115
116
 
116
117
  services:
117
118
  mcp-wordpress:
@@ -157,7 +158,7 @@ docker-compose up --profile dev
157
158
  ### Docker Swarm
158
159
 
159
160
  ```yaml
160
- version: '3.8'
161
+ version: "3.8"
161
162
 
162
163
  services:
163
164
  mcp-wordpress:
@@ -209,51 +210,51 @@ spec:
209
210
  app: mcp-wordpress
210
211
  spec:
211
212
  containers:
212
- - name: mcp-wordpress
213
- image: docdyhr/mcp-wordpress:latest
214
- ports:
215
- - containerPort: 3000
216
- env:
217
- - name: NODE_ENV
218
- value: \"production\"
219
- - name: WORDPRESS_SITE_URL
220
- valueFrom:
221
- secretKeyRef:
222
- name: wordpress-secrets
223
- key: site-url
224
- - name: WORDPRESS_USERNAME
225
- valueFrom:
226
- secretKeyRef:
227
- name: wordpress-secrets
228
- key: username
229
- - name: WORDPRESS_APP_PASSWORD
230
- valueFrom:
231
- secretKeyRef:
232
- name: wordpress-secrets
233
- key: app-password
234
- resources:
235
- requests:
236
- memory: \"256Mi\"
237
- cpu: \"250m\"
238
- limits:
239
- memory: \"512Mi\"
240
- cpu: \"500m\"
241
- livenessProbe:
242
- exec:
243
- command:
244
- - node
245
- - dist/index.js
246
- - --health-check
247
- initialDelaySeconds: 30
248
- periodSeconds: 30
249
- readinessProbe:
250
- exec:
251
- command:
252
- - node
253
- - dist/index.js
254
- - --health-check
255
- initialDelaySeconds: 10
256
- periodSeconds: 10
213
+ - name: mcp-wordpress
214
+ image: docdyhr/mcp-wordpress:latest
215
+ ports:
216
+ - containerPort: 3000
217
+ env:
218
+ - name: NODE_ENV
219
+ value: "production"
220
+ - name: WORDPRESS_SITE_URL
221
+ valueFrom:
222
+ secretKeyRef:
223
+ name: wordpress-secrets
224
+ key: site-url
225
+ - name: WORDPRESS_USERNAME
226
+ valueFrom:
227
+ secretKeyRef:
228
+ name: wordpress-secrets
229
+ key: username
230
+ - name: WORDPRESS_APP_PASSWORD
231
+ valueFrom:
232
+ secretKeyRef:
233
+ name: wordpress-secrets
234
+ key: app-password
235
+ resources:
236
+ requests:
237
+ memory: "256Mi"
238
+ cpu: "250m"
239
+ limits:
240
+ memory: "512Mi"
241
+ cpu: "500m"
242
+ livenessProbe:
243
+ exec:
244
+ command:
245
+ - node
246
+ - dist/index.js
247
+ - --health-check
248
+ initialDelaySeconds: 30
249
+ periodSeconds: 30
250
+ readinessProbe:
251
+ exec:
252
+ command:
253
+ - node
254
+ - dist/index.js
255
+ - --health-check
256
+ initialDelaySeconds: 10
257
+ periodSeconds: 10
257
258
  ```
258
259
 
259
260
  ## 🔧 Management Commands
@@ -0,0 +1,397 @@
1
+ # MCP WordPress Tools Evaluation Guide
2
+
3
+ This guide covers the comprehensive evaluation system for MCP WordPress tools using [mcp-evals](https://github.com/mclenhard/mcp-evals).
4
+
5
+ ## Overview
6
+
7
+ The evaluation system provides automated testing and scoring of WordPress MCP tools using LLM-based evaluation to ensure:
8
+
9
+ - **Tool Reliability**: Consistent performance across different scenarios
10
+ - **Quality Assurance**: Comprehensive testing of all 59 WordPress tools
11
+ - **Performance Monitoring**: Track tool performance over time
12
+ - **Regression Detection**: Identify when changes affect tool quality
13
+
14
+ ## Quick Start
15
+
16
+ ### Prerequisites
17
+
18
+ - Node.js 20+
19
+ - OpenAI API key (`OPENAI_API_KEY`; stored as a GitHub secret for CI runs)
20
+ - WordPress test site credentials
21
+
22
+ ### Running Evaluations
23
+
24
+ ```bash
25
+ # Run all evaluations
26
+ npm run eval
27
+
28
+ # Run quick evaluation (critical tools only)
29
+ npm run eval:quick
30
+
31
+ # Run critical tools evaluation
32
+ npm run eval:critical
33
+
34
+ # Generate evaluation report
35
+ npm run eval:report
36
+
37
+ # Watch mode for development
38
+ npm run eval:watch
39
+ ```
40
+
41
+ ## Evaluation Configurations
42
+
43
+ ### 1. Comprehensive Evaluation (`wordpress-tools-eval.yaml`)
44
+
45
+ Tests all 59 WordPress tools across multiple categories:
46
+
47
+ - **Post Management**: Create, read, update, delete posts
48
+ - **Media Management**: Upload, manage media files
49
+ - **User Management**: User creation, role management
50
+ - **Comment Management**: Comment moderation workflows
51
+ - **Taxonomy Management**: Categories and tags
52
+ - **Site Management**: Settings, health checks
53
+ - **Performance**: Cache management, optimization
54
+ - **Authentication**: Security and permissions
55
+ - **Error Handling**: Edge cases and failures
56
+
57
+ ### 2. Critical Tools Evaluation (`critical-tools-eval.yaml`)
58
+
59
+ Focused evaluation of the most important tools with stricter scoring:
60
+
61
+ - Higher pass threshold (4.0/5.0 vs 3.5/5.0)
62
+ - Essential functionality only
63
+ - Faster execution for CI/CD pipelines
64
+
65
+ ### 3. TypeScript Evaluations (`critical-tools.eval.ts`)
66
+
67
+ Advanced evaluations using TypeScript for complex scenarios:
68
+
69
+ - Multi-tool workflows
70
+ - Error recovery scenarios
71
+ - Performance benchmarks
72
+ - Security testing
73
+
74
+ ## Evaluation Scoring
75
+
76
+ ### Scoring Criteria
77
+
78
+ Each evaluation is scored on five dimensions (1-5 scale):
79
+
80
+ 1. **Accuracy** (25%): How accurately the tool performs its function
81
+ 2. **Completeness** (20%): How thoroughly it completes the task
82
+ 3. **Relevance** (20%): How relevant the response is to the request
83
+ 4. **Clarity** (20%): How clear and understandable the output is
84
+ 5. **Reasoning** (15%): How well it handles edge cases and errors
85
+
86
+ ### Scoring Thresholds
87
+
88
+ - **Pass**: 3.5/5.0 (Acceptable performance)
89
+ - **Good**: 4.0/5.0 (Solid performance)
90
+ - **Excellent**: 4.5/5.0 (Outstanding performance)
91
+
92
+ ### Example Evaluation
93
+
94
+ ```yaml
95
+ - name: create_post_basic
96
+ description: Test basic post creation functionality
97
+ prompt: "Create a new blog post titled 'AI Trends in 2025' with content about emerging AI technologies"
98
+ expected_tools:
99
+ - wp_create_post
100
+ success_criteria:
101
+ - Post is created successfully
102
+ - Title matches the request
103
+ - Content is relevant to AI trends
104
+ ```
105
+
106
+ ## GitHub Actions Integration
107
+
108
+ ### Automated Evaluation Pipeline
109
+
110
+ The evaluation workflow is triggered by:
111
+
112
+ - **Pull Requests**: When tools are modified
113
+ - **Push to Main**: After merging changes
114
+ - **Weekly Schedule**: Every Monday at 2 AM UTC
115
+ - **Manual Trigger**: Via GitHub Actions UI
116
+
117
+ ### Workflow Features
118
+
119
+ - **PR Comments**: Automatic evaluation results in PR comments
120
+ - **Artifacts**: Evaluation results saved for 30 days
121
+ - **Regression Detection**: Compares with previous results
122
+ - **Performance Tracking**: Trends over time
123
+ - **Failure Notifications**: Creates issues for significant regressions
124
+
125
+ ### Environment Variables
126
+
127
+ Required secrets in GitHub repository:
128
+
129
+ ```bash
130
+ OPENAI_API_KEY=your_openai_api_key
131
+ TEST_WORDPRESS_URL=https://test-site.com
132
+ TEST_WORDPRESS_USER=testuser
133
+ TEST_WORDPRESS_PASSWORD=app_password
134
+ ```
135
+
136
+ ## Writing Custom Evaluations
137
+
138
+ ### YAML Configuration
139
+
140
+ ```yaml
141
+ model:
142
+ provider: openai
143
+ name: gpt-4o
144
+ temperature: 0.3
145
+
146
+ evals:
147
+ - name: custom_evaluation
148
+ description: Description of what this tests
149
+ prompt: "Test prompt for the LLM"
150
+ expected_tools:
151
+ - wp_tool_name
152
+ success_criteria:
153
+ - What constitutes success
154
+ - Additional criteria
155
+ ```
156
+
157
+ ### TypeScript Evaluation
158
+
159
+ ```typescript
160
+ import { EvalFunction, grade } from 'mcp-evals';
161
+ import { openai } from 'mcp-evals/models';
162
+
163
+ export const customEval: EvalFunction = {
164
+ name: 'custom_evaluation',
165
+ description: 'Test custom functionality',
166
+ run: async () => {
167
+ const result = await grade(
168
+ openai("gpt-4o"),
169
+ "Your test prompt here",
170
+ {
171
+ systemPrompt: "Evaluation criteria...",
172
+ responseFormat: { type: "json_object" }
173
+ }
174
+ );
175
+ return JSON.parse(result);
176
+ }
177
+ };
178
+ ```
179
+
180
+ ## Evaluation Categories
181
+
182
+ ### 1. Functional Testing
183
+
184
+ Tests basic tool functionality:
185
+
186
+ ```yaml
187
+ - name: create_post_basic
188
+ prompt: "Create a new blog post with title and content"
189
+ expected_tools: [wp_create_post]
190
+ ```
191
+
192
+ ### 2. Integration Testing
193
+
194
+ Tests multiple tools working together:
195
+
196
+ ```yaml
197
+ - name: content_publishing_workflow
198
+ prompt: "Create a post with images and publish it"
199
+ expected_tools: [wp_create_post, wp_upload_media, wp_update_post]
200
+ ```
201
+
202
+ ### 3. Error Handling Testing
203
+
204
+ Tests edge cases and error scenarios:
205
+
206
+ ```yaml
207
+ - name: handle_invalid_post
208
+ prompt: "Try to update a non-existent post"
209
+ expected_tools: [wp_update_post]
210
+ ```
211
+
212
+ ### 4. Performance Testing
213
+
214
+ Tests tool performance and efficiency:
215
+
216
+ ```yaml
217
+ - name: high_volume_operations
218
+ prompt: "List 100 most recent posts and generate summary"
219
+ expected_tools: [wp_list_posts, wp_get_post]
220
+ ```
221
+
222
+ ## Best Practices
223
+
224
+ ### Writing Effective Evaluations
225
+
226
+ 1. **Clear Prompts**: Write specific, actionable prompts
227
+ 2. **Realistic Scenarios**: Test real-world use cases
228
+ 3. **Success Criteria**: Define clear success metrics
229
+ 4. **Error Cases**: Include failure scenarios
230
+ 5. **Performance**: Consider timeout and efficiency
231
+
232
+ ### Evaluation Maintenance
233
+
234
+ 1. **Regular Updates**: Keep evaluations current with tool changes
235
+ 2. **Threshold Tuning**: Adjust scoring thresholds based on results
236
+ 3. **Coverage Analysis**: Ensure all tools are tested
237
+ 4. **Performance Monitoring**: Track evaluation trends
238
+
239
+ ### CI/CD Integration
240
+
241
+ 1. **Fast Feedback**: Use `eval:quick` for PR checks
242
+ 2. **Comprehensive Testing**: Full evaluations on main branch
243
+ 3. **Regression Detection**: Compare with previous results
244
+ 4. **Performance Gates**: Fail builds on significant regressions
245
+
246
+ ## Troubleshooting
247
+
248
+ ### Common Issues
249
+
250
+ 1. **API Key Missing**: Ensure `OPENAI_API_KEY` is set
251
+ 2. **WordPress Connection**: Verify test site credentials
252
+ 3. **Timeout Errors**: Increase timeout or reduce evaluation scope
253
+ 4. **Rate Limiting**: Add delays between evaluations
254
+
255
+ ### Debug Mode
256
+
257
+ ```bash
258
+ # Run with debug output
259
+ DEBUG=true npm run eval
260
+
261
+ # Run single evaluation
262
+ npm run eval:quick -- --filter create_post_basic
263
+ ```
264
+
265
+ ### Local Testing
266
+
267
+ ```bash
268
+ # Set up local environment
269
+ export OPENAI_API_KEY=your_key
270
+ export WORDPRESS_SITE_URL=http://localhost/wp
271
+ export WORDPRESS_USERNAME=admin
272
+ export WORDPRESS_APP_PASSWORD=your_password
273
+
274
+ # Run evaluations
275
+ npm run eval:quick
276
+ ```
277
+
278
+ ## Reporting
279
+
280
+ ### Automatic Reports
281
+
282
+ Evaluation results are automatically processed into:
283
+
284
+ - **JSON Summary**: Machine-readable results
285
+ - **HTML Report**: Visual dashboard
286
+ - **Markdown Report**: Documentation-friendly format
287
+
288
+ ### Manual Report Generation
289
+
290
+ ```bash
291
+ # Generate comprehensive report
292
+ npm run eval:report
293
+
294
+ # View results
295
+ open evaluations/reports/evaluation-report.html
296
+ ```
297
+
298
+ ### Example Report Structure
299
+
300
+ ```json
301
+ {
302
+ "overall_score": 4.2,
303
+ "tests_passed": 23,
304
+ "total_tests": 25,
305
+ "status": "good",
306
+ "categories": {
307
+ "post": { "passed": 5, "total": 6, "avg_score": 4.1 },
308
+ "media": { "passed": 3, "total": 3, "avg_score": 4.5 }
309
+ },
310
+ "failed_tests": [
311
+ {
312
+ "name": "handle_invalid_post",
313
+ "score": 3.2,
314
+ "reason": "Error handling could be more graceful"
315
+ }
316
+ ],
317
+ "recommendations": [
318
+ "Improve error handling for edge cases",
319
+ "Add more comprehensive validation"
320
+ ]
321
+ }
322
+ ```
323
+
324
+ ## Advanced Features
325
+
326
+ ### Performance Tracking
327
+
328
+ Track evaluation performance over time:
329
+
330
+ ```bash
331
+ # Generate performance trends
332
+ node evaluations/scripts/track-performance.js
333
+
334
+ # Compare with previous results
335
+ node evaluations/scripts/compare-performance.js
336
+ ```
337
+
338
+ ### Custom Scoring
339
+
340
+ Override default scoring with custom logic:
341
+
342
+ ```typescript
343
+ const customScoring = {
344
+ accuracy: { weight: 0.3, min: 4.0 },
345
+ completeness: { weight: 0.3, min: 3.5 },
346
+ relevance: { weight: 0.2, min: 3.0 },
347
+ clarity: { weight: 0.1, min: 3.0 },
348
+ reasoning: { weight: 0.1, min: 3.0 }
349
+ };
350
+ ```
351
+
352
+ ### Integration with Monitoring
353
+
354
+ Connect evaluations to monitoring systems:
355
+
356
+ ```bash
357
+ # Export metrics to Prometheus
358
+ npm run eval:metrics
359
+
360
+ # Send results to monitoring dashboard
361
+ npm run eval:monitor
362
+ ```
363
+
364
+ ## Contributing
365
+
366
+ ### Adding New Evaluations
367
+
368
+ 1. Create evaluation in `evaluations/config/`
369
+ 2. Add to appropriate category
370
+ 3. Test locally with `npm run eval:quick`
371
+ 4. Submit PR with evaluation results
372
+
373
+ ### Improving Existing Evaluations
374
+
375
+ 1. Review current evaluation scores
376
+ 2. Identify areas for improvement
377
+ 3. Update prompts and success criteria
378
+ 4. Test changes thoroughly
379
+
380
+ ### Reporting Issues
381
+
382
+ - Use GitHub Issues for evaluation problems
383
+ - Include full evaluation results
384
+ - Provide reproduction steps
385
+ - Tag with `evaluation` label
386
+
387
+ ## Resources
388
+
389
+ - [mcp-evals Documentation](https://github.com/mclenhard/mcp-evals)
390
+ - [OpenAI API Documentation](https://platform.openai.com/docs)
391
+ - [WordPress REST API](https://developer.wordpress.org/rest-api/)
392
+ - [GitHub Actions Documentation](https://docs.github.com/en/actions)
393
+
394
+ ---
395
+
396
+ **Ready to improve tool quality?** Start by running `npm run eval:quick` to see current performance, then dive into
397
+ writing custom evaluations for your specific use cases!