claude-flow-novice 1.5.2 → 1.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/agents/architecture/system-architect.md +3 -44
- package/.claude/agents/benchmarking-tests/test-agent-code-heavy.md +747 -0
- package/.claude/agents/benchmarking-tests/test-agent-metadata.md +181 -0
- package/.claude/agents/benchmarking-tests/test-agent-minimal.md +67 -0
- package/.claude/agents/data/ml/data-ml-model.md +5 -119
- package/.claude/agents/development/backend/dev-backend-api.md +4 -115
- package/.claude/agents/devops/ci-cd/ops-cicd-github.md +4 -114
- package/.claude/agents/documentation/api-docs/docs-api-openapi.md +4 -113
- package/.claude/agents/github/multi-repo-swarm.md +1 -28
- package/.claude/agents/github/pr-manager.md +1 -29
- package/.claude/agents/github/project-board-sync.md +1 -32
- package/.claude/agents/github/release-manager.md +1 -32
- package/.claude/agents/github/release-swarm.md +1 -33
- package/.claude/agents/github/repo-architect.md +1 -34
- package/.claude/agents/github/swarm-issue.md +1 -26
- package/.claude/agents/github/swarm-pr.md +1 -30
- package/.claude/agents/github/sync-coordinator.md +1 -30
- package/.claude/agents/github/workflow-automation.md +1 -31
- package/.claude/agents/neural/neural-pattern-agent.md +2 -50
- package/.claude/agents/specialized/mobile/spec-mobile-react-native.md +6 -142
- package/.claude/agents/sublinear/consciousness-evolution-agent.md +2 -18
- package/.claude/agents/sublinear/matrix-solver-agent.md +2 -16
- package/.claude/agents/sublinear/nanosecond-scheduler-agent.md +2 -19
- package/.claude/agents/sublinear/pagerank-agent.md +2 -19
- package/.claude/agents/sublinear/phi-calculator-agent.md +2 -19
- package/.claude/agents/sublinear/psycho-symbolic-agent.md +2 -19
- package/.claude/agents/sublinear/sublinear.md +2 -1
- package/.claude/agents/sublinear/temporal-advantage-agent.md +2 -16
- package/.claude/agents/testing/e2e/playwright-agent.md +7 -0
- package/.claude-flow-novice/.claude/agents/architecture/system-architect.md +3 -44
- package/.claude-flow-novice/.claude/agents/benchmarking-tests/test-agent-code-heavy.md +747 -0
- package/.claude-flow-novice/.claude/agents/benchmarking-tests/test-agent-metadata.md +181 -0
- package/.claude-flow-novice/.claude/agents/benchmarking-tests/test-agent-minimal.md +67 -0
- package/.claude-flow-novice/.claude/agents/data/ml/data-ml-model.md +5 -119
- package/.claude-flow-novice/.claude/agents/development/backend/dev-backend-api.md +4 -115
- package/.claude-flow-novice/.claude/agents/devops/ci-cd/ops-cicd-github.md +4 -114
- package/.claude-flow-novice/.claude/agents/documentation/api-docs/docs-api-openapi.md +4 -113
- package/.claude-flow-novice/.claude/agents/github/multi-repo-swarm.md +1 -28
- package/.claude-flow-novice/.claude/agents/github/pr-manager.md +1 -29
- package/.claude-flow-novice/.claude/agents/github/project-board-sync.md +1 -32
- package/.claude-flow-novice/.claude/agents/github/release-manager.md +1 -32
- package/.claude-flow-novice/.claude/agents/github/release-swarm.md +1 -33
- package/.claude-flow-novice/.claude/agents/github/repo-architect.md +1 -34
- package/.claude-flow-novice/.claude/agents/github/swarm-issue.md +1 -26
- package/.claude-flow-novice/.claude/agents/github/swarm-pr.md +1 -30
- package/.claude-flow-novice/.claude/agents/github/sync-coordinator.md +1 -30
- package/.claude-flow-novice/.claude/agents/github/workflow-automation.md +1 -31
- package/.claude-flow-novice/.claude/agents/neural/neural-pattern-agent.md +2 -50
- package/.claude-flow-novice/.claude/agents/specialized/mobile/spec-mobile-react-native.md +6 -142
- package/.claude-flow-novice/.claude/agents/sublinear/consciousness-evolution-agent.md +2 -18
- package/.claude-flow-novice/.claude/agents/sublinear/matrix-solver-agent.md +2 -16
- package/.claude-flow-novice/.claude/agents/sublinear/nanosecond-scheduler-agent.md +2 -19
- package/.claude-flow-novice/.claude/agents/sublinear/pagerank-agent.md +2 -19
- package/.claude-flow-novice/.claude/agents/sublinear/phi-calculator-agent.md +2 -19
- package/.claude-flow-novice/.claude/agents/sublinear/psycho-symbolic-agent.md +2 -19
- package/.claude-flow-novice/.claude/agents/sublinear/sublinear.md +2 -1
- package/.claude-flow-novice/.claude/agents/sublinear/temporal-advantage-agent.md +2 -16
- package/.claude-flow-novice/.claude/agents/testing/e2e/playwright-agent.md +7 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/CLAUDE.md +188 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/claude-flow-universal +81 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/claude-flow.bat +18 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/claude-flow.ps1 +24 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/claude-md.js +982 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/commands/analysis/bottleneck-detect.md +162 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/commands/automation/auto-agent.md +122 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/commands/coordination/swarm-init.md +85 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/commands/github/github-swarm.md +121 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/commands/helpers/standard-checkpoint-hooks.sh +179 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/commands/hooks/notification.md +113 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/commands/hooks/post-command.md +116 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/commands/hooks/post-edit.md +117 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/commands/hooks/post-task.md +112 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/commands/hooks/pre-command.md +113 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/commands/hooks/pre-edit.md +113 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/commands/hooks/pre-search.md +112 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/commands/hooks/pre-task.md +111 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/commands/hooks/session-end.md +118 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/commands/hooks/session-restore.md +118 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/commands/hooks/session-start.md +117 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/coordination-md.js +340 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/coordination.md +16 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/enhanced-templates.js +2347 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/github-safe-enhanced.js +331 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/github-safe.js +106 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/memory-bank-md.js +259 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/memory-bank.md +16 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/readme-files.js +72 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/safe-hook-patterns.js +430 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/settings.json +109 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/settings.json.enhanced +35 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/sparc-modes.js +1401 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/templates/CLAUDE.md +188 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/templates/claude-flow-universal +81 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/templates/claude-flow.bat +18 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/templates/claude-flow.ps1 +24 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/templates/claude-md.js +982 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/templates/commands/analysis/bottleneck-detect.md +162 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/templates/commands/automation/auto-agent.md +122 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/templates/commands/coordination/swarm-init.md +85 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/templates/commands/github/github-swarm.md +121 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/templates/commands/helpers/standard-checkpoint-hooks.sh +179 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/templates/commands/hooks/notification.md +113 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/templates/commands/hooks/post-command.md +116 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/templates/commands/hooks/post-edit.md +117 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/templates/commands/hooks/post-task.md +112 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/templates/commands/hooks/pre-command.md +113 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/templates/commands/hooks/pre-edit.md +113 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/templates/commands/hooks/pre-search.md +112 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/templates/commands/hooks/pre-task.md +111 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/templates/commands/hooks/session-end.md +118 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/templates/commands/hooks/session-restore.md +118 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/templates/commands/hooks/session-start.md +117 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/templates/coordination-md.js +340 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/templates/coordination.md +16 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/templates/enhanced-templates.js +2347 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/templates/github-safe-enhanced.js +331 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/templates/github-safe.js +106 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/templates/memory-bank-md.js +259 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/templates/memory-bank.md +16 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/templates/readme-files.js +72 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/templates/safe-hook-patterns.js +430 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/templates/settings.json +109 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/templates/settings.json.enhanced +35 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/templates/sparc-modes.js +1401 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/templates/verification-claude-md.js +432 -0
- package/.claude-flow-novice/dist/src/cli/simple-commands/init/verification-claude-md.js +432 -0
- package/.claude-flow-novice/dist/src/slash-commands/benchmark-prompts.js +281 -0
- package/CLAUDE.md +1927 -127
- package/package.json +3 -3
- package/src/cli/simple-commands/init/index.js +39 -4
- package/src/cli/simple-commands/init/templates/CLAUDE.md +8 -10
- package/src/slash-commands/benchmark-prompts.js +281 -0
|
@@ -0,0 +1,432 @@
|
|
|
1
|
+
// verification-claude-md.js - Verification and Pair Programming CLAUDE.md template
|
|
2
|
+
|
|
3
|
+
/**
 * Builds the CLAUDE.md template for the truth-verification and
 * pair-programming environment.
 *
 * The result is a static Markdown document: nothing is interpolated, so
 * backticks and `$` are backslash-escaped to stay literal inside the
 * template string.
 *
 * @returns {string} Markdown content, terminated by a newline.
 */
export function createVerificationClaudeMd() {
  return `# Claude Code Configuration - Truth Verification & Pair Programming Environment

## 🔍 VERIFICATION-FIRST DEVELOPMENT

This project enforces **"truth is enforced, not assumed"** with mandatory verification for all operations.

### 🎯 Truth Verification System Active
- **Threshold**: 0.95 (95% accuracy required)
- **Mode**: Strict verification with auto-rollback
- **Pair Programming**: Real-time collaborative development
- **Background Monitoring**: Continuous validation enabled

## 🚨 CRITICAL: VERIFICATION COMMANDS

### Initialize Verification System
\`\`\`bash
# Set verification mode (strict/moderate/development)
./claude-flow-novice verify init strict       # 95% threshold, auto-rollback
./claude-flow-novice verify init moderate     # 85% threshold, no auto-rollback
./claude-flow-novice verify init development  # 75% threshold, for prototyping
\`\`\`

### Run Verification
\`\`\`bash
# Verify specific tasks
./claude-flow-novice verify verify task-123 --agent coder
./claude-flow-novice verify verify task-456 --agent reviewer --threshold 0.90

# Check truth scores
./claude-flow-novice truth            # View current truth scores
./claude-flow-novice truth --report   # Generate detailed report
./claude-flow-novice truth --analyze  # Analyze failure patterns
\`\`\`

### Pair Programming Mode
\`\`\`bash
# Start pair programming with real-time verification
./claude-flow-novice pair --start                    # Begin collaborative session
./claude-flow-novice pair --start --mode strict      # Production-quality pairing
./claude-flow-novice pair --verify --threshold 0.90  # Custom threshold

# Background monitoring (use run_in_background: true)
./claude-flow-novice pair --start --monitor          # Continuous monitoring dashboard
\`\`\`

## 📋 VERIFICATION REQUIREMENTS BY AGENT TYPE

### Coder Agents
- **Compile**: Code must compile without errors (35% weight)
- **Test**: All tests must pass (25% weight)
- **Lint**: Code quality checks (20% weight)
- **Typecheck**: Type safety verification (20% weight)

### Reviewer Agents
- **Code Analysis**: Static code analysis
- **Security Scan**: Vulnerability detection
- **Performance Check**: Regression testing

### Tester Agents
- **Unit Tests**: Component-level testing
- **Integration Tests**: System-wide validation
- **Coverage Check**: Minimum coverage thresholds

### Planner Agents
- **Task Decomposition**: Valid task breakdown
- **Dependency Check**: Dependency validation
- **Feasibility**: Resource analysis

## 🔄 BACKGROUND TASK MANAGEMENT

### Running Verification in Background
\`\`\`javascript
// Use run_in_background parameter for continuous monitoring
{
  "tool": "Bash",
  "command": "./claude-flow-novice pair --start --monitor",
  "run_in_background": true  // Enables background execution
}
\`\`\`

### Managing Background Tasks
\`\`\`bash
# Interactive management
/bashes  # View all background tasks

# Check specific verification task
"Check status of bash_1"  # Via prompt to Claude

# Monitor verification output
"Show output from bash_1"  # Real-time monitoring

# Kill verification session
"Kill bash_1"  # Stop background verification
\`\`\`

## 🚨 CRITICAL: CONCURRENT VERIFICATION

**MANDATORY**: All verification operations MUST be concurrent:

### ✅ CORRECT - Parallel Verification
\`\`\`javascript
[Single Message]:
  // Initialize verification for multiple tasks
  - Bash("./claude-flow-novice verify verify task-1 --agent coder")
  - Bash("./claude-flow-novice verify verify task-2 --agent reviewer")
  - Bash("./claude-flow-novice verify verify task-3 --agent tester")

  // Check all truth scores
  - Bash("./claude-flow-novice truth --json")

  // Start monitoring in background
  - Bash("./claude-flow-novice pair --start --monitor", run_in_background: true)
\`\`\`

### ❌ WRONG - Sequential Verification
\`\`\`javascript
Message 1: Verify task-1
Message 2: Verify task-2
Message 3: Check truth score
// This is 3x slower!
\`\`\`

## 📊 VERIFICATION METRICS & THRESHOLDS

### Target Metrics
- **Truth Accuracy Rate**: >95%
- **Integration Success Rate**: >90%
- **Automated Rollback Frequency**: <5%
- **Human Intervention Rate**: <10%

### Verification Modes
| Mode | Threshold | Auto-Rollback | Use Case |
|------|-----------|---------------|----------|
| **Strict** | 0.95 | ✅ Enabled | Production |
| **Moderate** | 0.85 | ❌ Disabled | Development |
| **Development** | 0.75 | ❌ Disabled | Prototyping |

## 🤝 PAIR PROGRAMMING WORKFLOW

### 1. Start Session
\`\`\`bash
# Initialize pair programming with verification
./claude-flow-novice pair --start --mode strict
\`\`\`

### 2. Real-time Verification Cycle
\`\`\`
Developer writes code
        ↓
AI agent reviews in real-time
        ↓
Verification engine checks:
  - Compilation (35%)
  - Tests (25%)
  - Linting (20%)
  - Type safety (20%)
        ↓
Truth score calculated
        ↓
Pass (>0.95) → Accept change
Fail (<0.95) → Suggest fixes or rollback
\`\`\`

### 3. Continuous Monitoring
\`\`\`bash
# Monitor in background
./claude-flow-novice pair --start --monitor &

# Check verification output
/bashes  # Interactive view
"Check bash_1 output"  # Via prompt
\`\`\`

## 🔒 SECURITY & AUDIT FEATURES

### Cryptographic Verification
- All verification results are cryptographically signed
- SHA256 checksums for integrity
- Immutable audit trail

### Byzantine Fault Tolerance
- Protection against incorrect agents
- Consensus requirements (2/3+ majority)
- Automatic agent quarantine

### Audit Trail
\`\`\`bash
# View verification history
cat .swarm/verification-memory.json | jq .history

# Check agent reliability
./claude-flow-novice truth --agent coder --detailed
\`\`\`

## 🚀 QUICK START VERIFICATION WORKFLOW

### Step 1: Initialize Project with Verification
\`\`\`bash
# Initialize with verification-first approach
npx claude-flow@alpha init --verify --pair

# Set up strict verification
./claude-flow-novice verify init strict
\`\`\`

### Step 2: Start Development with Pair Programming
\`\`\`bash
# Start pair programming session
./claude-flow-novice pair --start --mode strict --monitor &

# Monitor verification (background task)
/bashes  # Check bash_1 status
\`\`\`

### Step 3: Develop with Continuous Verification
\`\`\`bash
# All changes are automatically verified
# Truth scores maintained above 0.95
# Auto-rollback on verification failures
\`\`\`

### Step 4: Check Truth Metrics
\`\`\`bash
# View current truth scores
./claude-flow-novice truth

# Generate detailed report
./claude-flow-novice truth --report --export metrics.json
\`\`\`

## 📝 VERIFICATION CHECKLIST

Before ANY operation:
- ✅ Is verification system initialized?
- ✅ Is pair programming mode active?
- ✅ Are background monitors running?
- ✅ Is truth threshold configured correctly?
- ✅ Are all agents configured for verification?

## 🛠️ BUILD COMMANDS WITH VERIFICATION

### Standard Commands (with verification)
- \`npm run build\`: Build with verification checks
- \`npm run test\`: Test with truth scoring
- \`npm run lint\`: Lint with verification tracking
- \`npm run typecheck\`: Type check with validation

### Verification Commands
- \`./claude-flow-novice verify status\`: Check system status
- \`./claude-flow-novice verify verify <task>\`: Run verification
- \`./claude-flow-novice truth\`: View truth scores
- \`./claude-flow-novice pair --start\`: Begin pair programming

## 💾 PERSISTENT VERIFICATION MEMORY

Verification data stored in:
- \`.swarm/verification-memory.json\`: Verification history
- \`.swarm/memory.db\`: Persistent swarm memory
- \`.claude/verification-config.json\`: Custom configuration

## 🎯 AGENT VERIFICATION PATTERNS

### Full-Stack Development with Verification
\`\`\`javascript
// Deploy agents with verification requirements
Task("System architecture", "Design with verification", "system-architect")
Task("Backend APIs", "Implement with 95% accuracy", "backend-dev")
Task("Frontend", "Build with validation", "mobile-dev")
Task("Testing", "Comprehensive verification", "tester")
Task("Review", "Verify all changes", "reviewer")
\`\`\`

### Verification-First TDD
\`\`\`javascript
// TDD with mandatory verification
Task("Write failing tests", "Verify test quality", "tester")
Task("Implement code", "Verify implementation", "coder")
Task("Refactor", "Verify improvements", "reviewer")
Task("Validate", "Final verification", "production-validator")
\`\`\`

## ⚡ PERFORMANCE WITH VERIFICATION

### Optimized Verification
- **Parallel Checks**: Run all verifications concurrently
- **Cached Results**: Skip unchanged file verification
- **Smart Batching**: Group related verifications
- **Background Execution**: Non-blocking verification

### Expected Performance
- Verification overhead: <10% for most operations
- Truth score calculation: <500ms
- Rollback execution: <2 seconds
- Background monitoring: Minimal impact

## 🔄 AUTOMATED WORKFLOWS

### CI/CD Integration
\`\`\`yaml
# .github/workflows/verification.yml
- name: Run Verification
  run: |
    npx claude-flow-novice verify init strict
    npx claude-flow-novice verify verify \${{ github.run_id }}
    npx claude-flow-novice truth --threshold 0.95
\`\`\`

### Pre-commit Hooks
\`\`\`bash
# .git/hooks/pre-commit
#!/bin/bash
npx claude-flow-novice verify verify pre-commit --agent coder
SCORE=\$(npx claude-flow-novice truth --json | jq .averageScore)
# Scores are fractional; [ -lt ] only compares integers, so use awk
if awk -v score="\$SCORE" 'BEGIN { exit !(score < 0.85) }'; then
  echo "❌ Commit blocked: Truth score \$SCORE below threshold"
  exit 1
fi
\`\`\`

## 📚 DOCUMENTATION

- [Truth Verification System](https://github.com/ruvnet/claude-flow/wiki/Truth-Verification-System)
- [Pair Programming Guide](https://github.com/ruvnet/claude-flow/wiki/Pair-Programming-System)
- [Background Commands](https://github.com/ruvnet/claude-flow/wiki/background-commands)
- [Agent Verification](https://github.com/ruvnet/claude-flow/wiki/Agent-Verification)

## 🚨 IMPORTANT REMINDERS

1. **Truth is Enforced**: Every operation requires verification
2. **Parallel Execution**: All verifications run concurrently
3. **Background Monitoring**: Use run_in_background for continuous checks
4. **Persistent Memory**: All verification data is saved
5. **Auto-Rollback**: Failed verifications trigger automatic recovery

---

Remember: **"Truth is enforced, not assumed"** - All operations require verification!
`;
}
|
|
343
|
+
|
|
344
|
+
/**
 * Produces the default verification settings as pretty-printed JSON.
 *
 * Each agent profile is declared as an ordered list of [check, weight]
 * pairs; the `checks` array and `weights` map are both derived from that
 * list, so the two always stay in step.
 *
 * @returns {string} Settings serialised with a two-space indent.
 */
export function createVerificationSettingsJson() {
  // Expands [check, weight] pairs into the { checks, weights } profile shape.
  const toProfile = (weightedChecks) => ({
    checks: weightedChecks.map(([check]) => check),
    weights: Object.fromEntries(weightedChecks),
  });

  const verification = {
    enabled: true,
    mode: 'strict',
    threshold: 0.95,
    autoRollback: true,
    requireConsensus: true,
  };

  const pairProgramming = {
    enabled: true,
    mode: 'strict',
    realTimeVerification: true,
    continuousMonitoring: true,
    backgroundExecution: true,
  };

  const agentVerification = {
    coder: toProfile([
      ['compile', 0.35],
      ['test', 0.25],
      ['lint', 0.2],
      ['typecheck', 0.2],
    ]),
    reviewer: toProfile([
      ['code-analysis', 0.4],
      ['security-scan', 0.35],
      ['performance-check', 0.25],
    ]),
    tester: toProfile([
      ['unit-tests', 0.35],
      ['integration-tests', 0.35],
      ['coverage-check', 0.3],
    ]),
    planner: toProfile([
      ['task-decomposition', 0.4],
      ['dependency-check', 0.3],
      ['feasibility', 0.3],
    ]),
  };

  const backgroundTasks = {
    autoBackground: {
      enabled: true,
      patterns: [
        './claude-flow-novice pair --start',
        './claude-flow-novice verify verify',
        './claude-flow-novice truth --monitor',
        '*--monitor*',
        '*--watch*',
      ],
    },
  };

  const metrics = {
    targets: {
      truthAccuracy: 0.95,
      integrationSuccess: 0.9,
      rollbackFrequency: 0.05,
      humanIntervention: 0.1,
    },
  };

  const hooks = {
    'pre-commit': {
      enabled: true,
      commands: ['npx claude-flow-novice verify verify pre-commit --agent coder'],
      threshold: 0.85,
    },
    'post-task': {
      enabled: true,
      commands: ['npx claude-flow-novice truth --json', 'npx claude-flow-novice verify status'],
    },
  };

  const settings = {
    version: '1.0.0',
    verification,
    pairProgramming,
    agentVerification,
    backgroundTasks,
    metrics,
    hooks,
  };

  return JSON.stringify(settings, null, 2);
}
|
|
@@ -0,0 +1,281 @@
|
|
|
1
|
+
const path = require('path');
|
|
2
|
+
const { AgentBenchmarkSystem } = require('../../benchmark/agent-benchmarking/index');
|
|
3
|
+
|
|
4
|
+
/**
 * Benchmark agent prompt formats
 *
 * Entry point for the slash command: picks the sub-command from `args[0]`
 * (falling back to `help`) and delegates to the matching handler. Errors
 * thrown by a handler are logged and turned into a failure result instead
 * of propagating.
 *
 * @command /benchmark-prompts
 * @description Run automated benchmarks comparing different agent prompt formatting styles
 * @example /benchmark-prompts run
 * @example /benchmark-prompts run --rounds 5 --parallel
 * @example /benchmark-prompts analyze
 * @example /benchmark-prompts report markdown
 *
 * @param {string[]} [args=[]] - Command tokens; `args[0]` is the sub-command,
 *   the full array is forwarded to handlers that take options.
 * @returns {Promise<object>} The handler's result, or
 *   `{ success: false, error }` when the handler throws.
 */
async function benchmarkPrompts(args = []) {
  const system = new AgentBenchmarkSystem();

  const command = args[0] || 'help';

  try {
    switch (command) {
      case 'run':
        return await runBenchmark(system, args);

      case 'test':
        return await testFormat(system, args);

      case 'analyze':
        return await analyzeBenchmark(system);

      case 'report':
        return await generateReport(system, args);

      case 'compare':
        return await compareBenchmark(system);

      case 'list':
        return await listBenchmarks(system);

      case 'reset':
        return await resetBenchmarks(system);

      case 'help':
      default:
        return showHelp();
    }
  } catch (error) {
    console.error(`\n❌ Error: ${error.message}\n`);
    // The stack trace is noisy, so it is only printed on request.
    if (args.includes('--verbose')) {
      console.error(error.stack);
    }
    return { success: false, error: error.message };
  }
}
|
|
54
|
+
|
|
55
|
+
/**
 * Runs the full prompt-format benchmark.
 *
 * Parses `--rounds <n>`, `--parallel`, `--verbose` and `--scenarios <value>`
 * from `args` (starting at index 1; index 0 is the sub-command), prints the
 * effective configuration, and passes the options to `system.run`.
 *
 * @param {{ run: function(object): Promise<*> }} system - Benchmark system to run.
 * @param {string[]} args - Command tokens including the leading sub-command.
 * @returns {Promise<{ success: true, result: * }>} Wraps whatever `system.run` resolves to.
 * @throws {Error} If `--rounds` is given a value that is not a positive integer.
 */
async function runBenchmark(system, args) {
  console.log('\n🎯 Running Agent Prompt Format Benchmark\n');

  const options = {
    rounds: 3,
    parallel: false,
    verbose: false,
    scenarios: null,
  };

  // Parse options
  for (let i = 1; i < args.length; i++) {
    const arg = args[i];
    const value = args[i + 1];

    if (arg === '--rounds' && value) {
      // Explicit radix: otherwise "0x10" would be read as hexadecimal.
      const rounds = Number.parseInt(value, 10);
      if (!Number.isInteger(rounds) || rounds < 1) {
        throw new Error(`Invalid value for --rounds: "${value}" (expected a positive integer)`);
      }
      options.rounds = rounds;
      i++; // skip the consumed value
    } else if (arg === '--parallel') {
      options.parallel = true;
    } else if (arg === '--verbose') {
      options.verbose = true;
    } else if (arg === '--scenarios' && value) {
      options.scenarios = value;
      i++; // skip the consumed value
    }
  }

  console.log('Configuration:');
  console.log(`  Rounds: ${options.rounds}`);
  console.log(`  Parallel: ${options.parallel ? 'Yes' : 'No'}`);
  console.log(`  Scenarios: ${options.scenarios || 'All'}\n`);

  const result = await system.run(options);

  console.log('\n✅ Benchmark complete!');
  console.log('\nNext steps:');
  console.log('  /benchmark-prompts analyze  - View statistical analysis');
  console.log('  /benchmark-prompts report   - Generate detailed reports');
  console.log('  /benchmark-prompts compare  - Interactive comparison\n');

  return { success: true, result };
}
|
|
96
|
+
|
|
97
|
+
/**
 * Run the benchmark for a single named prompt format.
 *
 * The format name (args[1]) is validated, but it is not yet forwarded to
 * `system.run`: the run still covers every format, as the printed note says.
 *
 * @param {object} system - Object exposing an async `run(options)` method.
 * @param {string[]} args - args[1] is the format name, args[2] an optional round count;
 *   `--verbose` anywhere in the list enables verbose mode.
 * @returns {Promise<{success: boolean, result?: *}>} `{ success: false }` when the format is invalid.
 */
async function testFormat(system, args) {
  const supportedFormats = ['minimal', 'metadata', 'code-heavy'];
  const format = args[1];
  if (!format || !supportedFormats.includes(format)) {
    console.error('\nā Invalid format. Use: minimal, metadata, or code-heavy\n');
    return { success: false };
  }

  console.log(`\nš§Ŗ Testing ${format} format\n`);

  // Explicit radix so inputs such as "0x10" are not read as hexadecimal; anything
  // that is not a positive integer (missing, a flag, zero, negative) falls back to 3.
  const requestedRounds = Number.parseInt(args[2], 10);
  const rounds = Number.isInteger(requestedRounds) && requestedRounds > 0 ? requestedRounds : 3;

  // Run benchmark with only specified format
  const options = {
    rounds,
    parallel: false,
    verbose: args.includes('--verbose'),
  };

  // This would need modification to support single format testing
  console.log('Note: Currently runs all formats. Single format testing coming soon.\n');

  const result = await system.run(options);
  return { success: true, result };
}
|
|
119
|
+
|
|
120
|
+
/**
 * Analyze previously collected benchmark results.
 *
 * @param {object} system - Object exposing an async `analyze()` method.
 * @returns {Promise<{success: boolean, result?: *}>} `{ success: false }` when
 *   `system.analyze()` resolves to a falsy value; otherwise the analysis under `result`.
 */
async function analyzeBenchmark(system) {
  console.log('\nš Analyzing Benchmark Results\n');

  const result = await system.analyze();

  if (!result) {
    return { success: false };
  }

  // The literal below was split across two physical lines (unterminated string);
  // it is restored to a single-line literal.
  console.log('\n✅ Analysis complete\n');
  return { success: true, result };
}
|
|
132
|
+
|
|
133
|
+
/**
 * Generate benchmark report(s) in the requested output format.
 *
 * @param {object} system - Object exposing an async `generateReport(format)` method.
 * @param {string[]} args - args[1] is the report format: 'all' (used when absent),
 *   'markdown', 'csv' or 'json'.
 * @returns {Promise<{success: boolean, result?: *}>} `{ success: false }` when the
 *   format is not recognised or `system.generateReport` resolves to a falsy value.
 */
async function generateReport(system, args) {
  const format = args[1] || 'all';

  if (!['all', 'markdown', 'csv', 'json'].includes(format)) {
    console.error('\nā Invalid format. Use: all, markdown, csv, or json\n');
    return { success: false };
  }

  console.log(`\nš Generating ${format} report(s)\n`);

  const result = await system.generateReport(format);

  if (!result) {
    return { success: false };
  }

  // The literal below was split across two physical lines (unterminated string);
  // it is restored to a single-line literal.
  console.log('\n✅ Report generation complete\n');
  return { success: true, result };
}
|
|
152
|
+
|
|
153
|
+
/**
 * Print a side-by-side table of the analyzed formats, followed by the winner
 * and the recommendation list taken from the results summary.
 *
 * @param {object} system - Object exposing an async `analyze()` method.
 * @returns {Promise<{success: boolean, results?: *, analysis?: *}>} `{ success: false }`
 *   when `system.analyze()` resolves to a falsy value.
 */
async function compareBenchmark(system) {
  console.log('\nš Interactive Benchmark Comparison\n');

  // Load and analyze latest results
  const analyzed = await system.analyze();
  if (!analyzed) {
    return { success: false };
  }

  const { results, analysis } = analyzed;
  const rule = 'ā'.repeat(80);

  // Every cell except the last is padded to a fixed 15-character column.
  const toRow = (cells) => {
    const lastIndex = cells.length - 1;
    return cells.map((cell, idx) => (idx < lastIndex ? cell.padEnd(15) : cell)).join('');
  };

  // Print comparison table
  console.log('Format Comparison:');
  console.log(rule);
  console.log(toRow(['Format', 'Quality', 'Speed', 'Consistency', 'Success']));
  console.log(rule);

  Object.entries(results.formats).forEach(([formatName, formatData]) => {
    const stats = formatData.aggregated;
    console.log(
      toRow([
        formatName,
        `${stats.overallQuality.toFixed(1)}%`,
        `${stats.overallResponseTime.toFixed(0)}ms`,
        `${stats.overallConsistency.toFixed(1)}%`,
        `${stats.successRate.toFixed(1)}%`,
      ])
    );
  });
  console.log(rule);

  console.log(`\nš Winner: ${results.summary.winner.toUpperCase()}`);
  console.log('\nRecommendations:');
  for (const tip of results.summary.recommendation) {
    console.log(` ⢠${tip}`);
  }
  console.log('');

  return { success: true, results, analysis };
}
|
|
192
|
+
|
|
193
|
+
/**
 * Print a heading, delegate the listing itself to `system.listResults()`,
 * then print a trailing blank line.
 *
 * @param {object} system - Object exposing an async `listResults()` method.
 * @returns {Promise<{success: boolean}>} Always `{ success: true }` once the listing resolves.
 */
async function listBenchmarks(system) {
  const heading = '\nš Benchmark Results\n';
  console.log(heading);

  await system.listResults();
  console.log('');

  const outcome = { success: true };
  return outcome;
}
|
|
199
|
+
|
|
200
|
+
/**
 * Print a deletion warning and then call `system.reset()`.
 *
 * No confirmation is requested: the reset runs immediately after the warning.
 *
 * @param {object} system - Object exposing an async `reset()` method.
 * @returns {Promise<{success: boolean}>} Always `{ success: true }` once the reset resolves.
 */
async function resetBenchmarks(system) {
  const warning = '\nā ļø This will delete all benchmark results.\n';
  console.log(warning);

  // An interactive mode would ask for confirmation here; for now the reset
  // proceeds unconditionally.
  await system.reset();
  console.log('');

  const outcome = { success: true };
  return outcome;
}
|
|
210
|
+
|
|
211
|
+
/**
 * Print the usage text for the benchmark-prompts command: available
 * subcommands, options, examples, test scenarios and reported metrics.
 *
 * @returns {{success: boolean}} Always `{ success: true }`.
 */
function showHelp() {
  const helpText = `
āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā
ā Agent Prompt Format Benchmark System ā
āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā

Compare different agent prompt formatting styles:
⢠Minimal Format - Simple frontmatter only
⢠Metadata Format - Full configuration with hooks and capabilities
⢠Code-Heavy Format - Extensive code examples

COMMANDS:
/benchmark-prompts run [options] Run full benchmark suite
/benchmark-prompts test <format> Test specific format
/benchmark-prompts analyze Analyze collected results
/benchmark-prompts report [format] Generate reports (all|markdown|csv|json)
/benchmark-prompts compare Interactive comparison view
/benchmark-prompts list List all benchmark results
/benchmark-prompts reset Clear all benchmark data

OPTIONS:
--rounds <n> Number of test rounds (default: 3)
--parallel Run tests in parallel
--scenarios <list> Comma-separated scenario IDs to test
--verbose Detailed output
--export Export results

EXAMPLES:
/benchmark-prompts run
/benchmark-prompts run --rounds 5 --parallel
/benchmark-prompts test minimal --verbose
/benchmark-prompts analyze
/benchmark-prompts report markdown
/benchmark-prompts compare

TEST SCENARIOS:
⢠simple-code-analysis - Basic algorithm optimization
⢠memory-leak-detection - Memory leak identification
⢠database-query-optimization - Query performance optimization
⢠caching-strategy - Cache architecture design
⢠resource-allocation - Resource calculation
⢠async-pattern-optimization - Async code optimization
⢠algorithm-complexity-reduction - Algorithm improvement
⢠load-testing-strategy - Load test planning
⢠bottleneck-identification - Performance bottleneck analysis
⢠scalability-architecture - Scalability design

BENCHMARK METRICS:
⢠Quality Score - Completeness, accuracy, relevance, clarity
⢠Response Time - Total time to generate response
⢠Consistency - Variance across multiple rounds
⢠Success Rate - Percentage of successful completions

For more information, see: /benchmark/agent-benchmarking/README.md
`;

  console.log(helpText);

  return { success: true };
}
|
|
269
|
+
|
|
270
|
+
// Command descriptor exported by this module: the command name, a one-line
// description, the handler function, and sample invocations.
const benchmarkPromptsCommand = {
  command: 'benchmark-prompts',
  description: 'Run automated benchmarks comparing agent prompt formats',
  handler: benchmarkPrompts,
  examples: [
    '/benchmark-prompts run',
    '/benchmark-prompts run --rounds 5 --parallel',
    '/benchmark-prompts analyze',
    '/benchmark-prompts report markdown',
    '/benchmark-prompts compare',
  ],
};

module.exports = benchmarkPromptsCommand;
|