bobs-workshop 3.1.1 → 3.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/tools/background-agent/manager.d.ts.map +1 -1
- package/dist/tools/background-agent/manager.js +39 -12
- package/dist/tools/manual/verify-manual.d.ts.map +1 -1
- package/dist/tools/manual/verify-manual.js +36 -6
- package/package.json +1 -1
- package/src/agents/alice.md +14 -14
- package/src/agents/bob-rev.md +13 -14
- package/src/agents/bob-send.md +14 -15
- package/src/agents/bob.md +12 -14
- package/src/agents/trace.md +2 -2
- package/src/skills/api-patterns/SKILL.md +15 -15
- package/src/skills/architecture/SKILL.md +4 -4
- package/src/skills/brainstorming/SKILL.md +18 -18
- package/src/skills/clean-code/SKILL.md +11 -11
- package/src/skills/code-review-checklist/SKILL.md +23 -23
- package/src/skills/database-design/SKILL.md +1 -1
- package/src/skills/exploration/SKILL.md +5 -5
- package/src/skills/performance/SKILL.md +1 -1
- package/src/skills/plan-writing/SKILL.md +6 -6
- package/src/skills/security/SKILL.md +13 -13
- package/src/skills/simplification/SKILL.md +2 -2
- package/src/skills/systematic-debugging/SKILL.md +6 -6
- package/src/skills/testing-patterns/SKILL.md +1 -1
- package/src/tools/background-agent/manager.ts +44 -12
- package/src/tools/manual/verify-manual.ts +44 -6
- package/src/skills/verification/SKILL.md +0 -286
|
@@ -1,5 +1,38 @@
|
|
|
1
1
|
import { tool, type ToolDefinition } from "@opencode-ai/plugin/tool";
|
|
2
2
|
|
|
3
|
+
const MAX_SYSTEM_CONTENT_LENGTH = 50000;

/**
 * Sanitize text before it is sent to the model as prompt content.
 *
 * The function is idempotent: running it on its own output returns the same
 * string. That matters because callers sanitize the individual pieces of a
 * prompt and then sanitize the assembled prompt again.
 *
 * What it does, in order:
 *  1. Removes C0 control characters (everything below U+0020 except tab,
 *     LF and CR), the BOM (U+FEFF) and zero-width characters U+200B..U+200D.
 *  2. Converts Unicode line/paragraph separators (U+2028/U+2029) to `\n`.
 *  3. Normalizes CRLF and bare CR line endings to `\n`.
 *  4. Truncates to `maxLength` UTF-16 code units, appending a notice.
 *
 * Backslashes are deliberately left untouched. Doubling them here would
 * change the content itself (paths, regexes, escaped examples), and would
 * compound on every additional pass. JSON string escaping is presumably the
 * job of whatever serializes the request body — NOTE(review): confirm the
 * client serializes with JSON.stringify or equivalent.
 *
 * @param content   Raw text; an empty string yields an empty string.
 * @param maxLength Maximum length of the result in UTF-16 code units.
 *                  Defaults to MAX_SYSTEM_CONTENT_LENGTH.
 * @returns The sanitized text, never longer than `maxLength` when
 *          `maxLength` is a non-negative number.
 */
function sanitizeForModel(content: string, maxLength: number = MAX_SYSTEM_CONTENT_LENGTH): string {
  if (!content) return "";

  // Strip invisible characters first so that e.g. "\r<BOM>\n" still collapses
  // to a single newline in the line-ending pass below.
  const cleaned = content
    .replace(/[\x00-\x08\x0B\x0C\x0E-\x1F\uFEFF\u200B-\u200D]/g, "")
    .replace(/[\u2028\u2029]/g, "\n")
    .replace(/\r\n?/g, "\n");

  if (cleaned.length > maxLength) {
    const notice = "\n\n[Content truncated due to length limits]";
    // If the limit cannot even hold the notice, hard-cut without it so the
    // result still respects maxLength.
    const withNotice = maxLength >= notice.length;
    const budget = Math.max(0, withNotice ? maxLength - notice.length : maxLength);

    let head = cleaned.slice(0, budget);
    // Never end on a lone high surrogate: that would leave malformed UTF-16
    // (half of an astral character such as an emoji) in the prompt.
    const lastUnit = head.charCodeAt(head.length - 1);
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
      head = head.slice(0, -1);
    }

    return withNotice ? head + notice : head;
  }

  return cleaned;
}
|
|
35
|
+
|
|
3
36
|
const VerifyManualTool: ToolDefinition = tool({
|
|
4
37
|
description: "Run bob-rev verification in background for a MANUAL",
|
|
5
38
|
args: {
|
|
@@ -27,24 +60,29 @@ const VerifyManualTool: ToolDefinition = tool({
|
|
|
27
60
|
let manualContent = "";
|
|
28
61
|
if (existsSync(args.manual_path)) {
|
|
29
62
|
manualContent = readFileSync(args.manual_path, "utf8");
|
|
30
|
-
// Sanitize content
|
|
31
|
-
manualContent = manualContent
|
|
32
|
-
.replace(/[\x00-\x08\x0B-\x0C\x0E-\x1F]/g, '') // Remove control chars except \n, \r, \t
|
|
33
|
-
.replace(/\u2028/g, '\n') // Replace line separator with newline
|
|
34
|
-
.replace(/\u2029/g, '\n'); // Replace paragraph separator with newline
|
|
63
|
+
// Sanitize content for kimi model compatibility
|
|
64
|
+
manualContent = sanitizeForModel(manualContent, 30000);
|
|
35
65
|
}
|
|
36
66
|
|
|
37
67
|
const skillPath = join(directory, ".opencode", "agent", "bobs-workshop", "bob-rev.md");
|
|
38
68
|
let agentPrompt = "You are bob-rev, a reviewer agent that verifies implementation against MANUAL requirements.";
|
|
39
69
|
if (existsSync(skillPath)) {
|
|
40
70
|
agentPrompt = readFileSync(skillPath, "utf8");
|
|
71
|
+
// Sanitize agent prompt as well
|
|
72
|
+
agentPrompt = sanitizeForModel(agentPrompt, 20000);
|
|
41
73
|
}
|
|
42
74
|
|
|
75
|
+
// Build and sanitize system prompt
|
|
76
|
+
const systemPrompt = sanitizeForModel(
|
|
77
|
+
`${agentPrompt}\n\n## MANUAL to Verify:\n${manualContent}`,
|
|
78
|
+
MAX_SYSTEM_CONTENT_LENGTH
|
|
79
|
+
);
|
|
80
|
+
|
|
43
81
|
await (client as { session: { prompt: (args: unknown) => Promise<unknown> } }).session.prompt({
|
|
44
82
|
path: { id: createData.data.id },
|
|
45
83
|
body: {
|
|
46
84
|
agent: "bob-rev",
|
|
47
|
-
system:
|
|
85
|
+
system: systemPrompt,
|
|
48
86
|
tools: {
|
|
49
87
|
task: false,
|
|
50
88
|
delegate_task: false,
|
|
@@ -1,286 +0,0 @@
|
|
|
1
|
-
---
|
|
2
|
-
name: verification
|
|
3
|
-
description: Evidence-based verification methodology. SEE code working, do not just trust tests. Use when shipping code, final verification, output validation, evidence gathering. CRITICAL: Evidence before assertions.
|
|
4
|
-
metadata:
|
|
5
|
-
recommended_for: bob-send
|
|
6
|
-
category: verification
|
|
7
|
-
---
|
|
8
|
-
|
|
9
|
-
# Evidence-Based Verification Skill
|
|
10
|
-
|
|
11
|
-
## Core Philosophy
|
|
12
|
-
|
|
13
|
-
> "Evidence before assertions. Always."
|
|
14
|
-
|
|
15
|
-
Never claim code works without **seeing** it work. Tests passing is not enough. You must SEE the output.
|
|
16
|
-
|
|
17
|
-
## When to Use
|
|
18
|
-
|
|
19
|
-
Use this skill during the SEND phase or any final verification:
|
|
20
|
-
- Before shipping code
|
|
21
|
-
- When validating implementation works
|
|
22
|
-
- When gathering evidence for review
|
|
23
|
-
- When tests pass but you need runtime proof
|
|
24
|
-
|
|
25
|
-
## The Three Laws
|
|
26
|
-
|
|
27
|
-
1. **See it working** - Screenshots, curl responses, CLI output. Actual evidence.
|
|
28
|
-
2. **Human checkpoint** - No auto-shipping. Human reviews evidence and decides.
|
|
29
|
-
3. **Fallback hierarchy** - If primary method fails, try the next method down.
|
|
30
|
-
|
|
31
|
-
## Project Type Detection
|
|
32
|
-
|
|
33
|
-
First, detect what type of project you're verifying:
|
|
34
|
-
|
|
35
|
-
| Detection Pattern | Project Type | Primary Method |
|
|
36
|
-
|-------------------|--------------|----------------|
|
|
37
|
-
| package.json + src/app or pages/ | Web app | Screenshot + test flows |
|
|
38
|
-
| package.json + routes or controllers/ | API | Curl endpoints |
|
|
39
|
-
| Cargo.toml + src/main.rs with clap | CLI | Run commands |
|
|
40
|
-
| pyproject.toml + `__main__.py` | CLI | Run commands |
|
|
41
|
-
| `**/lib.rs` or `setup.py` | Library | Run examples |
|
|
42
|
-
| Dockerfile or docker-compose.yml | Service | Health check + logs |
|
|
43
|
-
|
|
44
|
-
## Verification Methods
|
|
45
|
-
|
|
46
|
-
### Web Applications
|
|
47
|
-
|
|
48
|
-
```bash
|
|
49
|
-
# 1. Start dev server
|
|
50
|
-
npm run dev &
|
|
51
|
-
SERVER_PID=$!
|
|
52
|
-
|
|
53
|
-
# 2. Wait for server (max 30s)
|
|
54
|
-
sleep 5 # or: until curl -s http://localhost:3000 > /dev/null
|
|
55
|
-
|
|
56
|
-
# 3. Verify key pages load
|
|
57
|
-
curl -s http://localhost:3000 | head -20
|
|
58
|
-
|
|
59
|
-
# 4. Check for errors (if browser available, take screenshots)
|
|
60
|
-
|
|
61
|
-
# 5. Cleanup
|
|
62
|
-
kill $SERVER_PID
|
|
63
|
-
```
|
|
64
|
-
|
|
65
|
-
**Evidence to capture:**
|
|
66
|
-
- Screenshots of key pages (if browser available)
|
|
67
|
-
- HTML response showing expected content
|
|
68
|
-
- Console errors (if any)
|
|
69
|
-
|
|
70
|
-
### Browser Verification (Playwright)
|
|
71
|
-
|
|
72
|
-
For web applications, use Playwright for comprehensive browser verification:
|
|
73
|
-
|
|
74
|
-
**Capabilities:**
|
|
75
|
-
- **Verification**: Ensure web elements are present and functional
|
|
76
|
-
- **Browsing**: Navigate through web pages to gather information
|
|
77
|
-
- **Web Scraping**: Extract data from websites
|
|
78
|
-
- **Testing**: Run end-to-end tests
|
|
79
|
-
- **Screenshots**: Capture visual evidence of web states
|
|
80
|
-
|
|
81
|
-
**Usage:**
|
|
82
|
-
When a task involves browser interactions, use Playwright MCP tools:
|
|
83
|
-
```bash
|
|
84
|
-
# Navigate to page
|
|
85
|
-
playwright_browser_navigate(url="http://localhost:3000")
|
|
86
|
-
|
|
87
|
-
# Take screenshot
|
|
88
|
-
playwright_browser_take_screenshot(filename="homepage.png", type="png")
|
|
89
|
-
|
|
90
|
-
# Interact with elements
|
|
91
|
-
playwright_browser_click(ref="submit-button")
|
|
92
|
-
|
|
93
|
-
# Verify element present
|
|
94
|
-
playwright_browser_snapshot()
|
|
95
|
-
```
|
|
96
|
-
|
|
97
|
-
**Best practices:**
|
|
98
|
-
- Capture screenshots before and after interactions
|
|
99
|
-
- Verify console for errors
|
|
100
|
-
- Test across different viewport sizes
|
|
101
|
-
- Document all browser-based evidence
|
|
102
|
-
|
|
103
|
-
### APIs
|
|
104
|
-
|
|
105
|
-
```bash
|
|
106
|
-
# 1. Start server
|
|
107
|
-
npm start &
|
|
108
|
-
sleep 3
|
|
109
|
-
|
|
110
|
-
# 2. Test key endpoints
|
|
111
|
-
curl -s -w "%{http_code}" http://localhost:3000/api/health
|
|
112
|
-
curl -s -X POST http://localhost:3000/api/[endpoint] \
|
|
113
|
-
-H "Content-Type: application/json" \
|
|
114
|
-
-d '{"test": true}'
|
|
115
|
-
|
|
116
|
-
# 3. Verify response shapes match spec
|
|
117
|
-
```
|
|
118
|
-
|
|
119
|
-
**Evidence to capture:**
|
|
120
|
-
- Status codes for each endpoint
|
|
121
|
-
- Response bodies (truncated if large)
|
|
122
|
-
- Error responses
|
|
123
|
-
|
|
124
|
-
### CLI Tools
|
|
125
|
-
|
|
126
|
-
```bash
|
|
127
|
-
# 1. Test help
|
|
128
|
-
./cli --help
|
|
129
|
-
echo "Exit code: $?"
|
|
130
|
-
|
|
131
|
-
# 2. Test primary commands
|
|
132
|
-
./cli [command] test-input.txt
|
|
133
|
-
echo "Exit code: $?"
|
|
134
|
-
|
|
135
|
-
# 3. Test error handling
|
|
136
|
-
./cli [command] nonexistent.txt
|
|
137
|
-
echo "Exit code: $?" # Should be non-zero
|
|
138
|
-
```
|
|
139
|
-
|
|
140
|
-
**Evidence to capture:**
|
|
141
|
-
- Command output (stdout)
|
|
142
|
-
- Error output (stderr)
|
|
143
|
-
- Exit codes
|
|
144
|
-
|
|
145
|
-
### Libraries
|
|
146
|
-
|
|
147
|
-
```bash
|
|
148
|
-
# 1. Run tests (confirm they pass)
|
|
149
|
-
npm test
|
|
150
|
-
|
|
151
|
-
# 2. Run examples from documentation
|
|
152
|
-
node examples/basic-usage.js
|
|
153
|
-
|
|
154
|
-
# 3. Check types compile
|
|
155
|
-
npm run typecheck
|
|
156
|
-
```
|
|
157
|
-
|
|
158
|
-
**Evidence to capture:**
|
|
159
|
-
- Example output
|
|
160
|
-
- Test coverage summary
|
|
161
|
-
|
|
162
|
-
## Fallback Hierarchy
|
|
163
|
-
|
|
164
|
-
If primary method fails, fall back in order:
|
|
165
|
-
|
|
166
|
-
```
|
|
167
|
-
1. Full runtime (screenshot/curl/run) ─ FAILED
|
|
168
|
-
│
|
|
169
|
-
└─► 2. Integration tests ─ FAILED
|
|
170
|
-
│
|
|
171
|
-
└─► 3. Unit tests + examples ─ FAILED
|
|
172
|
-
│
|
|
173
|
-
└─► 4. Type check + lint only ─ FAILED
|
|
174
|
-
│
|
|
175
|
-
└─► 5. Code review only (last resort)
|
|
176
|
-
└─► Report: "Unable to verify runtime behavior"
|
|
177
|
-
```
|
|
178
|
-
|
|
179
|
-
**Always record:**
|
|
180
|
-
- Which method was attempted
|
|
181
|
-
- Why it failed
|
|
182
|
-
- Which fallback was used
|
|
183
|
-
|
|
184
|
-
## Timeout Handling
|
|
185
|
-
|
|
186
|
-
**30-second timeout per verification method.**
|
|
187
|
-
|
|
188
|
-
If method times out:
|
|
189
|
-
1. Kill the process
|
|
190
|
-
2. Log the timeout
|
|
191
|
-
3. Try fallback method
|
|
192
|
-
|
|
193
|
-
## Evidence Storage
|
|
194
|
-
|
|
195
|
-
Document evidence in MANUAL Verification Logs:
|
|
196
|
-
|
|
197
|
-
```markdown
|
|
198
|
-
## ✅ Verification Evidence
|
|
199
|
-
|
|
200
|
-
### Runtime Verification
|
|
201
|
-
**Project Type**: web
|
|
202
|
-
**Method Used**: curl + HTML inspection
|
|
203
|
-
**Fallback Level**: Primary (no fallback needed)
|
|
204
|
-
|
|
205
|
-
### Evidence Captured
|
|
206
|
-
1. **Homepage**: Returns 200, contains expected title
|
|
207
|
-
2. **API Health**: /api/health returns {"status": "ok"}
|
|
208
|
-
3. **Login Flow**: Form renders, submits correctly
|
|
209
|
-
|
|
210
|
-
### Screenshots/Output
|
|
211
|
-
- [Describe what was observed]
|
|
212
|
-
- [Key behaviors verified]
|
|
213
|
-
|
|
214
|
-
### Result: ✅ VERIFIED
|
|
215
|
-
```
|
|
216
|
-
|
|
217
|
-
## Red Flags — STOP and Reassess
|
|
218
|
-
|
|
219
|
-
If you're thinking any of these, you're about to violate the methodology:
|
|
220
|
-
|
|
221
|
-
- "Tests pass, that's good enough" → **NO. SEE it working.**
|
|
222
|
-
- "I'll verify after shipping" → **NO. Verify BEFORE ship.**
|
|
223
|
-
- "The type checker caught everything" → **NO. Types don't catch runtime issues.**
|
|
224
|
-
- "Screenshot failed but it probably works" → **NO. "Probably" isn't evidence.**
|
|
225
|
-
- "It should work because..." → **NO. SHOW it works.**
|
|
226
|
-
- "Evidence isn't necessary for this change" → **NO. Every change gets verified.**
|
|
227
|
-
|
|
228
|
-
## Rationalizations to Resist
|
|
229
|
-
|
|
230
|
-
| Excuse | Reality |
|
|
231
|
-
|--------|---------|
|
|
232
|
-
| "Tests pass" | Tests aren't enough. SEE it working. |
|
|
233
|
-
| "Type checker caught everything" | Types don't catch runtime issues. |
|
|
234
|
-
| "It worked before" | Prove it works NOW. |
|
|
235
|
-
| "Human checkpoint is formality" | Human checkpoint is the gate. |
|
|
236
|
-
| "Code review is enough" | Code review is last resort fallback. |
|
|
237
|
-
| "Tests are flaky, ignore failure" | Flaky tests hide real failures. Fix or accept with caveat. |
|
|
238
|
-
|
|
239
|
-
## Flaky Test Detection
|
|
240
|
-
|
|
241
|
-
```
|
|
242
|
-
Test fails?
|
|
243
|
-
├── Re-run failed tests
|
|
244
|
-
├── Pass 2nd time?
|
|
245
|
-
│ └── Yes → Note flakiness, accept with caveat
|
|
246
|
-
└── Fail 2nd time?
|
|
247
|
-
├── Run isolated
|
|
248
|
-
└── Still fail? → Real failure, must fix
|
|
249
|
-
```
|
|
250
|
-
|
|
251
|
-
## Output Format
|
|
252
|
-
|
|
253
|
-
When verification completes:
|
|
254
|
-
|
|
255
|
-
```markdown
|
|
256
|
-
### Verification Summary
|
|
257
|
-
|
|
258
|
-
| Check | Status | Method | Notes |
|
|
259
|
-
|-------|--------|--------|-------|
|
|
260
|
-
| Build | ✅ | npm run build | No errors |
|
|
261
|
-
| Lint | ✅ | npm run lint | 0 issues |
|
|
262
|
-
| Tests | ✅ | npm test | 47/47 passed |
|
|
263
|
-
| Runtime | ✅ | curl + inspect | Evidence captured |
|
|
264
|
-
|
|
265
|
-
### Evidence
|
|
266
|
-
- Homepage renders correctly (verified via curl)
|
|
267
|
-
- API returns expected responses
|
|
268
|
-
- No console errors observed
|
|
269
|
-
|
|
270
|
-
### Result: ✅ ALL CHECKS PASS + EVIDENCE GATHERED
|
|
271
|
-
```
|
|
272
|
-
|
|
273
|
-
## Anti-Patterns
|
|
274
|
-
|
|
275
|
-
**Don't do these:**
|
|
276
|
-
|
|
277
|
-
- Trusting test results without seeing code run
|
|
278
|
-
- Skipping output verification because "tests pass"
|
|
279
|
-
- Proceeding when verification method fails without fallback
|
|
280
|
-
- Modifying code after verification passes (invalidates verification)
|
|
281
|
-
- Ignoring flaky test warnings
|
|
282
|
-
- Auto-approving without human checkpoint
|
|
283
|
-
|
|
284
|
-
---
|
|
285
|
-
|
|
286
|
-
**Remember**: You are gathering EVIDENCE. The human decides if it ships. Your job is to SEE the code work and document what you observed.
|