bobs-workshop 3.1.1 → 3.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,38 @@
1
1
  import { tool, type ToolDefinition } from "@opencode-ai/plugin/tool";
2
2
 
3
const MAX_SYSTEM_CONTENT_LENGTH = 50000;

/**
 * Cut `text` to at most `limit` UTF-16 code units without splitting a
 * surrogate pair (a trailing lone high surrogate is dropped).
 */
function truncateAtCodePoint(text: string, limit: number): string {
  let end = Math.min(text.length, Math.max(0, Math.floor(limit)));
  if (end > 0) {
    const last = text.charCodeAt(end - 1);
    // 0xD800-0xDBFF is the high-surrogate range; ending on one would leave
    // half of a character behind.
    if (last >= 0xd800 && last <= 0xdbff) end -= 1;
  }
  return text.substring(0, end);
}

/**
 * Clean up text before it is sent to the model.
 *
 * Applied in order:
 *  1. Strip ASCII control characters, keeping only `\t`, `\n` and `\r`.
 *  2. Turn the Unicode line/paragraph separators (U+2028, U+2029) into `\n`.
 *  3. Drop the BOM (U+FEFF) and zero-width characters (U+200B-U+200D).
 *  4. Normalize `\r\n` and bare `\r` to `\n`.
 *  5. If the result is longer than `maxLength`, cut it and append a
 *     truncation notice so that the total never exceeds `maxLength`.
 *
 * Backslashes are deliberately left untouched: escaping them is the JSON
 * serializer's job, and doubling them here would alter the text the model
 * sees and compound every time already-sanitized text is sanitized again.
 * The function is therefore idempotent.
 *
 * @param content   Raw text; an empty (or otherwise falsy) value yields "".
 * @param maxLength Maximum length of the result in UTF-16 code units.
 * @returns The sanitized, possibly truncated, text.
 */
function sanitizeForModel(content: string, maxLength: number = MAX_SYSTEM_CONTENT_LENGTH): string {
  if (!content) return "";

  let sanitized = content
    // Control characters except \t (0x09), \n (0x0A) and \r (0x0D).
    .replace(/[\x00-\x08\x0B-\x0C\x0E-\x1F]/g, "")
    // Unicode line / paragraph separators.
    .replace(/[\u2028\u2029]/g, "\n")
    // Byte-order mark.
    .replace(/\uFEFF/g, "")
    // Zero-width space, non-joiner and joiner.
    .replace(/[\u200B-\u200D]/g, "")
    // CRLF and lone CR both become LF.
    .replace(/\r\n?/g, "\n");

  if (sanitized.length > maxLength) {
    const truncationMsg = "\n\n[Content truncated due to length limits]";
    const budget = Math.max(0, Math.floor(maxLength));
    if (budget > truncationMsg.length) {
      sanitized = truncateAtCodePoint(sanitized, budget - truncationMsg.length) + truncationMsg;
    } else {
      // No room for the notice: a plain cut is the only way to honor the limit.
      sanitized = truncateAtCodePoint(sanitized, budget);
    }
  }

  return sanitized;
}
35
+
3
36
  const VerifyManualTool: ToolDefinition = tool({
4
37
  description: "Run bob-rev verification in background for a MANUAL",
5
38
  args: {
@@ -27,24 +60,29 @@ const VerifyManualTool: ToolDefinition = tool({
27
60
  let manualContent = "";
28
61
  if (existsSync(args.manual_path)) {
29
62
  manualContent = readFileSync(args.manual_path, "utf8");
30
- // Sanitize content to prevent JSON parsing issues with kimi model
31
- manualContent = manualContent
32
- .replace(/[\x00-\x08\x0B-\x0C\x0E-\x1F]/g, '') // Remove control chars except \n, \r, \t
33
- .replace(/\u2028/g, '\n') // Replace line separator with newline
34
- .replace(/\u2029/g, '\n'); // Replace paragraph separator with newline
63
+ // Sanitize content for kimi model compatibility
64
+ manualContent = sanitizeForModel(manualContent, 30000);
35
65
  }
36
66
 
37
67
  const skillPath = join(directory, ".opencode", "agent", "bobs-workshop", "bob-rev.md");
38
68
  let agentPrompt = "You are bob-rev, a reviewer agent that verifies implementation against MANUAL requirements.";
39
69
  if (existsSync(skillPath)) {
40
70
  agentPrompt = readFileSync(skillPath, "utf8");
71
+ // Sanitize agent prompt as well
72
+ agentPrompt = sanitizeForModel(agentPrompt, 20000);
41
73
  }
42
74
 
75
+ // Build and sanitize system prompt
76
+ const systemPrompt = sanitizeForModel(
77
+ `${agentPrompt}\n\n## MANUAL to Verify:\n${manualContent}`,
78
+ MAX_SYSTEM_CONTENT_LENGTH
79
+ );
80
+
43
81
  await (client as { session: { prompt: (args: unknown) => Promise<unknown> } }).session.prompt({
44
82
  path: { id: createData.data.id },
45
83
  body: {
46
84
  agent: "bob-rev",
47
- system: `${agentPrompt}\n\n## MANUAL to Verify:\n${manualContent}`,
85
+ system: systemPrompt,
48
86
  tools: {
49
87
  task: false,
50
88
  delegate_task: false,
@@ -1,286 +0,0 @@
1
- ---
2
- name: verification
3
- description: Evidence-based verification methodology. SEE code working, do not just trust tests. Use when shipping code, final verification, output validation, evidence gathering. CRITICAL: Evidence before assertions.
4
- metadata:
5
- recommended_for: bob-send
6
- category: verification
7
- ---
8
-
9
- # Evidence-Based Verification Skill
10
-
11
- ## Core Philosophy
12
-
13
- > "Evidence before assertions. Always."
14
-
15
- Never claim code works without **seeing** it work. Tests passing is not enough. You must SEE the output.
16
-
17
- ## When to Use
18
-
19
- Use this skill during the SEND phase or any final verification:
20
- - Before shipping code
21
- - When validating implementation works
22
- - When gathering evidence for review
23
- - When tests pass but you need runtime proof
24
-
25
- ## The Three Laws
26
-
27
- 1. **See it working** - Screenshots, curl responses, CLI output. Actual evidence.
28
- 2. **Human checkpoint** - No auto-shipping. Human reviews evidence and decides.
29
- 3. **Fallback hierarchy** - If primary method fails, try the next method down.
30
-
31
- ## Project Type Detection
32
-
33
- First, detect what type of project you're verifying:
34
-
35
- | Detection Pattern | Project Type | Primary Method |
36
- |-------------------|--------------|----------------|
37
- | package.json + src/app or pages/ | Web app | Screenshot + test flows |
38
- | package.json + routes or controllers/ | API | Curl endpoints |
39
- | Cargo.toml + src/main.rs with clap | CLI | Run commands |
40
- | pyproject.toml + `__main__.py` | CLI | Run commands |
41
- | `**/lib.rs` or `setup.py` | Library | Run examples |
42
- | Dockerfile or docker-compose.yml | Service | Health check + logs |
43
-
44
- ## Verification Methods
45
-
46
- ### Web Applications
47
-
48
- ```bash
49
- # 1. Start dev server
50
- npm run dev &
51
- SERVER_PID=$!
52
-
53
- # 2. Wait for server (max 30s)
54
- sleep 5 # or: until curl -s http://localhost:3000 > /dev/null
55
-
56
- # 3. Verify key pages load
57
- curl -s http://localhost:3000 | head -20
58
-
59
- # 4. Check for errors (if browser available, take screenshots)
60
-
61
- # 5. Cleanup
62
- kill $SERVER_PID
63
- ```
64
-
65
- **Evidence to capture:**
66
- - Screenshots of key pages (if browser available)
67
- - HTML response showing expected content
68
- - Console errors (if any)
69
-
70
- ### Browser Verification (Playwright)
71
-
72
- For web applications, use Playwright for comprehensive browser verification:
73
-
74
- **Capabilities:**
75
- - **Verification**: Ensure web elements are present and functional
76
- - **Browsing**: Navigate through web pages to gather information
77
- - **Web Scraping**: Extract data from websites
78
- - **Testing**: Run end-to-end tests
79
- - **Screenshots**: Capture visual evidence of web states
80
-
81
- **Usage:**
82
- When a task involves browser interactions, use Playwright MCP tools:
83
- ```bash
84
- # Navigate to page
85
- playwright_browser_navigate(url="http://localhost:3000")
86
-
87
- # Take screenshot
88
- playwright_browser_take_screenshot(filename="homepage.png", type="png")
89
-
90
- # Interact with elements
91
- playwright_browser_click(ref="submit-button")
92
-
93
- # Verify element present
94
- playwright_browser_snapshot()
95
- ```
96
-
97
- **Best practices:**
98
- - Capture screenshots before and after interactions
99
- - Verify console for errors
100
- - Test across different viewport sizes
101
- - Document all browser-based evidence
102
-
103
- ### APIs
104
-
105
- ```bash
106
- # 1. Start server
107
- npm start &
108
- sleep 3
109
-
110
- # 2. Test key endpoints
111
- curl -s -w "%{http_code}" http://localhost:3000/api/health
112
- curl -s -X POST http://localhost:3000/api/[endpoint] \
113
- -H "Content-Type: application/json" \
114
- -d '{"test": true}'
115
-
116
- # 3. Verify response shapes match spec
117
- ```
118
-
119
- **Evidence to capture:**
120
- - Status codes for each endpoint
121
- - Response bodies (truncated if large)
122
- - Error responses
123
-
124
- ### CLI Tools
125
-
126
- ```bash
127
- # 1. Test help
128
- ./cli --help
129
- echo "Exit code: $?"
130
-
131
- # 2. Test primary commands
132
- ./cli [command] test-input.txt
133
- echo "Exit code: $?"
134
-
135
- # 3. Test error handling
136
- ./cli [command] nonexistent.txt
137
- echo "Exit code: $?" # Should be non-zero
138
- ```
139
-
140
- **Evidence to capture:**
141
- - Command output (stdout)
142
- - Error output (stderr)
143
- - Exit codes
144
-
145
- ### Libraries
146
-
147
- ```bash
148
- # 1. Run tests (confirm they pass)
149
- npm test
150
-
151
- # 2. Run examples from documentation
152
- node examples/basic-usage.js
153
-
154
- # 3. Check types compile
155
- npm run typecheck
156
- ```
157
-
158
- **Evidence to capture:**
159
- - Example output
160
- - Test coverage summary
161
-
162
- ## Fallback Hierarchy
163
-
164
- If primary method fails, fall back in order:
165
-
166
- ```
167
- 1. Full runtime (screenshot/curl/run) ─ FAILED
168
-
169
- └─► 2. Integration tests ─ FAILED
170
-
171
- └─► 3. Unit tests + examples ─ FAILED
172
-
173
- └─► 4. Type check + lint only ─ FAILED
174
-
175
- └─► 5. Code review only (last resort)
176
- └─► Report: "Unable to verify runtime behavior"
177
- ```
178
-
179
- **Always record:**
180
- - Which method was attempted
181
- - Why it failed
182
- - Which fallback was used
183
-
184
- ## Timeout Handling
185
-
186
- **30-second timeout per verification method.**
187
-
188
- If method times out:
189
- 1. Kill the process
190
- 2. Log the timeout
191
- 3. Try fallback method
192
-
193
- ## Evidence Storage
194
-
195
- Document evidence in MANUAL Verification Logs:
196
-
197
- ```markdown
198
- ## ✅ Verification Evidence
199
-
200
- ### Runtime Verification
201
- **Project Type**: web
202
- **Method Used**: curl + HTML inspection
203
- **Fallback Level**: Primary (no fallback needed)
204
-
205
- ### Evidence Captured
206
- 1. **Homepage**: Returns 200, contains expected title
207
- 2. **API Health**: /api/health returns {"status": "ok"}
208
- 3. **Login Flow**: Form renders, submits correctly
209
-
210
- ### Screenshots/Output
211
- - [Describe what was observed]
212
- - [Key behaviors verified]
213
-
214
- ### Result: ✅ VERIFIED
215
- ```
216
-
217
- ## Red Flags — STOP and Reassess
218
-
219
- If you're thinking any of these, you're about to violate the methodology:
220
-
221
- - "Tests pass, that's good enough" → **NO. SEE it working.**
222
- - "I'll verify after shipping" → **NO. Verify BEFORE ship.**
223
- - "The type checker caught everything" → **NO. Types don't catch runtime issues.**
224
- - "Screenshot failed but it probably works" → **NO. "Probably" isn't evidence.**
225
- - "It should work because..." → **NO. SHOW it works.**
226
- - "Evidence isn't necessary for this change" → **NO. Every change gets verified.**
227
-
228
- ## Rationalizations to Resist
229
-
230
- | Excuse | Reality |
231
- |--------|---------|
232
- | "Tests pass" | Tests aren't enough. SEE it working. |
233
- | "Type checker caught everything" | Types don't catch runtime issues. |
234
- | "It worked before" | Prove it works NOW. |
235
- | "Human checkpoint is formality" | Human checkpoint is the gate. |
236
- | "Code review is enough" | Code review is last resort fallback. |
237
- | "Tests are flaky, ignore failure" | Flaky tests hide real failures. Fix or accept with caveat. |
238
-
239
- ## Flaky Test Detection
240
-
241
- ```
242
- Test fails?
243
- ├── Re-run failed tests
244
- ├── Pass 2nd time?
245
- │ └── Yes → Note flakiness, accept with caveat
246
- └── Fail 2nd time?
247
- ├── Run isolated
248
- └── Still fail? → Real failure, must fix
249
- ```
250
-
251
- ## Output Format
252
-
253
- When verification completes:
254
-
255
- ```markdown
256
- ### Verification Summary
257
-
258
- | Check | Status | Method | Notes |
259
- |-------|--------|--------|-------|
260
- | Build | ✅ | npm run build | No errors |
261
- | Lint | ✅ | npm run lint | 0 issues |
262
- | Tests | ✅ | npm test | 47/47 passed |
263
- | Runtime | ✅ | curl + inspect | Evidence captured |
264
-
265
- ### Evidence
266
- - Homepage renders correctly (verified via curl)
267
- - API returns expected responses
268
- - No console errors observed
269
-
270
- ### Result: ✅ ALL CHECKS PASS + EVIDENCE GATHERED
271
- ```
272
-
273
- ## Anti-Patterns
274
-
275
- **Don't do these:**
276
-
277
- - Trusting test results without seeing code run
278
- - Skipping output verification because "tests pass"
279
- - Proceeding when verification method fails without fallback
280
- - Modifying code after verification passes (invalidates verification)
281
- - Ignoring flaky test warnings
282
- - Auto-approving without human checkpoint
283
-
284
- ---
285
-
286
- **Remember**: You are gathering EVIDENCE. The human decides if it ships. Your job is to SEE the code work and document what you observed.