opencode-vision 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +49 -21
- package/SKILL.md +81 -19
- package/dist/index.js +5 -0
- package/package.json +2 -1
package/README.md
CHANGED
|
@@ -23,26 +23,36 @@ delegates to a vision subagent, and parses a typed report.
|
|
|
23
23
|
|
|
24
24
|
## Install
|
|
25
25
|
|
|
26
|
-
|
|
26
|
+
Two parts: the plugin (registers subagents) and the skill (the SKILL.md the
|
|
27
|
+
agent sees). Both are one-line commands.
|
|
28
|
+
|
|
29
|
+
### 1. Install the plugin (subagent registration)
|
|
30
|
+
|
|
31
|
+
Add to your `~/.config/opencode/opencode.json`:
|
|
27
32
|
|
|
28
33
|
```json
|
|
29
34
|
{
|
|
30
35
|
"$schema": "https://opencode.ai/config.json",
|
|
31
36
|
"plugin": [
|
|
32
37
|
"opencode-vision"
|
|
33
|
-
]
|
|
34
|
-
"skills": {
|
|
35
|
-
"paths": [
|
|
36
|
-
"~/.cache/opencode/node_modules/opencode-vision"
|
|
37
|
-
]
|
|
38
|
-
}
|
|
38
|
+
]
|
|
39
39
|
}
|
|
40
40
|
```
|
|
41
41
|
|
|
42
|
-
opencode auto-installs the npm package via Bun on next launch
|
|
43
|
-
`
|
|
44
|
-
|
|
45
|
-
|
|
42
|
+
opencode auto-installs the npm package via Bun on next launch. The plugin's
|
|
43
|
+
`config(cfg)` hook registers 10 `vision-*` subagents programmatically.
|
|
44
|
+
|
|
45
|
+
### 2. Install the skill (SKILL.md discovery)
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
npx skills add WeZZard/skills -a opencode -g --skill vision
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
This uses the [open agent skills CLI](https://github.com/vercel-labs/skills)
|
|
52
|
+
to fetch `SKILL.md` from this repo and drop it into
|
|
53
|
+
`~/.agents/skills/vision/SKILL.md` — a directory opencode scans by default
|
|
54
|
+
(along with `~/.config/opencode/skills/`). No `skills.paths` config entry
|
|
55
|
+
needed.
|
|
46
56
|
|
|
47
57
|
The old `~/.config/opencode/agents/visual-judge.md` subagent is removed —
|
|
48
58
|
this plugin replaces it with 10 typed `vision-*` subagents. Delete the old
|
|
@@ -52,16 +62,16 @@ file if present:
|
|
|
52
62
|
rm -f ~/.config/opencode/agents/visual-judge.md
|
|
53
63
|
```
|
|
54
64
|
|
|
55
|
-
Restart opencode for
|
|
65
|
+
Restart opencode for both changes to take effect.
|
|
56
66
|
|
|
57
|
-
> **Why
|
|
58
|
-
>
|
|
59
|
-
>
|
|
60
|
-
>
|
|
61
|
-
>
|
|
62
|
-
>
|
|
63
|
-
>
|
|
64
|
-
>
|
|
67
|
+
> **Why two steps?** opencode's plugin loader resolves the npm package to its
|
|
68
|
+
> `dist/index.js` entrypoint and runs the `config(cfg)` hook that registers
|
|
69
|
+
> the 10 subagents. But opencode's *skill* loader scans filesystem directories
|
|
70
|
+
> for `SKILL.md` — it does not look inside npm packages automatically. The
|
|
71
|
+
> `npx skills` command bridges this gap by placing `SKILL.md` where opencode's
|
|
72
|
+
> default skill scan finds it. This is a workaround for opencode bug #33896
|
|
73
|
+
> (plugin-registered skills not discoverable); it will be withdrawn once the
|
|
74
|
+
> upstream fix (PR #33918) ships.
|
|
65
75
|
|
|
66
76
|
## Verify
|
|
67
77
|
|
|
@@ -116,6 +126,9 @@ opencode/vision/ # this sub-package, published as opencode-visi
|
|
|
116
126
|
visual-judgment-request.v1.json
|
|
117
127
|
visual-judgment-report.v1.json
|
|
118
128
|
README.md # this file
|
|
129
|
+
|
|
130
|
+
skills/vision/SKILL.md # symlink → ../../opencode/vision/SKILL.md
|
|
131
|
+
# lets npx skills discover and install the skill
|
|
119
132
|
```
|
|
120
133
|
|
|
121
134
|
## Build & publish (maintainers)
|
|
@@ -162,4 +175,19 @@ Published via GitHub raw URLs (branch `main`):
|
|
|
162
175
|
|
|
163
176
|
The files also live in this repo under `opencode/vision/schemas/` for
|
|
164
177
|
editing. The URL is the canonical `$id`/`$schema` reference used by the
|
|
165
|
-
SKILL.md and subagent body.
|
|
178
|
+
SKILL.md and subagent body.
|
|
179
|
+
|
|
180
|
+
## Withdraw the skill workaround
|
|
181
|
+
|
|
182
|
+
When opencode bug [#33896](https://github.com/anomalyco/opencode/issues/33896)
|
|
183
|
+
is fixed (PR [#33918](https://github.com/anomalyco/opencode/pull/33918)
|
|
184
|
+
merged and shipped), the plugin can self-register the skill via the v2
|
|
185
|
+
`ctx.skill.transform()` API. At that point the `npx skills`-installed file
|
|
186
|
+
becomes redundant:
|
|
187
|
+
|
|
188
|
+
```bash
|
|
189
|
+
npx skills remove vision -a opencode -g
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
The plugin will then handle both subagent registration and skill discovery,
|
|
193
|
+
making the install a single `"plugin": ["opencode-vision"]` line.
|
package/SKILL.md
CHANGED
|
@@ -1,29 +1,49 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: vision
|
|
3
3
|
description: >-
|
|
4
|
-
Use when
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
4
|
+
Use when a tool result contains an image attachment the current model
|
|
5
|
+
cannot see (attachments[].mime = "image/png",
|
|
6
|
+
url = "data:image/png;base64,...") OR the user asks to visually
|
|
7
|
+
verify/check rendered content ("visually verify", "screenshot shows",
|
|
8
|
+
"centered/visible/hidden", "looks right", "matches the design").
|
|
9
|
+
Triggers on screenshots from chrome-devtools_take_screenshot,
|
|
10
|
+
playwright_browser_take_screenshot,
|
|
11
|
+
cua-driver_get_window_state/zoom/take_screenshot. Routes image bytes
|
|
12
|
+
to a vision-* subagent when the orchestrator's model is text-only
|
|
13
|
+
(e.g. glm-5.2, deepseek-v4-pro) and cannot see images itself.
|
|
14
|
+
Classifies intent into a typed judgment (presence/absence/alignment/
|
|
15
|
+
ordering/equality/layout/readability/state/diff/describe), asks the
|
|
16
|
+
user once per session which vision model to use, assembles a versioned
|
|
17
|
+
request, delegates, parses the typed report. Image paths from
|
|
18
|
+
screenshot_out_file/filePath; inline-only images saved to /tmp via
|
|
19
|
+
base64 -d.
|
|
13
20
|
---
|
|
14
21
|
|
|
15
22
|
# Vision — Visual Judgment Skill
|
|
16
23
|
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
returns a typed report.
|
|
24
|
+
When a task requires visual verification and the orchestrator's model
|
|
25
|
+
cannot see images, this skill routes image bytes to a vision subagent
|
|
26
|
+
that returns a typed report. The extraction pipeline is:
|
|
20
27
|
**Detect → Classify → Assemble → Pick model → Delegate → Parse**.
|
|
21
28
|
|
|
29
|
+
## When NOT to invoke this skill
|
|
30
|
+
|
|
31
|
+
If the orchestrator's model is itself vision-capable (e.g. you are
|
|
32
|
+
running on `kimi-for-coding/k2p7`, `openai/gpt-5.5`,
|
|
33
|
+
`ollama-cloud/gemini-3-flash-preview`, `opencode-go/qwen3.7-plus`, etc. —
|
|
34
|
+
the same models listed in Step 4's mapping table), do NOT delegate to a
|
|
35
|
+
vision subagent. Analyze the image attachment directly — you can see it.
|
|
36
|
+
This skill is only for orchestrators whose model cannot see images
|
|
37
|
+
(e.g. `ollama-cloud/glm-5.2`, `deepseek/deepseek-v4-pro`).
|
|
38
|
+
|
|
22
39
|
## Why this skill exists
|
|
23
40
|
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
41
|
+
Tool results in opencode can carry image attachments (`attachments[]`
|
|
42
|
+
with `mime: "image/*"` and `url: "data:image/...;base64,..."`). A model
|
|
43
|
+
trained without multimodal support sees the text part of these results
|
|
44
|
+
but the image bytes are invisible to it. This skill recognizes when such
|
|
45
|
+
an attachment is present and routes it to a vision subagent that can see
|
|
46
|
+
it, giving you a stable typed contract for the exchange.
|
|
27
47
|
|
|
28
48
|
## The two schemas
|
|
29
49
|
|
|
@@ -34,7 +54,7 @@ This skill gives you a stable contract for talking to one.
|
|
|
34
54
|
|
|
35
55
|
## Step 1. Detect
|
|
36
56
|
|
|
37
|
-
Visual-judgment intent arrives from
|
|
57
|
+
Visual-judgment intent arrives from three sources. Recognize all three.
|
|
38
58
|
|
|
39
59
|
### Source A — explicit visual-judgment language in a user prompt
|
|
40
60
|
|
|
@@ -67,6 +87,27 @@ welcome header." The text describes structure, but "looks right" is a
|
|
|
67
87
|
visual layout quality the text can't fully prove → you detect a
|
|
68
88
|
visual-judgment need.
|
|
69
89
|
|
|
90
|
+
### Source C — image attachment in a tool result
|
|
91
|
+
|
|
92
|
+
When any tool result in the transcript contains an `attachments[]` entry
|
|
93
|
+
with `mime` starting with `image/`, that is an image the orchestrator cannot
|
|
94
|
+
see. This is a trigger regardless of whether the user explicitly asked for
|
|
95
|
+
visual verification — the image's mere presence means a visual judgment
|
|
96
|
+
*could* be needed. Recognize these patterns:
|
|
97
|
+
|
|
98
|
+
| Tool | Signal in result | File path available? |
|
|
99
|
+
|---|---|---|
|
|
100
|
+
| `chrome-devtools_take_screenshot` | `attachments[].mime = image/png` | Yes, if `filePath` was passed to the tool |
|
|
101
|
+
| `playwright_browser_take_screenshot` | `attachments[].mime = image/png` | Yes, if `filename` was passed (saved to output dir) |
|
|
102
|
+
| `cua-driver_get_window_state` | `screenshot` field (base64) + `screenshot_file_path` if `screenshot_out_file` was passed | Yes, if `screenshot_out_file` is set |
|
|
103
|
+
| `cua-driver_zoom` | Cropped JPEG returned inline | **No** — inline only, must be saved to disk first (see 3f) |
|
|
104
|
+
| `cua-driver_take_screenshot` | `attachments[].mime = image/png` | Yes, if `filePath` is set |
|
|
105
|
+
|
|
106
|
+
**Gating rule**: auto-invoke only when the user's current task has a
|
|
107
|
+
visual component (layout, alignment, presence, state, readability — see
|
|
108
|
+
Step 2). If the task has no visual component, do nothing; note the image
|
|
109
|
+
is available if needed later.
|
|
110
|
+
|
|
70
111
|
## Step 2. Classify
|
|
71
112
|
|
|
72
113
|
Map the NL task to one of the 10 closed `judgment.type` values. Each has
|
|
@@ -147,9 +188,30 @@ screenshot-save instructions.
|
|
|
147
188
|
### 3e. Edge case — built-in computer-use MCP
|
|
148
189
|
|
|
149
190
|
The built-in Claude Code `computer-use` MCP returns screenshots as inline
|
|
150
|
-
base64 images, not file paths.
|
|
151
|
-
|
|
152
|
-
`
|
|
191
|
+
base64 images, not file paths. The vision subagent needs a file path to
|
|
192
|
+
`read`. Prefer `cua-driver` for desktop visual judgments — it has
|
|
193
|
+
`screenshot_out_file`.
|
|
194
|
+
|
|
195
|
+
### 3f. Inline-only image attachments (no file path)
|
|
196
|
+
|
|
197
|
+
Some tool results return image attachments with
|
|
198
|
+
`attachments[].url = "data:image/...;base64,..."` but **no file path** —
|
|
199
|
+
e.g. `cua-driver_zoom` (inline-only, no path param), or
|
|
200
|
+
`playwright_browser_take_screenshot` called without a `filename`. The
|
|
201
|
+
vision subagent needs a file path to `read`. Save the inline image to
|
|
202
|
+
disk first:
|
|
203
|
+
|
|
204
|
+
```
|
|
205
|
+
If a tool result has attachments[].url starting "data:image/...;base64,"
|
|
206
|
+
but no file path:
|
|
207
|
+
1. Extract the base64 payload from the data URL (the part after
|
|
208
|
+
";base64,").
|
|
209
|
+
2. Write it to /tmp/vision-<random>.png via bash (use the extension that matches the data URL's MIME type, e.g. .jpg for image/jpeg):
|
|
210
|
+
echo "<base64>" | base64 -d > /tmp/vision-<random>.png
|
|
211
|
+
3. Use that path in the request's images[].path.
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
This is the recommended handling for any inline-only image result.
|
|
153
215
|
|
|
154
216
|
## Step 4. Pick model (once per session)
|
|
155
217
|
|
package/dist/index.js
CHANGED
|
@@ -7,6 +7,7 @@ var candidateDirs = [bundleDir, join(bundleDir, "..")];
|
|
|
7
7
|
var dataDir = candidateDirs.find((d) => existsSync(join(d, "vision-models.json")) && existsSync(join(d, "subagent-body.md"))) ?? bundleDir;
|
|
8
8
|
var manifest = JSON.parse(readFileSync(join(dataDir, "vision-models.json"), "utf8"));
|
|
9
9
|
var bodyTpl = readFileSync(join(dataDir, "subagent-body.md"), "utf8");
|
|
10
|
+
var VISION_CAPABLE_ORCHESTRATORS = new Set(manifest.models.map((m) => `${m.provider}/${m.model_id}`));
|
|
10
11
|
var PERMISSION = {
|
|
11
12
|
edit: "deny",
|
|
12
13
|
read: "allow",
|
|
@@ -23,6 +24,10 @@ function subagentName(entry) {
|
|
|
23
24
|
}
|
|
24
25
|
var plugin = async () => ({
|
|
25
26
|
config: async (cfg) => {
|
|
27
|
+
const orchestrator = cfg.model;
|
|
28
|
+
if (orchestrator && VISION_CAPABLE_ORCHESTRATORS.has(orchestrator)) {
|
|
29
|
+
return;
|
|
30
|
+
}
|
|
26
31
|
cfg.agent ??= {};
|
|
27
32
|
for (const e of manifest.models) {
|
|
28
33
|
const name = subagentName(e);
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "opencode-vision",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.2.0",
|
|
4
4
|
"description": "Typed visual-judgment skill for opencode. Registers 10 vision subagents (one per top-tier vision model across OpenAI, Kimi for Coding, Ollama Cloud, and opencode-go) and a skill that teaches a text-only orchestrator to extract visual-judgment intent, classify it into a typed judgment, and delegate to a vision subagent with a versioned request/report contract.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.js",
|
|
@@ -21,6 +21,7 @@
|
|
|
21
21
|
"prebuild": "rm -rf dist",
|
|
22
22
|
"build": "bun build ./plugin.ts --outfile ./dist/index.js --target node --format esm --packages external",
|
|
23
23
|
"prepublishOnly": "bun run build",
|
|
24
|
+
"sync:skill": "cp SKILL.md ../../skills/vision/SKILL.md",
|
|
24
25
|
"typecheck": "tsc --noEmit"
|
|
25
26
|
},
|
|
26
27
|
"keywords": [
|