opencode-see-image 0.9.2 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/settings.local.json +9 -1
- package/README.md +75 -68
- package/index.ts +143 -56
- package/package.json +1 -1
|
@@ -12,7 +12,15 @@
|
|
|
12
12
|
"Read(//Users/alfa/Documents/opencodeprojects/opencode-see-image/bun-types/**)",
|
|
13
13
|
"Bash(bun run *)",
|
|
14
14
|
"WebFetch(domain:docs.z.ai)",
|
|
15
|
-
"Bash(npm publish *)"
|
|
15
|
+
"Bash(npm publish *)",
|
|
16
|
+
"Bash(python3 -c ' *)",
|
|
17
|
+
"Bash(open -a Preview \"/Users/alfa/.claude/image-cache/31fd2007-9418-45bb-a3e5-d273327f5f78/3.png\" \"/Users/alfa/.claude/image-cache/31fd2007-9418-45bb-a3e5-d273327f5f78/4.png\" \"/Users/alfa/.claude/image-cache/31fd2007-9418-45bb-a3e5-d273327f5f78/5.png\")",
|
|
18
|
+
"Bash(ps -o etime= -p 82196)",
|
|
19
|
+
"Bash(echo \"STILL RUNNING \\($\\(ps -o etime= -p 82196)",
|
|
20
|
+
"Bash(awk '/export type TextPart = \\\\{/,/\\\\};/' node_modules/@opencode-ai/sdk/dist/gen/types.gen.d.t)",
|
|
21
|
+
"Bash(awk '{print $2, $9, $11, $12, $13}')",
|
|
22
|
+
"Bash(pkill -f \"[o]pencode run\")",
|
|
23
|
+
"Bash(pkill -f \"[o]pencode-run\")"
|
|
16
24
|
]
|
|
17
25
|
}
|
|
18
26
|
}
|
package/README.md
CHANGED
|
@@ -1,18 +1,18 @@
|
|
|
1
1
|
# opencode-see-image
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
give non-vision opencode models the ability to see images and screenshots by routing them to a vision-capable model.
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
when a user attaches a screenshot to a text-only model, opencode rejects it with an error. This plugin intercepts that flow by registering a `see_image` tool that sends the image to a vision model and returns a textual description the primary model can reason about.
|
|
6
6
|
|
|
7
|
-
##
|
|
7
|
+
## install
|
|
8
8
|
|
|
9
|
-
**
|
|
9
|
+
**one command (recommended):**
|
|
10
10
|
```bash
|
|
11
11
|
opencode plugin opencode-see-image --global
|
|
12
12
|
```
|
|
13
13
|
This installs the package and adds it to your config. Then restart opencode.
|
|
14
14
|
|
|
15
|
-
**
|
|
15
|
+
**edit config manually:**
|
|
16
16
|
|
|
17
17
|
Add the plugin to your opencode config:
|
|
18
18
|
|
|
@@ -25,76 +25,76 @@ Add the plugin to your opencode config:
|
|
|
25
25
|
```
|
|
26
26
|
Then restart opencode.
|
|
27
27
|
|
|
28
|
-
##
|
|
28
|
+
## install via your agent (for some reason?)
|
|
29
29
|
|
|
30
|
-
|
|
30
|
+
ask your agent:
|
|
31
31
|
```
|
|
32
32
|
install the opencode-see-image plugin
|
|
33
33
|
```
|
|
34
|
-
|
|
34
|
+
it'll run `opencode plugin opencode-see-image --global` and tell you to restart.
|
|
35
35
|
|
|
36
|
-
##
|
|
36
|
+
## prerequisites
|
|
37
37
|
|
|
38
|
-
|
|
38
|
+
you need a connected vision-capable provider. The plugin auto-detects whichever one you have connected; **either of these works**:
|
|
39
39
|
|
|
40
|
-
###
|
|
41
|
-
1.
|
|
42
|
-
2.
|
|
43
|
-
3.
|
|
40
|
+
### free (OpenCode Zen)
|
|
41
|
+
1. run `/connect` in opencode
|
|
42
|
+
2. select **opencode** (OpenCode Zen)
|
|
43
|
+
3. paste your API key from [opencode.ai/auth](https://opencode.ai/auth)
|
|
44
44
|
|
|
45
|
-
|
|
45
|
+
the plugin falls back to **mimo-v2.5-free**.
|
|
46
46
|
|
|
47
|
-
###
|
|
48
|
-
1.
|
|
49
|
-
2.
|
|
50
|
-
3.
|
|
47
|
+
### paid, w/ OpenCode Go
|
|
48
|
+
1. run `/connect` in opencode
|
|
49
|
+
2. select **opencode-go**
|
|
50
|
+
3. paste your API key from [opencode.ai/auth](https://opencode.ai/auth)
|
|
51
51
|
|
|
52
|
-
|
|
52
|
+
the plugin prefers **minimax-m3** via opencode-go when available.
|
|
53
53
|
|
|
54
|
-
###
|
|
54
|
+
### paid, w/ another provider
|
|
55
55
|
|
|
56
|
-
|
|
56
|
+
set the `SEE_IMAGE_*` env vars to point at any Anthropic-Messages-compatible endpoint. see [Configuration](#configuration) below.
|
|
57
57
|
|
|
58
|
-
**
|
|
58
|
+
**the resolve order:** explicit `SEE_IMAGE_API_KEY` env → configured `SEE_IMAGE_PROVIDER` → `opencode-go` (MiniMax M3) → `opencode` (mimo-v2.5-free).
|
|
59
59
|
|
|
60
|
-
##
|
|
60
|
+
## how the _eye surgery_ works
|
|
61
61
|
|
|
62
62
|
```
|
|
63
63
|
user attaches screenshot
|
|
64
|
-
|
|
65
|
-
|
|
64
|
+
|
|
|
65
|
+
v
|
|
66
66
|
opencode rejects it: 'this model does not support image input'
|
|
67
|
-
|
|
68
|
-
|
|
67
|
+
| (the model only sees the filename)
|
|
68
|
+
v
|
|
69
69
|
plugin's system-prompt instructions tell the model to call see_image
|
|
70
|
-
|
|
71
|
-
|
|
70
|
+
|
|
|
71
|
+
v
|
|
72
72
|
see_image tool:
|
|
73
|
-
1. queries opencode's SQLite DB for the image
|
|
73
|
+
1. queries opencode's SQLite DB for the image
|
|
74
74
|
2. falls back to filesystem search if not in DB
|
|
75
75
|
3. sends the image to the vision model via opencode's SDK
|
|
76
76
|
4. returns the textual description
|
|
77
|
-
|
|
78
|
-
|
|
77
|
+
|
|
|
78
|
+
v
|
|
79
79
|
primary model answers using the description
|
|
80
80
|
```
|
|
81
81
|
|
|
82
|
-
##
|
|
82
|
+
## the `see_image` tool
|
|
83
83
|
|
|
84
|
-
|
|
84
|
+
the plugin registers a `see_image` tool with two arguments:
|
|
85
85
|
|
|
86
|
-
|
|
|
86
|
+
| arg | type | required? | description |
|
|
87
87
|
|---|---|---|---|
|
|
88
|
-
| `filePath` | string |
|
|
89
|
-
| `question` | string |
|
|
88
|
+
| `filePath` | string | y | path to the image. Absolute path, or a bare filename like `"Screenshot 2026-06-18 at 17.32.24.png"` to auto-locate. |
|
|
89
|
+
| `question` | string | n | a specific question about the image. Defaults to a general detailed description. Use this to focus on a particular detail (e.g. `"What error is shown in the terminal?"`). |
|
|
90
90
|
|
|
91
|
-
|
|
91
|
+
your model calls this tool automatically when you attach a screenshot; you don't need to do anything special. The `question` arg is optional; the model uses it when you ask something specific about the image.
|
|
92
92
|
|
|
93
|
-
##
|
|
93
|
+
## configuration
|
|
94
94
|
|
|
95
|
-
|
|
95
|
+
all settings are env-var overrides. The plugin uses opencode's SDK client by default (handles auth automatically). Set `SEE_IMAGE_API_KEY` to bypass the SDK and call an HTTP endpoint directly.
|
|
96
96
|
|
|
97
|
-
|
|
|
97
|
+
| env var | default | description |
|
|
98
98
|
|---|---|---|
|
|
99
99
|
| `SEE_IMAGE_MODEL` | `minimax-m3` | Vision model ID |
|
|
100
100
|
| `SEE_IMAGE_PROVIDER` | `opencode-go` | Provider ID for SDK routing |
|
|
@@ -102,11 +102,17 @@ All settings are env-var overrides. The plugin uses opencode's SDK client by def
|
|
|
102
102
|
| `SEE_IMAGE_ENDPOINT` | `https://opencode.ai/zen/go/v1/messages` | HTTP endpoint (only used if `SEE_IMAGE_API_KEY` is set) |
|
|
103
103
|
| `SEE_IMAGE_API_VERSION` | `2023-06-01` | `anthropic-version` header (HTTP mode only) |
|
|
104
104
|
| `SEE_IMAGE_USER_AGENT` | _(Chrome UA)_ | User-Agent header (HTTP mode only) |
|
|
105
|
-
| `SEE_IMAGE_TIMEOUT` | `30000` |
|
|
105
|
+
| `SEE_IMAGE_TIMEOUT` | `30000` | Timeout in ms for session setup and HTTP-mode calls. |
|
|
106
|
+
| `SEE_IMAGE_STALL_TIMEOUT` | `60000` | Stall timeout in ms (SDK streaming). The call is only aborted if the vision model produces no new tokens for this long — so long transcriptions keep running as long as they're progressing. |
|
|
107
|
+
| `SEE_IMAGE_MAX_TIMEOUT` | `0` | Absolute cap in ms on a single streaming call. `0` = no cap. |
|
|
106
108
|
|
|
107
|
-
###
|
|
109
|
+
### streaming
|
|
108
110
|
|
|
109
|
-
|
|
111
|
+
On the SDK path the plugin streams the vision model's output and shows live progress in the tool call (`see_image: reading… N chars`). Instead of a hard timeout, it uses a **stall timeout** (`SEE_IMAGE_STALL_TIMEOUT`): a slow-but-progressing model (e.g. transcribing a huge table) runs to completion, while a genuinely hung call is still reaped. If a call is cut short, whatever was streamed so far is returned rather than nothing.
|
|
112
|
+
|
|
113
|
+
### using a different vision model
|
|
114
|
+
|
|
115
|
+
any Anthropic-Messages-compatible endpoint works. for example, to use a direct MiniMax key:
|
|
110
116
|
|
|
111
117
|
```bash
|
|
112
118
|
export SEE_IMAGE_ENDPOINT="https://api.minimax.io/v1/messages"
|
|
@@ -114,60 +120,61 @@ export SEE_IMAGE_MODEL="minimax-m3"
|
|
|
114
120
|
export SEE_IMAGE_API_KEY="your-minimax-key"
|
|
115
121
|
```
|
|
116
122
|
|
|
117
|
-
|
|
123
|
+
to use a different opencode-go model (e.g. Kimi K2.7):
|
|
118
124
|
|
|
119
125
|
```bash
|
|
120
126
|
export SEE_IMAGE_MODEL="kimi-k2.7-code"
|
|
121
127
|
```
|
|
122
128
|
|
|
123
|
-
###
|
|
129
|
+
### verified vision-capable models
|
|
124
130
|
|
|
125
131
|
**Free (OpenCode Zen):**
|
|
126
132
|
|
|
127
|
-
|
|
|
128
|
-
|
|
129
|
-
| `mimo-v2.5-free` |
|
|
130
|
-
| `big-pickle` |
|
|
133
|
+
| model | Notes |
|
|
134
|
+
|---|---|
|
|
135
|
+
| `mimo-v2.5-free` | free. may be a bit slow. default fallback when only Zen is connected (routed via CLI). |
|
|
136
|
+
| `big-pickle` | for some reason, big pickle works as an image capable model when called through the sdk w/ an active opencode go sub. |
|
|
131
137
|
|
|
132
|
-
**
|
|
138
|
+
**paid (OpenCode Go):**
|
|
133
139
|
|
|
134
|
-
|
|
|
140
|
+
| model | speed | notes |
|
|
135
141
|
|---|---|---|
|
|
136
|
-
| `minimax-m3` | ~3000ms |
|
|
137
|
-
| `kimi-k2.7-code` | ~7000ms |
|
|
138
|
-
| `kimi-k2.6` | ~
|
|
139
|
-
| `qwen3.7-plus` | ~
|
|
142
|
+
| `minimax-m3` | ~3000ms | default. fast, clean, and accurate. |
|
|
143
|
+
| `kimi-k2.7-code` | ~7000ms | clean and accurate. |
|
|
144
|
+
| `kimi-k2.6` | ~12000ms | accurate but slow. |
|
|
145
|
+
| `qwen3.7-plus` | ~15000ms | slow, spends a bit more tokens because of thinking. |
|
|
140
146
|
|
|
141
|
-
##
|
|
147
|
+
## updating
|
|
142
148
|
|
|
143
|
-
**
|
|
149
|
+
**auto-update (built in):** uses the opencode-plugin-update-kit and shows a toast: *"opencode-see-image updated to X.Y.Z, restart opencode to apply"*. You just need to restart opencode to load the new version.
|
|
144
150
|
|
|
145
|
-
**
|
|
151
|
+
**manual update:**
|
|
146
152
|
```bash
|
|
147
153
|
opencode plugin opencode-see-image --force --global
|
|
148
154
|
```
|
|
149
|
-
|
|
155
|
+
then restart opencode.
|
|
150
156
|
|
|
151
|
-
**
|
|
157
|
+
**pin a version** in your config to opt out of auto-updates:
|
|
152
158
|
```jsonc
|
|
153
159
|
"plugin": ["opencode-see-image@0.4.2"]
|
|
154
160
|
```
|
|
155
161
|
|
|
156
|
-
##
|
|
162
|
+
## limitations
|
|
157
163
|
|
|
158
|
-
- **macOS-only filesystem search
|
|
164
|
+
- **macOS-only filesystem search**. the filesystem fallback targets macOS screenshot temp dirs. Linux/Windows users should rely on the DB lookup (which is cross-platform) or pass absolute paths.
|
|
165
|
+
> if you can add compat for more platforms, i would love a pr.
|
|
159
166
|
|
|
160
|
-
##
|
|
167
|
+
## file search locations
|
|
161
168
|
|
|
162
|
-
|
|
169
|
+
when opencode rejects an image attachment, the model only receives a bare filename. `see_image` searches these locations in order:
|
|
163
170
|
|
|
164
171
|
1. `$TMPDIR/TemporaryItems/NSIRD_screencaptureui_*/` (where macOS stashes dragged screenshots)
|
|
165
172
|
2. `$TMPDIR/TemporaryItems/`
|
|
166
173
|
3. `~/Desktop` (default screenshot save location)
|
|
167
174
|
4. `~/Downloads`
|
|
168
|
-
5.
|
|
175
|
+
5. current working directory
|
|
169
176
|
|
|
170
|
-
|
|
177
|
+
pass an absolute `filePath` to skip the search.
|
|
171
178
|
|
|
172
179
|
## License
|
|
173
180
|
|
package/index.ts
CHANGED
|
@@ -12,6 +12,12 @@ const ENDPOINT =
|
|
|
12
12
|
const MODEL = process.env.SEE_IMAGE_MODEL || "minimax-m3"
|
|
13
13
|
const PROVIDER_ID = process.env.SEE_IMAGE_PROVIDER || "opencode-go"
|
|
14
14
|
const TIMEOUT = parseInt(process.env.SEE_IMAGE_TIMEOUT || "30000", 10)
|
|
15
|
+
// Stall timeout: while streaming, only abort if the model produces no new
|
|
16
|
+
// tokens for this long. Lets long transcriptions run as long as they keep
|
|
17
|
+
// progressing. Used for the SDK streaming path.
|
|
18
|
+
const STALL_TIMEOUT = parseInt(process.env.SEE_IMAGE_STALL_TIMEOUT || "60000", 10)
|
|
19
|
+
// Optional absolute cap on a single streaming call (0 = no cap).
|
|
20
|
+
const MAX_TIMEOUT = parseInt(process.env.SEE_IMAGE_MAX_TIMEOUT || "0", 10)
|
|
15
21
|
const API_VERSION = process.env.SEE_IMAGE_API_VERSION || "2023-06-01"
|
|
16
22
|
const USER_AGENT =
|
|
17
23
|
process.env.SEE_IMAGE_USER_AGENT ||
|
|
@@ -216,12 +222,15 @@ function readProviderKey(providerID: string): string | null {
|
|
|
216
222
|
}
|
|
217
223
|
}
|
|
218
224
|
|
|
225
|
+
type ProgressFn = (info: { chars: number; preview: string; provider: string; model: string }) => void
|
|
226
|
+
|
|
219
227
|
async function seeImageViaSDK(
|
|
220
228
|
client: any,
|
|
221
229
|
dataUrl: string,
|
|
222
230
|
mediaType: string,
|
|
223
231
|
prompt: string,
|
|
224
232
|
abort?: AbortSignal,
|
|
233
|
+
onProgress?: ProgressFn,
|
|
225
234
|
): Promise<{ text: string; model: string; provider: string }> {
|
|
226
235
|
const errors: string[] = []
|
|
227
236
|
|
|
@@ -288,6 +297,115 @@ async function seeImageViaSDK(
|
|
|
288
297
|
return null
|
|
289
298
|
}
|
|
290
299
|
|
|
300
|
+
// Stream a vision response from a paid/SDK provider. Subscribes to opencode's
|
|
301
|
+
// event stream so we can (a) surface live progress and (b) use a *stall*
|
|
302
|
+
// timeout — we only give up if the model goes quiet for STALL_TIMEOUT, so a
|
|
303
|
+
// long transcription keeps running as long as it's producing tokens. Returns
|
|
304
|
+
// whatever text was produced, even if a stall/abort cut it short (partial).
|
|
305
|
+
const streamCandidate = async (
|
|
306
|
+
providerID: string,
|
|
307
|
+
modelID: string,
|
|
308
|
+
): Promise<string | null> => {
|
|
309
|
+
const sessionRes = await Promise.race([
|
|
310
|
+
client.session.create({ body: {} }),
|
|
311
|
+
new Promise<never>((_, reject) =>
|
|
312
|
+
setTimeout(
|
|
313
|
+
() => reject(new Error(`session.create timed out after ${TIMEOUT}ms`)),
|
|
314
|
+
TIMEOUT,
|
|
315
|
+
),
|
|
316
|
+
),
|
|
317
|
+
])
|
|
318
|
+
const sessionID: string | undefined = sessionRes.data?.id
|
|
319
|
+
if (!sessionID) throw new Error("no session ID")
|
|
320
|
+
|
|
321
|
+
const controller = new AbortController()
|
|
322
|
+
const onAbort = () => controller.abort()
|
|
323
|
+
abort?.addEventListener("abort", onAbort)
|
|
324
|
+
|
|
325
|
+
// Subscribe to events before prompting so we don't miss early tokens.
|
|
326
|
+
let stream: AsyncGenerator<any> | undefined
|
|
327
|
+
try {
|
|
328
|
+
const sub = await client.event.subscribe()
|
|
329
|
+
stream = sub?.stream
|
|
330
|
+
} catch {}
|
|
331
|
+
|
|
332
|
+
const partsByID = new Map<string, string>()
|
|
333
|
+
let streamedText = ""
|
|
334
|
+
let lastActivity = Date.now()
|
|
335
|
+
let finished = false
|
|
336
|
+
|
|
337
|
+
const consume = (async () => {
|
|
338
|
+
if (!stream) return
|
|
339
|
+
try {
|
|
340
|
+
for await (const ev of stream) {
|
|
341
|
+
if (finished) break
|
|
342
|
+
if (
|
|
343
|
+
ev?.type === "message.part.updated" &&
|
|
344
|
+
ev.properties?.part?.type === "text" &&
|
|
345
|
+
ev.properties.part.sessionID === sessionID
|
|
346
|
+
) {
|
|
347
|
+
const p = ev.properties.part
|
|
348
|
+
partsByID.set(p.id, typeof p.text === "string" ? p.text : "")
|
|
349
|
+
streamedText = [...partsByID.values()].join("\n").trim()
|
|
350
|
+
lastActivity = Date.now()
|
|
351
|
+
onProgress?.({
|
|
352
|
+
chars: streamedText.length,
|
|
353
|
+
preview: streamedText.slice(-160),
|
|
354
|
+
provider: providerID,
|
|
355
|
+
model: modelID,
|
|
356
|
+
})
|
|
357
|
+
}
|
|
358
|
+
}
|
|
359
|
+
} catch {}
|
|
360
|
+
})()
|
|
361
|
+
|
|
362
|
+
// Stall watchdog (only when we actually have a stream to measure activity).
|
|
363
|
+
const stallTimer = stream
|
|
364
|
+
? setInterval(() => {
|
|
365
|
+
if (Date.now() - lastActivity > STALL_TIMEOUT) controller.abort()
|
|
366
|
+
}, 1000)
|
|
367
|
+
: undefined
|
|
368
|
+
const maxTimer =
|
|
369
|
+
MAX_TIMEOUT > 0 ? setTimeout(() => controller.abort(), MAX_TIMEOUT) : undefined
|
|
370
|
+
|
|
371
|
+
let res: any
|
|
372
|
+
try {
|
|
373
|
+
res = await client.session.prompt({
|
|
374
|
+
path: { id: sessionID },
|
|
375
|
+
body: {
|
|
376
|
+
model: { providerID, modelID },
|
|
377
|
+
parts: [
|
|
378
|
+
{ type: "file", mime: mediaType, url: dataUrl },
|
|
379
|
+
{ type: "text", text: prompt },
|
|
380
|
+
],
|
|
381
|
+
tools: {},
|
|
382
|
+
system:
|
|
383
|
+
"You are a vision assistant. Describe the image accurately and concisely. Answer with text only.",
|
|
384
|
+
},
|
|
385
|
+
signal: controller.signal,
|
|
386
|
+
})
|
|
387
|
+
} catch (e: any) {
|
|
388
|
+
// Aborted by stall/max/external — fall through to whatever we streamed.
|
|
389
|
+
if (!streamedText) throw e
|
|
390
|
+
} finally {
|
|
391
|
+
finished = true
|
|
392
|
+
if (stallTimer) clearInterval(stallTimer)
|
|
393
|
+
if (maxTimer) clearTimeout(maxTimer)
|
|
394
|
+
try { await stream?.return?.(undefined) } catch {}
|
|
395
|
+
abort?.removeEventListener("abort", onAbort)
|
|
396
|
+
client.session.delete({ path: { id: sessionID } }).catch(() => {})
|
|
397
|
+
}
|
|
398
|
+
|
|
399
|
+
const finalText = (res?.data?.parts ?? [])
|
|
400
|
+
.filter((p: any) => p.type === "text")
|
|
401
|
+
.map((p: any) => p.text)
|
|
402
|
+
.filter((t: any) => typeof t === "string" && t.length > 0)
|
|
403
|
+
.join("\n")
|
|
404
|
+
.trim()
|
|
405
|
+
|
|
406
|
+
return finalText || streamedText || null
|
|
407
|
+
}
|
|
408
|
+
|
|
291
409
|
let result: { text: string; model: string; provider: string } | undefined
|
|
292
410
|
|
|
293
411
|
try {
|
|
@@ -312,56 +430,8 @@ async function seeImageViaSDK(
|
|
|
312
430
|
continue
|
|
313
431
|
}
|
|
314
432
|
|
|
315
|
-
let sessionID: string | undefined
|
|
316
433
|
try {
|
|
317
|
-
const
|
|
318
|
-
client.session.create({ body: {} }),
|
|
319
|
-
new Promise<never>((_, reject) =>
|
|
320
|
-
setTimeout(
|
|
321
|
-
() => reject(new Error(`session.create timed out after ${TIMEOUT}ms`)),
|
|
322
|
-
TIMEOUT,
|
|
323
|
-
),
|
|
324
|
-
),
|
|
325
|
-
])
|
|
326
|
-
sessionID = sessionRes.data?.id
|
|
327
|
-
if (!sessionID) {
|
|
328
|
-
errors.push(`${providerID}/${modelID}: no session ID`)
|
|
329
|
-
continue
|
|
330
|
-
}
|
|
331
|
-
|
|
332
|
-
const controller = new AbortController()
|
|
333
|
-
const onAbort = () => controller.abort()
|
|
334
|
-
abort?.addEventListener("abort", onAbort)
|
|
335
|
-
const timer = setTimeout(() => controller.abort(), TIMEOUT)
|
|
336
|
-
let res
|
|
337
|
-
try {
|
|
338
|
-
res = await client.session.prompt({
|
|
339
|
-
path: { id: sessionID },
|
|
340
|
-
body: {
|
|
341
|
-
model: { providerID, modelID },
|
|
342
|
-
parts: [
|
|
343
|
-
{ type: "file", mime: mediaType, url: dataUrl },
|
|
344
|
-
{ type: "text", text: prompt },
|
|
345
|
-
],
|
|
346
|
-
tools: {},
|
|
347
|
-
system:
|
|
348
|
-
"You are a vision assistant. Describe the image accurately and concisely. Answer with text only.",
|
|
349
|
-
},
|
|
350
|
-
signal: controller.signal,
|
|
351
|
-
})
|
|
352
|
-
} finally {
|
|
353
|
-
clearTimeout(timer)
|
|
354
|
-
abort?.removeEventListener("abort", onAbort)
|
|
355
|
-
}
|
|
356
|
-
|
|
357
|
-
const parts = res.data?.parts ?? []
|
|
358
|
-
const text = (parts as any[])
|
|
359
|
-
.filter((p: any) => p.type === "text")
|
|
360
|
-
.map((p: any) => p.text)
|
|
361
|
-
.filter((t: any) => typeof t === "string" && t.length > 0)
|
|
362
|
-
.join("\n")
|
|
363
|
-
.trim()
|
|
364
|
-
|
|
434
|
+
const text = await streamCandidate(providerID, modelID)
|
|
365
435
|
if (text) {
|
|
366
436
|
result = { text, model: modelID, provider: providerID }
|
|
367
437
|
break
|
|
@@ -369,12 +439,6 @@ async function seeImageViaSDK(
|
|
|
369
439
|
errors.push(`${providerID}/${modelID}: no text in response`)
|
|
370
440
|
} catch (e: any) {
|
|
371
441
|
errors.push(`${providerID}/${modelID}: ${e?.message ?? e}`)
|
|
372
|
-
} finally {
|
|
373
|
-
if (sessionID) {
|
|
374
|
-
await client.session
|
|
375
|
-
.delete({ path: { id: sessionID } })
|
|
376
|
-
.catch(() => {})
|
|
377
|
-
}
|
|
378
442
|
}
|
|
379
443
|
}
|
|
380
444
|
|
|
@@ -522,7 +586,17 @@ const SeeImagePlugin: Plugin = async (ctx) => {
|
|
|
522
586
|
.string()
|
|
523
587
|
.optional()
|
|
524
588
|
.describe(
|
|
525
|
-
|
|
589
|
+
[
|
|
590
|
+
"What to ask the vision model. Omit for a general detailed description.",
|
|
591
|
+
"Tailor it to the situation for much better results:",
|
|
592
|
+
'- Reading/transcribing text or code: "Transcribe all text exactly, preserving layout, line breaks, and code indentation."',
|
|
593
|
+
'- An error or stack trace screenshot: "Quote the exact error message and stack trace, then state the likely cause."',
|
|
594
|
+
'- Reproducing a UI as code: "Describe the layout, components, text, colors, and spacing precisely enough to rebuild this UI in code."',
|
|
595
|
+
'- A technical diagram/architecture: "Explain this diagram: list each component and the relationships and data/flow direction between them."',
|
|
596
|
+
'- A chart/graph/dashboard: "Read this visualization: axes, series, key values, and the main takeaway."',
|
|
597
|
+
'- Comparing against an expected design: "Describe this UI in detail so it can be diffed against an expected layout (note any visible defects or misalignment)."',
|
|
598
|
+
"Otherwise pass the user's own specific question verbatim.",
|
|
599
|
+
].join("\n"),
|
|
526
600
|
),
|
|
527
601
|
},
|
|
528
602
|
async execute(args, context) {
|
|
@@ -539,12 +613,25 @@ const SeeImagePlugin: Plugin = async (ctx) => {
|
|
|
539
613
|
const b64 = resolved.dataUrl.split(",")[1] || ""
|
|
540
614
|
result = await seeImageViaHTTP(b64, resolved.mediaType, prompt, context.abort)
|
|
541
615
|
} else {
|
|
616
|
+
// Throttle live progress updates so we don't spam the UI while the
|
|
617
|
+
// vision model streams a long response.
|
|
618
|
+
let lastUpdate = 0
|
|
619
|
+
const onProgress: ProgressFn = (info) => {
|
|
620
|
+
const now = Date.now()
|
|
621
|
+
if (now - lastUpdate < 400) return
|
|
622
|
+
lastUpdate = now
|
|
623
|
+
context.metadata({
|
|
624
|
+
title: `see_image: reading… ${info.chars} chars (${info.model})`,
|
|
625
|
+
metadata: { streaming: true, chars: info.chars, preview: info.preview },
|
|
626
|
+
})
|
|
627
|
+
}
|
|
542
628
|
result = await seeImageViaSDK(
|
|
543
629
|
client,
|
|
544
630
|
resolved.dataUrl,
|
|
545
631
|
resolved.mediaType,
|
|
546
632
|
prompt,
|
|
547
633
|
context.abort,
|
|
634
|
+
onProgress,
|
|
548
635
|
)
|
|
549
636
|
}
|
|
550
637
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "opencode-see-image",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.10.0",
|
|
4
4
|
"description": "Give non-vision opencode models the ability to see images/screenshots by routing them to a vision-capable model (MiniMax M3 via opencode-go by default).",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "index.ts",
|