opencode-see-image 0.9.3 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/settings.local.json +9 -1
- package/README.md +7 -1
- package/index.ts +132 -55
- package/package.json +1 -1
|
@@ -12,7 +12,15 @@
|
|
|
12
12
|
"Read(//Users/alfa/Documents/opencodeprojects/opencode-see-image/bun-types/**)",
|
|
13
13
|
"Bash(bun run *)",
|
|
14
14
|
"WebFetch(domain:docs.z.ai)",
|
|
15
|
-
"Bash(npm publish *)"
|
|
15
|
+
"Bash(npm publish *)",
|
|
16
|
+
"Bash(python3 -c ' *)",
|
|
17
|
+
"Bash(open -a Preview \"/Users/alfa/.claude/image-cache/31fd2007-9418-45bb-a3e5-d273327f5f78/3.png\" \"/Users/alfa/.claude/image-cache/31fd2007-9418-45bb-a3e5-d273327f5f78/4.png\" \"/Users/alfa/.claude/image-cache/31fd2007-9418-45bb-a3e5-d273327f5f78/5.png\")",
|
|
18
|
+
"Bash(ps -o etime= -p 82196)",
|
|
19
|
+
"Bash(echo \"STILL RUNNING \\($\\(ps -o etime= -p 82196)",
|
|
20
|
+
"Bash(awk '/export type TextPart = \\\\{/,/\\\\};/' node_modules/@opencode-ai/sdk/dist/gen/types.gen.d.t)",
|
|
21
|
+
"Bash(awk '{print $2, $9, $11, $12, $13}')",
|
|
22
|
+
"Bash(pkill -f \"[o]pencode run\")",
|
|
23
|
+
"Bash(pkill -f \"[o]pencode-run\")"
|
|
16
24
|
]
|
|
17
25
|
}
|
|
18
26
|
}
|
package/README.md
CHANGED
|
@@ -102,7 +102,13 @@ all settings are env-var overrides. The plugin uses opencode's SDK client by def
|
|
|
102
102
|
| `SEE_IMAGE_ENDPOINT` | `https://opencode.ai/zen/go/v1/messages` | HTTP endpoint (only used if `SEE_IMAGE_API_KEY` is set) |
|
|
103
103
|
| `SEE_IMAGE_API_VERSION` | `2023-06-01` | `anthropic-version` header (HTTP mode only) |
|
|
104
104
|
| `SEE_IMAGE_USER_AGENT` | _(Chrome UA)_ | User-Agent header (HTTP mode only) |
|
|
105
|
-
| `SEE_IMAGE_TIMEOUT` | `30000` |
|
|
105
|
+
| `SEE_IMAGE_TIMEOUT` | `30000` | Timeout in ms for session setup and HTTP-mode calls. |
|
|
106
|
+
| `SEE_IMAGE_STALL_TIMEOUT` | `60000` | Stall timeout in ms (SDK streaming). The call is only aborted if the vision model produces no new tokens for this long — so long transcriptions keep running as long as they're progressing. |
|
|
107
|
+
| `SEE_IMAGE_MAX_TIMEOUT` | `0` | Absolute cap in ms on a single streaming call. `0` = no cap. |
|
|
108
|
+
|
|
109
|
+
### streaming
|
|
110
|
+
|
|
111
|
+
On the SDK path the plugin streams the vision model's output and shows live progress in the tool call (`see_image: reading… N chars`). Instead of a hard timeout, it uses a **stall timeout** (`SEE_IMAGE_STALL_TIMEOUT`): a slow-but-progressing model (e.g. transcribing a huge table) runs to completion, while a genuinely hung call is still reaped. If a call is cut short, whatever was streamed so far is returned rather than nothing.
|
|
106
112
|
|
|
107
113
|
### using a different vision model
|
|
108
114
|
|
package/index.ts
CHANGED
|
@@ -12,6 +12,12 @@ const ENDPOINT =
|
|
|
12
12
|
const MODEL = process.env.SEE_IMAGE_MODEL || "minimax-m3"
|
|
13
13
|
const PROVIDER_ID = process.env.SEE_IMAGE_PROVIDER || "opencode-go"
|
|
14
14
|
const TIMEOUT = parseInt(process.env.SEE_IMAGE_TIMEOUT || "30000", 10)
|
|
15
|
+
// Stall timeout: while streaming, only abort if the model produces no new
|
|
16
|
+
// tokens for this long. Lets long transcriptions run as long as they keep
|
|
17
|
+
// progressing. Used for the SDK streaming path.
|
|
18
|
+
const STALL_TIMEOUT = parseInt(process.env.SEE_IMAGE_STALL_TIMEOUT || "60000", 10)
|
|
19
|
+
// Optional absolute cap on a single streaming call (0 = no cap).
|
|
20
|
+
const MAX_TIMEOUT = parseInt(process.env.SEE_IMAGE_MAX_TIMEOUT || "0", 10)
|
|
15
21
|
const API_VERSION = process.env.SEE_IMAGE_API_VERSION || "2023-06-01"
|
|
16
22
|
const USER_AGENT =
|
|
17
23
|
process.env.SEE_IMAGE_USER_AGENT ||
|
|
@@ -216,12 +222,15 @@ function readProviderKey(providerID: string): string | null {
|
|
|
216
222
|
}
|
|
217
223
|
}
|
|
218
224
|
|
|
225
|
+
type ProgressFn = (info: { chars: number; preview: string; provider: string; model: string }) => void
|
|
226
|
+
|
|
219
227
|
async function seeImageViaSDK(
|
|
220
228
|
client: any,
|
|
221
229
|
dataUrl: string,
|
|
222
230
|
mediaType: string,
|
|
223
231
|
prompt: string,
|
|
224
232
|
abort?: AbortSignal,
|
|
233
|
+
onProgress?: ProgressFn,
|
|
225
234
|
): Promise<{ text: string; model: string; provider: string }> {
|
|
226
235
|
const errors: string[] = []
|
|
227
236
|
|
|
@@ -288,6 +297,115 @@ async function seeImageViaSDK(
|
|
|
288
297
|
return null
|
|
289
298
|
}
|
|
290
299
|
|
|
300
|
+
// Stream a vision response from a paid/SDK provider. Subscribes to opencode's
|
|
301
|
+
// event stream so we can (a) surface live progress and (b) use a *stall*
|
|
302
|
+
// timeout — we only give up if the model goes quiet for STALL_TIMEOUT, so a
|
|
303
|
+
// long transcription keeps running as long as it's producing tokens. Returns
|
|
304
|
+
// whatever text was produced, even if a stall/abort cut it short (partial).
|
|
305
|
+
const streamCandidate = async (
|
|
306
|
+
providerID: string,
|
|
307
|
+
modelID: string,
|
|
308
|
+
): Promise<string | null> => {
|
|
309
|
+
const sessionRes = await Promise.race([
|
|
310
|
+
client.session.create({ body: {} }),
|
|
311
|
+
new Promise<never>((_, reject) =>
|
|
312
|
+
setTimeout(
|
|
313
|
+
() => reject(new Error(`session.create timed out after ${TIMEOUT}ms`)),
|
|
314
|
+
TIMEOUT,
|
|
315
|
+
),
|
|
316
|
+
),
|
|
317
|
+
])
|
|
318
|
+
const sessionID: string | undefined = sessionRes.data?.id
|
|
319
|
+
if (!sessionID) throw new Error("no session ID")
|
|
320
|
+
|
|
321
|
+
const controller = new AbortController()
|
|
322
|
+
const onAbort = () => controller.abort()
|
|
323
|
+
abort?.addEventListener("abort", onAbort)
|
|
324
|
+
|
|
325
|
+
// Subscribe to events before prompting so we don't miss early tokens.
|
|
326
|
+
let stream: AsyncGenerator<any> | undefined
|
|
327
|
+
try {
|
|
328
|
+
const sub = await client.event.subscribe()
|
|
329
|
+
stream = sub?.stream
|
|
330
|
+
} catch {}
|
|
331
|
+
|
|
332
|
+
const partsByID = new Map<string, string>()
|
|
333
|
+
let streamedText = ""
|
|
334
|
+
let lastActivity = Date.now()
|
|
335
|
+
let finished = false
|
|
336
|
+
|
|
337
|
+
const consume = (async () => {
|
|
338
|
+
if (!stream) return
|
|
339
|
+
try {
|
|
340
|
+
for await (const ev of stream) {
|
|
341
|
+
if (finished) break
|
|
342
|
+
if (
|
|
343
|
+
ev?.type === "message.part.updated" &&
|
|
344
|
+
ev.properties?.part?.type === "text" &&
|
|
345
|
+
ev.properties.part.sessionID === sessionID
|
|
346
|
+
) {
|
|
347
|
+
const p = ev.properties.part
|
|
348
|
+
partsByID.set(p.id, typeof p.text === "string" ? p.text : "")
|
|
349
|
+
streamedText = [...partsByID.values()].join("\n").trim()
|
|
350
|
+
lastActivity = Date.now()
|
|
351
|
+
onProgress?.({
|
|
352
|
+
chars: streamedText.length,
|
|
353
|
+
preview: streamedText.slice(-160),
|
|
354
|
+
provider: providerID,
|
|
355
|
+
model: modelID,
|
|
356
|
+
})
|
|
357
|
+
}
|
|
358
|
+
}
|
|
359
|
+
} catch {}
|
|
360
|
+
})()
|
|
361
|
+
|
|
362
|
+
// Stall watchdog (only when we actually have a stream to measure activity).
|
|
363
|
+
const stallTimer = stream
|
|
364
|
+
? setInterval(() => {
|
|
365
|
+
if (Date.now() - lastActivity > STALL_TIMEOUT) controller.abort()
|
|
366
|
+
}, 1000)
|
|
367
|
+
: undefined
|
|
368
|
+
const maxTimer =
|
|
369
|
+
MAX_TIMEOUT > 0 ? setTimeout(() => controller.abort(), MAX_TIMEOUT) : undefined
|
|
370
|
+
|
|
371
|
+
let res: any
|
|
372
|
+
try {
|
|
373
|
+
res = await client.session.prompt({
|
|
374
|
+
path: { id: sessionID },
|
|
375
|
+
body: {
|
|
376
|
+
model: { providerID, modelID },
|
|
377
|
+
parts: [
|
|
378
|
+
{ type: "file", mime: mediaType, url: dataUrl },
|
|
379
|
+
{ type: "text", text: prompt },
|
|
380
|
+
],
|
|
381
|
+
tools: {},
|
|
382
|
+
system:
|
|
383
|
+
"You are a vision assistant. Describe the image accurately and concisely. Answer with text only.",
|
|
384
|
+
},
|
|
385
|
+
signal: controller.signal,
|
|
386
|
+
})
|
|
387
|
+
} catch (e: any) {
|
|
388
|
+
// Aborted by stall/max/external — fall through to whatever we streamed.
|
|
389
|
+
if (!streamedText) throw e
|
|
390
|
+
} finally {
|
|
391
|
+
finished = true
|
|
392
|
+
if (stallTimer) clearInterval(stallTimer)
|
|
393
|
+
if (maxTimer) clearTimeout(maxTimer)
|
|
394
|
+
try { await stream?.return?.(undefined) } catch {}
|
|
395
|
+
abort?.removeEventListener("abort", onAbort)
|
|
396
|
+
client.session.delete({ path: { id: sessionID } }).catch(() => {})
|
|
397
|
+
}
|
|
398
|
+
|
|
399
|
+
const finalText = (res?.data?.parts ?? [])
|
|
400
|
+
.filter((p: any) => p.type === "text")
|
|
401
|
+
.map((p: any) => p.text)
|
|
402
|
+
.filter((t: any) => typeof t === "string" && t.length > 0)
|
|
403
|
+
.join("\n")
|
|
404
|
+
.trim()
|
|
405
|
+
|
|
406
|
+
return finalText || streamedText || null
|
|
407
|
+
}
|
|
408
|
+
|
|
291
409
|
let result: { text: string; model: string; provider: string } | undefined
|
|
292
410
|
|
|
293
411
|
try {
|
|
@@ -312,56 +430,8 @@ async function seeImageViaSDK(
|
|
|
312
430
|
continue
|
|
313
431
|
}
|
|
314
432
|
|
|
315
|
-
let sessionID: string | undefined
|
|
316
433
|
try {
|
|
317
|
-
const sessionRes = await Promise.race([
|
|
318
|
-
client.session.create({ body: {} }),
|
|
319
|
-
new Promise<never>((_, reject) =>
|
|
320
|
-
setTimeout(
|
|
321
|
-
() => reject(new Error(`session.create timed out after ${TIMEOUT}ms`)),
|
|
322
|
-
TIMEOUT,
|
|
323
|
-
),
|
|
324
|
-
),
|
|
325
|
-
])
|
|
326
|
-
sessionID = sessionRes.data?.id
|
|
327
|
-
if (!sessionID) {
|
|
328
|
-
errors.push(`${providerID}/${modelID}: no session ID`)
|
|
329
|
-
continue
|
|
330
|
-
}
|
|
331
|
-
|
|
332
|
-
const controller = new AbortController()
|
|
333
|
-
const onAbort = () => controller.abort()
|
|
334
|
-
abort?.addEventListener("abort", onAbort)
|
|
335
|
-
const timer = setTimeout(() => controller.abort(), TIMEOUT)
|
|
336
|
-
let res
|
|
337
|
-
try {
|
|
338
|
-
res = await client.session.prompt({
|
|
339
|
-
path: { id: sessionID },
|
|
340
|
-
body: {
|
|
341
|
-
model: { providerID, modelID },
|
|
342
|
-
parts: [
|
|
343
|
-
{ type: "file", mime: mediaType, url: dataUrl },
|
|
344
|
-
{ type: "text", text: prompt },
|
|
345
|
-
],
|
|
346
|
-
tools: {},
|
|
347
|
-
system:
|
|
348
|
-
"You are a vision assistant. Describe the image accurately and concisely. Answer with text only.",
|
|
349
|
-
},
|
|
350
|
-
signal: controller.signal,
|
|
351
|
-
})
|
|
352
|
-
} finally {
|
|
353
|
-
clearTimeout(timer)
|
|
354
|
-
abort?.removeEventListener("abort", onAbort)
|
|
355
|
-
}
|
|
356
|
-
|
|
357
|
-
const parts = res.data?.parts ?? []
|
|
358
|
-
const text = (parts as any[])
|
|
359
|
-
.filter((p: any) => p.type === "text")
|
|
360
|
-
.map((p: any) => p.text)
|
|
361
|
-
.filter((t: any) => typeof t === "string" && t.length > 0)
|
|
362
|
-
.join("\n")
|
|
363
|
-
.trim()
|
|
364
|
-
|
|
434
|
+
const text = await streamCandidate(providerID, modelID)
|
|
365
435
|
if (text) {
|
|
366
436
|
result = { text, model: modelID, provider: providerID }
|
|
367
437
|
break
|
|
@@ -369,12 +439,6 @@ async function seeImageViaSDK(
|
|
|
369
439
|
errors.push(`${providerID}/${modelID}: no text in response`)
|
|
370
440
|
} catch (e: any) {
|
|
371
441
|
errors.push(`${providerID}/${modelID}: ${e?.message ?? e}`)
|
|
372
|
-
} finally {
|
|
373
|
-
if (sessionID) {
|
|
374
|
-
await client.session
|
|
375
|
-
.delete({ path: { id: sessionID } })
|
|
376
|
-
.catch(() => {})
|
|
377
|
-
}
|
|
378
442
|
}
|
|
379
443
|
}
|
|
380
444
|
|
|
@@ -549,12 +613,25 @@ const SeeImagePlugin: Plugin = async (ctx) => {
|
|
|
549
613
|
const b64 = resolved.dataUrl.split(",")[1] || ""
|
|
550
614
|
result = await seeImageViaHTTP(b64, resolved.mediaType, prompt, context.abort)
|
|
551
615
|
} else {
|
|
616
|
+
// Throttle live progress updates so we don't spam the UI while the
|
|
617
|
+
// vision model streams a long response.
|
|
618
|
+
let lastUpdate = 0
|
|
619
|
+
const onProgress: ProgressFn = (info) => {
|
|
620
|
+
const now = Date.now()
|
|
621
|
+
if (now - lastUpdate < 400) return
|
|
622
|
+
lastUpdate = now
|
|
623
|
+
context.metadata({
|
|
624
|
+
title: `see_image: reading… ${info.chars} chars (${info.model})`,
|
|
625
|
+
metadata: { streaming: true, chars: info.chars, preview: info.preview },
|
|
626
|
+
})
|
|
627
|
+
}
|
|
552
628
|
result = await seeImageViaSDK(
|
|
553
629
|
client,
|
|
554
630
|
resolved.dataUrl,
|
|
555
631
|
resolved.mediaType,
|
|
556
632
|
prompt,
|
|
557
633
|
context.abort,
|
|
634
|
+
onProgress,
|
|
558
635
|
)
|
|
559
636
|
}
|
|
560
637
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "opencode-see-image",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.10.0",
|
|
4
4
|
"description": "Give non-vision opencode models the ability to see images/screenshots by routing them to a vision-capable model (MiniMax M3 via opencode-go by default).",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "index.ts",
|