opencode-see-image 0.9.3 → 0.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,7 +12,25 @@
12
12
  "Read(//Users/alfa/Documents/opencodeprojects/opencode-see-image/bun-types/**)",
13
13
  "Bash(bun run *)",
14
14
  "WebFetch(domain:docs.z.ai)",
15
- "Bash(npm publish *)"
15
+ "Bash(npm publish *)",
16
+ "Bash(python3 -c ' *)",
17
+ "Bash(open -a Preview \"/Users/alfa/.claude/image-cache/31fd2007-9418-45bb-a3e5-d273327f5f78/3.png\" \"/Users/alfa/.claude/image-cache/31fd2007-9418-45bb-a3e5-d273327f5f78/4.png\" \"/Users/alfa/.claude/image-cache/31fd2007-9418-45bb-a3e5-d273327f5f78/5.png\")",
18
+ "Bash(ps -o etime= -p 82196)",
19
+ "Bash(echo \"STILL RUNNING \\($\\(ps -o etime= -p 82196)",
20
+ "Bash(awk '/export type TextPart = \\\\{/,/\\\\};/' node_modules/@opencode-ai/sdk/dist/gen/types.gen.d.t)",
21
+ "Bash(awk '{print $2, $9, $11, $12, $13}')",
22
+ "Bash(pkill -f \"[o]pencode run\")",
23
+ "Bash(pkill -f \"[o]pencode-run\")",
24
+ "Bash(pkill -f \"seq 1 40\")",
25
+ "Bash(rm -f verify.json)",
26
+ "Bash(opencode run *)",
27
+ "Bash(echo \"exit=$? done=$\\(date +%T\\) bytes=$\\(wc -c < /tmp/verify.json\\)\")",
28
+ "Bash(pkill -9 -f \"opencode run\")",
29
+ "Bash(pkill -9 -f \"simple.json\\\\|verify.json\\\\|quick.json\\\\|strm\")",
30
+ "Bash(pkill -9 -f \"14.39.13\")",
31
+ "Bash(npm dist-tag *)",
32
+ "Bash(awk '{print $2, $11, $12, $13, $14}')",
33
+ "Bash(awk '/export type ToolState =/,/^};|^export \\(type|declare\\)/' sdk/dist/gen/types.gen.d.ts)"
16
34
  ]
17
35
  }
18
36
  }
package/README.md CHANGED
@@ -102,7 +102,15 @@ all settings are env-var overrides. The plugin uses opencode's SDK client by def
102
102
  | `SEE_IMAGE_ENDPOINT` | `https://opencode.ai/zen/go/v1/messages` | HTTP endpoint (only used if `SEE_IMAGE_API_KEY` is set) |
103
103
  | `SEE_IMAGE_API_VERSION` | `2023-06-01` | `anthropic-version` header (HTTP mode only) |
104
104
  | `SEE_IMAGE_USER_AGENT` | _(Chrome UA)_ | User-Agent header (HTTP mode only) |
105
- | `SEE_IMAGE_TIMEOUT` | `30000` | Per-candidate timeout in ms. Prevents hanging on slow models. |
105
+ | `SEE_IMAGE_TIMEOUT` | `30000` | Timeout in ms for session setup and HTTP-mode calls. |
106
+ | `SEE_IMAGE_STALL_TIMEOUT` | `60000` | Stall timeout in ms (SDK streaming). The call is only aborted if the vision model produces no new tokens for this long — so long transcriptions keep running as long as they're progressing. |
107
+ | `SEE_IMAGE_MAX_TIMEOUT` | `0` | Absolute cap in ms on a single vision call (SDK streaming or CLI fallback). `0` = no cap. |
108
+
109
+ ### live progress
110
+
111
+ While the vision model works, the tool call shows an animated heartbeat bar plus live status, e.g. `see_image ░▒▓█▓▒░ reading… 1240 chars · 7s · minimax-m3`. The char count and a preview of the latest text update as tokens stream in, so you can see it's alive and watch the description form.
112
+
113
+ The preferred path streams from the vision model via opencode's event stream and uses a **stall timeout** (`SEE_IMAGE_STALL_TIMEOUT`) instead of a hard cutoff: a slow-but-progressing model (e.g. transcribing a huge table) runs to completion, while a genuinely silent/hung call is reaped. If streaming isn't available, or a call fails or stalls before any text has streamed, the plugin falls back to a reliable non-streaming CLI call to the same model (full answer, no live preview), then to the free model. A call that is cut short after some text has already streamed returns that partial text instead of falling back.
106
114
 
107
115
  ### using a different vision model
108
116
 
package/index.ts CHANGED
@@ -12,6 +12,22 @@ const ENDPOINT =
12
12
  const MODEL = process.env.SEE_IMAGE_MODEL || "minimax-m3"
13
13
  const PROVIDER_ID = process.env.SEE_IMAGE_PROVIDER || "opencode-go"
14
14
  const TIMEOUT = parseInt(process.env.SEE_IMAGE_TIMEOUT || "30000", 10)
15
+ // Stall timeout (SDK streaming path): abort only if the model produces no new
16
+ // tokens for this long. A slow-but-progressing call keeps running.
17
+ const STALL_TIMEOUT = parseInt(process.env.SEE_IMAGE_STALL_TIMEOUT || "60000", 10)
18
+ // Optional absolute cap on a single vision call, in ms (0 = no cap).
19
+ const MAX_TIMEOUT = parseInt(process.env.SEE_IMAGE_MAX_TIMEOUT || "0", 10)
20
+
21
+ // Animated heartbeat: a flowing gradient wave shown in the tool title while we
22
+ // wait, so the user can see the call is alive and not frozen.
23
+ const HEARTBEAT_FRAMES = ["░", "▒", "▓", "█", "▓", "▒", "░"]
24
+ function heartbeatBar(tick: number, width = 14): string {
25
+ let s = ""
26
+ for (let i = 0; i < width; i++) {
27
+ s += HEARTBEAT_FRAMES[(i + tick) % HEARTBEAT_FRAMES.length]
28
+ }
29
+ return s
30
+ }
15
31
  const API_VERSION = process.env.SEE_IMAGE_API_VERSION || "2023-06-01"
16
32
  const USER_AGENT =
17
33
  process.env.SEE_IMAGE_USER_AGENT ||
@@ -216,12 +232,15 @@ function readProviderKey(providerID: string): string | null {
216
232
  }
217
233
  }
218
234
 
235
+ type ProgressFn = (info: { chars: number; preview: string; model: string }) => void
236
+
219
237
  async function seeImageViaSDK(
220
238
  client: any,
221
239
  dataUrl: string,
222
240
  mediaType: string,
223
241
  prompt: string,
224
242
  abort?: AbortSignal,
243
+ onProgress?: ProgressFn,
225
244
  ): Promise<{ text: string; model: string; provider: string }> {
226
245
  const errors: string[] = []
227
246
 
@@ -245,13 +264,147 @@ async function seeImageViaSDK(
245
264
  return tmpPath
246
265
  }
247
266
 
248
- // For free opencode models, use CLI instead of SDK (SDK returns empty).
249
- // Use Bun.spawn (not $) so we get a killable handle: Bun's $ ShellPromise
250
- // has no .kill(), so racing it against a timeout would leak the process.
251
- // We kill the child on both timeout and external abort.
252
- const freeFallback = async (modelID: string, userPrompt: string): Promise<string | null> => {
267
+ // Two runners back the candidate list:
268
+ //
269
+ // streamViaSDK subscribes to opencode's event stream so we get text
270
+ // token-by-token. This drives the live content preview AND token-based
271
+ // stall detection (abort only after STALL_TIMEOUT of silence). It also
272
+ // races the prompt against a stall/max rejection, so a hung call can't
273
+ // block past the stall window even if the abort signal is ignored. Only
274
+ // used when an event stream is actually available (its whole point).
275
+ //
276
+ // runViaCLI — `opencode run -m <provider>/<model>` via Bun.spawn (killable).
277
+ // The proven, reliable fallback. It buffers --format json output until
278
+ // exit, so it gives no live preview, but it returns the full answer.
279
+ const streamViaSDK = async (
280
+ providerID: string,
281
+ modelID: string,
282
+ ): Promise<string | null> => {
283
+ const sessionRes = await Promise.race([
284
+ client.session.create({ body: {} }),
285
+ new Promise<never>((_, reject) =>
286
+ setTimeout(
287
+ () => reject(new Error(`session.create timed out after ${TIMEOUT}ms`)),
288
+ TIMEOUT,
289
+ ),
290
+ ),
291
+ ])
292
+ const sessionID: string | undefined = sessionRes.data?.id
293
+ if (!sessionID) throw new Error("no session ID")
294
+
295
+ const cleanupSession = () =>
296
+ client.session.delete({ path: { id: sessionID } }).catch(() => {})
297
+
298
+ // The SDK path exists for the live preview; if we can't get an event
299
+ // stream there's nothing to preview or to measure stalls against, so bail
300
+ // and let the loop fall through to the reliable CLI runner.
301
+ let stream: AsyncGenerator<any> | undefined
302
+ try {
303
+ stream = (await client.event.subscribe())?.stream
304
+ } catch {}
305
+ if (!stream) {
306
+ cleanupSession()
307
+ return null
308
+ }
309
+
310
+ const controller = new AbortController()
311
+ const onAbort = () => controller.abort()
312
+ abort?.addEventListener("abort", onAbort)
313
+
314
+ const partsByID = new Map<string, string>()
315
+ let streamedText = ""
316
+ let lastActivity = Date.now()
317
+ let finished = false
318
+
319
+ const consume = (async () => {
320
+ try {
321
+ for await (const ev of stream!) {
322
+ if (finished) break
323
+ const p = ev?.properties?.part
324
+ if (
325
+ ev?.type === "message.part.updated" &&
326
+ p?.type === "text" &&
327
+ p.sessionID === sessionID
328
+ ) {
329
+ partsByID.set(p.id, typeof p.text === "string" ? p.text : "")
330
+ streamedText = [...partsByID.values()].join("\n").trim()
331
+ lastActivity = Date.now()
332
+ onProgress?.({
333
+ chars: streamedText.length,
334
+ preview: streamedText.slice(-200),
335
+ model: modelID,
336
+ })
337
+ }
338
+ }
339
+ } catch {}
340
+ })()
341
+
342
+ let stallTimer: ReturnType<typeof setInterval> | undefined
343
+ let maxTimer: ReturnType<typeof setTimeout> | undefined
344
+ const guard = new Promise<never>((_, reject) => {
345
+ stallTimer = setInterval(() => {
346
+ if (Date.now() - lastActivity > STALL_TIMEOUT) {
347
+ controller.abort()
348
+ reject(new Error(`stalled: no tokens for ${STALL_TIMEOUT}ms`))
349
+ }
350
+ }, 1000)
351
+ if (MAX_TIMEOUT > 0) {
352
+ maxTimer = setTimeout(() => {
353
+ controller.abort()
354
+ reject(new Error(`exceeded MAX_TIMEOUT ${MAX_TIMEOUT}ms`))
355
+ }, MAX_TIMEOUT)
356
+ }
357
+ })
358
+
359
+ let res: any
360
+ try {
361
+ res = await Promise.race([
362
+ client.session.prompt({
363
+ path: { id: sessionID },
364
+ body: {
365
+ model: { providerID, modelID },
366
+ parts: [
367
+ { type: "file", mime: mediaType, url: dataUrl },
368
+ { type: "text", text: prompt },
369
+ ],
370
+ tools: {},
371
+ system:
372
+ "You are a vision assistant. Describe the image accurately and concisely. Answer with text only.",
373
+ },
374
+ signal: controller.signal,
375
+ }),
376
+ guard,
377
+ ])
378
+ } catch (e: any) {
379
+ // Stalled / aborted / errored — keep whatever streamed in so far.
380
+ if (!streamedText) throw e
381
+ } finally {
382
+ finished = true
383
+ if (stallTimer) clearInterval(stallTimer)
384
+ if (maxTimer) clearTimeout(maxTimer)
385
+ try { await stream.return?.(undefined) } catch {}
386
+ abort?.removeEventListener("abort", onAbort)
387
+ cleanupSession()
388
+ }
389
+
390
+ const finalText = (res?.data?.parts ?? [])
391
+ .filter((p: any) => p.type === "text")
392
+ .map((p: any) => p.text)
393
+ .filter((t: any) => typeof t === "string" && t.length > 0)
394
+ .join("\n")
395
+ .trim()
396
+
397
+ return finalText || streamedText || null
398
+ }
399
+
400
+ const runViaCLI = async (
401
+ providerID: string,
402
+ modelID: string,
403
+ ): Promise<string | null> => {
253
404
  const filePath = ensureTmpFile()
254
405
  if (!filePath) return null
406
+ onProgress?.({ chars: 0, preview: "", model: modelID })
407
+
255
408
  const proc = Bun.spawn(
256
409
  [
257
410
  "opencode",
@@ -259,122 +412,72 @@ async function seeImageViaSDK(
259
412
  "-f",
260
413
  filePath,
261
414
  "-m",
262
- `opencode/${modelID}`,
263
- userPrompt,
415
+ `${providerID}/${modelID}`,
416
+ prompt,
264
417
  "--format",
265
418
  "json",
266
419
  "--dangerously-skip-permissions",
267
420
  ],
268
421
  { stdout: "pipe", stderr: "ignore" },
269
422
  )
270
- const timer = setTimeout(() => proc.kill(), TIMEOUT)
271
423
  const onAbort = () => proc.kill()
272
424
  abort?.addEventListener("abort", onAbort)
425
+ const maxTimer =
426
+ MAX_TIMEOUT > 0 ? setTimeout(() => proc.kill(), MAX_TIMEOUT) : undefined
427
+
273
428
  try {
274
429
  const out = await new Response(proc.stdout).text()
275
430
  await proc.exited
431
+ const parts = new Map<string, string>()
276
432
  for (const line of out.split("\n").filter(Boolean)) {
277
433
  try {
278
- const parsed = JSON.parse(line)
279
- if (parsed?.part?.type === "text" && parsed?.part?.text) {
280
- return parsed.part.text
434
+ const p = JSON.parse(line)?.part
435
+ if (p?.type === "text" && typeof p.text === "string") {
436
+ parts.set(p.id ?? String(parts.size), p.text)
281
437
  }
282
438
  } catch {}
283
439
  }
284
- } catch {} finally {
285
- clearTimeout(timer)
440
+ return [...parts.values()].join("\n").trim() || null
441
+ } catch {
442
+ return null
443
+ } finally {
444
+ if (maxTimer) clearTimeout(maxTimer)
286
445
  abort?.removeEventListener("abort", onAbort)
287
446
  }
288
- return null
289
447
  }
290
448
 
291
449
  let result: { text: string; model: string; provider: string } | undefined
292
450
 
293
451
  try {
294
- const candidates: Array<{ providerID: string; modelID: string }> = []
452
+ const candidates: Array<{
453
+ providerID: string
454
+ modelID: string
455
+ mode: "sdk" | "cli"
456
+ }> = []
295
457
  const envProvider = process.env.SEE_IMAGE_PROVIDER
296
458
  const envModel = process.env.SEE_IMAGE_MODEL
297
459
  if (envProvider && envModel) {
298
- candidates.push({ providerID: envProvider, modelID: envModel })
460
+ candidates.push({ providerID: envProvider, modelID: envModel, mode: "sdk" })
299
461
  }
300
- candidates.push({ providerID: "opencode-go", modelID: "minimax-m3" })
301
- candidates.push({ providerID: "opencode", modelID: "mimo-v2.5-free" })
302
-
303
- for (const { providerID, modelID } of candidates) {
304
- if (providerID === "opencode") {
305
- // SDK session.prompt returns empty for free models; use CLI instead
306
- const text = await freeFallback(modelID, prompt)
307
- if (text) {
308
- result = { text, model: modelID, provider: providerID }
309
- break
310
- }
311
- errors.push(`${providerID}/${modelID}: no text from CLI fallback`)
312
- continue
313
- }
462
+ // Prefer streaming minimax (live preview); fall back to the same model via
463
+ // the proven CLI runner; then the free model via CLI.
464
+ candidates.push({ providerID: "opencode-go", modelID: "minimax-m3", mode: "sdk" })
465
+ candidates.push({ providerID: "opencode-go", modelID: "minimax-m3", mode: "cli" })
466
+ candidates.push({ providerID: "opencode", modelID: "mimo-v2.5-free", mode: "cli" })
314
467
 
315
- let sessionID: string | undefined
468
+ for (const { providerID, modelID, mode } of candidates) {
316
469
  try {
317
- const sessionRes = await Promise.race([
318
- client.session.create({ body: {} }),
319
- new Promise<never>((_, reject) =>
320
- setTimeout(
321
- () => reject(new Error(`session.create timed out after ${TIMEOUT}ms`)),
322
- TIMEOUT,
323
- ),
324
- ),
325
- ])
326
- sessionID = sessionRes.data?.id
327
- if (!sessionID) {
328
- errors.push(`${providerID}/${modelID}: no session ID`)
329
- continue
330
- }
331
-
332
- const controller = new AbortController()
333
- const onAbort = () => controller.abort()
334
- abort?.addEventListener("abort", onAbort)
335
- const timer = setTimeout(() => controller.abort(), TIMEOUT)
336
- let res
337
- try {
338
- res = await client.session.prompt({
339
- path: { id: sessionID },
340
- body: {
341
- model: { providerID, modelID },
342
- parts: [
343
- { type: "file", mime: mediaType, url: dataUrl },
344
- { type: "text", text: prompt },
345
- ],
346
- tools: {},
347
- system:
348
- "You are a vision assistant. Describe the image accurately and concisely. Answer with text only.",
349
- },
350
- signal: controller.signal,
351
- })
352
- } finally {
353
- clearTimeout(timer)
354
- abort?.removeEventListener("abort", onAbort)
355
- }
356
-
357
- const parts = res.data?.parts ?? []
358
- const text = (parts as any[])
359
- .filter((p: any) => p.type === "text")
360
- .map((p: any) => p.text)
361
- .filter((t: any) => typeof t === "string" && t.length > 0)
362
- .join("\n")
363
- .trim()
364
-
470
+ const text =
471
+ mode === "sdk"
472
+ ? await streamViaSDK(providerID, modelID)
473
+ : await runViaCLI(providerID, modelID)
365
474
  if (text) {
366
475
  result = { text, model: modelID, provider: providerID }
367
476
  break
368
477
  }
369
- errors.push(`${providerID}/${modelID}: no text in response`)
478
+ errors.push(`${providerID}/${modelID} (${mode}): no text`)
370
479
  } catch (e: any) {
371
- errors.push(`${providerID}/${modelID}: ${e?.message ?? e}`)
372
- } finally {
373
- if (sessionID) {
374
- await client.session
375
- .delete({ path: { id: sessionID } })
376
- .catch(() => {})
377
- }
480
+ errors.push(`${providerID}/${modelID} (${mode}): ${e?.message ?? e}`)
378
481
  }
379
482
  }
380
483
 
@@ -545,17 +648,57 @@ const SeeImagePlugin: Plugin = async (ctx) => {
545
648
 
546
649
  let result: { text: string; model: string; provider: string }
547
650
 
548
- if (process.env.SEE_IMAGE_API_KEY) {
549
- const b64 = resolved.dataUrl.split(",")[1] || ""
550
- result = await seeImageViaHTTP(b64, resolved.mediaType, prompt, context.abort)
551
- } else {
552
- result = await seeImageViaSDK(
553
- client,
554
- resolved.dataUrl,
555
- resolved.mediaType,
556
- prompt,
557
- context.abort,
558
- )
651
+ // Live feedback while we wait: an animated heartbeat bar plus, once the
652
+ // vision model starts streaming, a growing char count and a preview of
653
+ // the latest text. The timer ticks independently so the bar animates
654
+ // even before any tokens arrive; onProgress feeds it streamed content.
655
+ const started = Date.now()
656
+ let tick = 0
657
+ const live = { chars: 0, preview: "", model: "" }
658
+ const onProgress: ProgressFn = (info) => {
659
+ live.chars = info.chars
660
+ live.preview = info.preview
661
+ if (info.model) live.model = info.model
662
+ }
663
+ const render = () => {
664
+ const secs = Math.round((Date.now() - started) / 1000)
665
+ const bar = heartbeatBar(++tick)
666
+ const label = live.chars > 0 ? `reading… ${live.chars} chars` : "looking…"
667
+ const model = live.model ? ` · ${live.model}` : ""
668
+ context.metadata({
669
+ title: `see_image ${bar} ${label} · ${secs}s${model}`,
670
+ metadata: {
671
+ elapsedSeconds: secs,
672
+ chars: live.chars,
673
+ preview: live.preview,
674
+ model: live.model,
675
+ },
676
+ })
677
+ }
678
+ render()
679
+ const heartbeat = setInterval(render, 500)
680
+
681
+ try {
682
+ if (process.env.SEE_IMAGE_API_KEY) {
683
+ const b64 = resolved.dataUrl.split(",")[1] || ""
684
+ result = await seeImageViaHTTP(
685
+ b64,
686
+ resolved.mediaType,
687
+ prompt,
688
+ context.abort,
689
+ )
690
+ } else {
691
+ result = await seeImageViaSDK(
692
+ client,
693
+ resolved.dataUrl,
694
+ resolved.mediaType,
695
+ prompt,
696
+ context.abort,
697
+ onProgress,
698
+ )
699
+ }
700
+ } finally {
701
+ clearInterval(heartbeat)
559
702
  }
560
703
 
561
704
  context.metadata({
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "opencode-see-image",
3
- "version": "0.9.3",
3
+ "version": "0.10.1",
4
4
  "description": "Give non-vision opencode models the ability to see images/screenshots by routing them to a vision-capable model (MiniMax M3 via opencode-go by default).",
5
5
  "type": "module",
6
6
  "main": "index.ts",