opencode-see-image 0.10.0 → 0.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -20,7 +20,17 @@
20
20
  "Bash(awk '/export type TextPart = \\\\{/,/\\\\};/' node_modules/@opencode-ai/sdk/dist/gen/types.gen.d.t)",
21
21
  "Bash(awk '{print $2, $9, $11, $12, $13}')",
22
22
  "Bash(pkill -f \"[o]pencode run\")",
23
- "Bash(pkill -f \"[o]pencode-run\")"
23
+ "Bash(pkill -f \"[o]pencode-run\")",
24
+ "Bash(pkill -f \"seq 1 40\")",
25
+ "Bash(rm -f verify.json)",
26
+ "Bash(opencode run *)",
27
+ "Bash(echo \"exit=$? done=$\\(date +%T\\) bytes=$\\(wc -c < /tmp/verify.json\\)\")",
28
+ "Bash(pkill -9 -f \"opencode run\")",
29
+ "Bash(pkill -9 -f \"simple.json\\\\|verify.json\\\\|quick.json\\\\|strm\")",
30
+ "Bash(pkill -9 -f \"14.39.13\")",
31
+ "Bash(npm dist-tag *)",
32
+ "Bash(awk '{print $2, $11, $12, $13, $14}')",
33
+ "Bash(awk '/export type ToolState =/,/^};|^export \\(type|declare\\)/' sdk/dist/gen/types.gen.d.ts)"
24
34
  ]
25
35
  }
26
36
  }
package/README.md CHANGED
@@ -106,9 +106,11 @@ all settings are env-var overrides. The plugin uses opencode's SDK client by def
106
106
  | `SEE_IMAGE_STALL_TIMEOUT` | `60000` | Stall timeout in ms (SDK streaming). The call is only aborted if the vision model produces no new tokens for this long — so long transcriptions keep running as long as they're progressing. |
107
107
  | `SEE_IMAGE_MAX_TIMEOUT` | `0` | Absolute cap in ms on a single streaming call. `0` = no cap. |
108
108
 
109
- ### streaming
109
+ ### live progress
110
110
 
111
- On the SDK path the plugin streams the vision model's output and shows live progress in the tool call (`see_image: reading… N chars`). Instead of a hard timeout, it uses a **stall timeout** (`SEE_IMAGE_STALL_TIMEOUT`): a slow-but-progressing model (e.g. transcribing a huge table) runs to completion, while a genuinely hung call is still reaped. If a call is cut short, whatever was streamed so far is returned rather than nothing.
111
+ While the vision model works, the tool call shows an animated heartbeat bar plus live status, e.g. `see_image ░▒▓█▓▒░ reading… 1240 chars · 7s · minimax-m3`. The char count and a preview of the latest text update as tokens stream in, so you can see it's alive and watch the description form.
112
+
113
+ The preferred path streams from the vision model via opencode's event stream and uses a **stall timeout** (`SEE_IMAGE_STALL_TIMEOUT`) instead of a hard cutoff: a slow-but-progressing model (e.g. transcribing a huge table) runs to completion, while a genuinely silent/hung call is reaped. If streaming isn't available or a call is cut short, the plugin falls back to a reliable non-streaming CLI call to the same model (full answer, no live preview), then to the free model.
112
114
 
113
115
  ### using a different vision model
114
116
 
package/index.ts CHANGED
@@ -12,12 +12,22 @@ const ENDPOINT =
12
12
  const MODEL = process.env.SEE_IMAGE_MODEL || "minimax-m3"
13
13
  const PROVIDER_ID = process.env.SEE_IMAGE_PROVIDER || "opencode-go"
14
14
  const TIMEOUT = parseInt(process.env.SEE_IMAGE_TIMEOUT || "30000", 10)
15
- // Stall timeout: while streaming, only abort if the model produces no new
16
- // tokens for this long. Lets long transcriptions run as long as they keep
17
- // progressing. Used for the SDK streaming path.
15
+ // Stall timeout (SDK streaming path): abort only if the model produces no new
16
+ // tokens for this long. A slow-but-progressing call keeps running.
18
17
  const STALL_TIMEOUT = parseInt(process.env.SEE_IMAGE_STALL_TIMEOUT || "60000", 10)
19
- // Optional absolute cap on a single streaming call (0 = no cap).
18
+ // Optional absolute cap on a single vision call, in ms (0 = no cap).
20
19
  const MAX_TIMEOUT = parseInt(process.env.SEE_IMAGE_MAX_TIMEOUT || "0", 10)
20
+
21
+ // Animated heartbeat: a flowing gradient wave shown in the tool title while we
22
+ // wait, so the user can see the call is alive and not frozen.
23
+ const HEARTBEAT_FRAMES = ["░", "▒", "▓", "█", "▓", "▒", "░"]
24
+ function heartbeatBar(tick: number, width = 14): string {
25
+ let s = ""
26
+ for (let i = 0; i < width; i++) {
27
+ s += HEARTBEAT_FRAMES[(i + tick) % HEARTBEAT_FRAMES.length]
28
+ }
29
+ return s
30
+ }
21
31
  const API_VERSION = process.env.SEE_IMAGE_API_VERSION || "2023-06-01"
22
32
  const USER_AGENT =
23
33
  process.env.SEE_IMAGE_USER_AGENT ||
@@ -222,7 +232,7 @@ function readProviderKey(providerID: string): string | null {
222
232
  }
223
233
  }
224
234
 
225
- type ProgressFn = (info: { chars: number; preview: string; provider: string; model: string }) => void
235
+ type ProgressFn = (info: { chars: number; preview: string; model: string }) => void
226
236
 
227
237
  async function seeImageViaSDK(
228
238
  client: any,
@@ -254,55 +264,19 @@ async function seeImageViaSDK(
254
264
  return tmpPath
255
265
  }
256
266
 
257
- // For free opencode models, use CLI instead of SDK (SDK returns empty).
258
- // Use Bun.spawn (not $) so we get a killable handle: Bun's $ ShellPromise
259
- // has no .kill(), so racing it against a timeout would leak the process.
260
- // We kill the child on both timeout and external abort.
261
- const freeFallback = async (modelID: string, userPrompt: string): Promise<string | null> => {
262
- const filePath = ensureTmpFile()
263
- if (!filePath) return null
264
- const proc = Bun.spawn(
265
- [
266
- "opencode",
267
- "run",
268
- "-f",
269
- filePath,
270
- "-m",
271
- `opencode/${modelID}`,
272
- userPrompt,
273
- "--format",
274
- "json",
275
- "--dangerously-skip-permissions",
276
- ],
277
- { stdout: "pipe", stderr: "ignore" },
278
- )
279
- const timer = setTimeout(() => proc.kill(), TIMEOUT)
280
- const onAbort = () => proc.kill()
281
- abort?.addEventListener("abort", onAbort)
282
- try {
283
- const out = await new Response(proc.stdout).text()
284
- await proc.exited
285
- for (const line of out.split("\n").filter(Boolean)) {
286
- try {
287
- const parsed = JSON.parse(line)
288
- if (parsed?.part?.type === "text" && parsed?.part?.text) {
289
- return parsed.part.text
290
- }
291
- } catch {}
292
- }
293
- } catch {} finally {
294
- clearTimeout(timer)
295
- abort?.removeEventListener("abort", onAbort)
296
- }
297
- return null
298
- }
299
-
300
- // Stream a vision response from a paid/SDK provider. Subscribes to opencode's
301
- // event stream so we can (a) surface live progress and (b) use a *stall*
302
- // timeout — we only give up if the model goes quiet for STALL_TIMEOUT, so a
303
- // long transcription keeps running as long as it's producing tokens. Returns
304
- // whatever text was produced, even if a stall/abort cut it short (partial).
305
- const streamCandidate = async (
267
+ // Two runners back the candidate list:
268
+ //
269
+ // streamViaSDK subscribes to opencode's event stream so we get text
270
+ // token-by-token. This drives the live content preview AND token-based
271
+ // stall detection (abort only after STALL_TIMEOUT of silence). It also
272
+ // races the prompt against a stall/max rejection, so a hung call can't
273
+ // block past the stall window even if the abort signal is ignored. Only
274
+ // used when an event stream is actually available (its whole point).
275
+ //
276
+ // runViaCLI — `opencode run -m <provider>/<model>` via Bun.spawn (killable).
277
+ // The proven, reliable fallback. It buffers --format json output until
278
+ // exit, so it gives no live preview, but it returns the full answer.
279
+ const streamViaSDK = async (
306
280
  providerID: string,
307
281
  modelID: string,
308
282
  ): Promise<string | null> => {
@@ -318,16 +292,24 @@ async function seeImageViaSDK(
318
292
  const sessionID: string | undefined = sessionRes.data?.id
319
293
  if (!sessionID) throw new Error("no session ID")
320
294
 
321
- const controller = new AbortController()
322
- const onAbort = () => controller.abort()
323
- abort?.addEventListener("abort", onAbort)
295
+ const cleanupSession = () =>
296
+ client.session.delete({ path: { id: sessionID } }).catch(() => {})
324
297
 
325
- // Subscribe to events before prompting so we don't miss early tokens.
298
+ // The SDK path exists for the live preview; if we can't get an event
299
+ // stream there's nothing to preview or to measure stalls against, so bail
300
+ // and let the loop fall through to the reliable CLI runner.
326
301
  let stream: AsyncGenerator<any> | undefined
327
302
  try {
328
- const sub = await client.event.subscribe()
329
- stream = sub?.stream
303
+ stream = (await client.event.subscribe())?.stream
330
304
  } catch {}
305
+ if (!stream) {
306
+ cleanupSession()
307
+ return null
308
+ }
309
+
310
+ const controller = new AbortController()
311
+ const onAbort = () => controller.abort()
312
+ abort?.addEventListener("abort", onAbort)
331
313
 
332
314
  const partsByID = new Map<string, string>()
333
315
  let streamedText = ""
@@ -335,23 +317,21 @@ async function seeImageViaSDK(
335
317
  let finished = false
336
318
 
337
319
  const consume = (async () => {
338
- if (!stream) return
339
320
  try {
340
- for await (const ev of stream) {
321
+ for await (const ev of stream!) {
341
322
  if (finished) break
323
+ const p = ev?.properties?.part
342
324
  if (
343
325
  ev?.type === "message.part.updated" &&
344
- ev.properties?.part?.type === "text" &&
345
- ev.properties.part.sessionID === sessionID
326
+ p?.type === "text" &&
327
+ p.sessionID === sessionID
346
328
  ) {
347
- const p = ev.properties.part
348
329
  partsByID.set(p.id, typeof p.text === "string" ? p.text : "")
349
330
  streamedText = [...partsByID.values()].join("\n").trim()
350
331
  lastActivity = Date.now()
351
332
  onProgress?.({
352
333
  chars: streamedText.length,
353
- preview: streamedText.slice(-160),
354
- provider: providerID,
334
+ preview: streamedText.slice(-200),
355
335
  model: modelID,
356
336
  })
357
337
  }
@@ -359,41 +339,52 @@ async function seeImageViaSDK(
359
339
  } catch {}
360
340
  })()
361
341
 
362
- // Stall watchdog (only when we actually have a stream to measure activity).
363
- const stallTimer = stream
364
- ? setInterval(() => {
365
- if (Date.now() - lastActivity > STALL_TIMEOUT) controller.abort()
366
- }, 1000)
367
- : undefined
368
- const maxTimer =
369
- MAX_TIMEOUT > 0 ? setTimeout(() => controller.abort(), MAX_TIMEOUT) : undefined
342
+ let stallTimer: ReturnType<typeof setInterval> | undefined
343
+ let maxTimer: ReturnType<typeof setTimeout> | undefined
344
+ const guard = new Promise<never>((_, reject) => {
345
+ stallTimer = setInterval(() => {
346
+ if (Date.now() - lastActivity > STALL_TIMEOUT) {
347
+ controller.abort()
348
+ reject(new Error(`stalled: no tokens for ${STALL_TIMEOUT}ms`))
349
+ }
350
+ }, 1000)
351
+ if (MAX_TIMEOUT > 0) {
352
+ maxTimer = setTimeout(() => {
353
+ controller.abort()
354
+ reject(new Error(`exceeded MAX_TIMEOUT ${MAX_TIMEOUT}ms`))
355
+ }, MAX_TIMEOUT)
356
+ }
357
+ })
370
358
 
371
359
  let res: any
372
360
  try {
373
- res = await client.session.prompt({
374
- path: { id: sessionID },
375
- body: {
376
- model: { providerID, modelID },
377
- parts: [
378
- { type: "file", mime: mediaType, url: dataUrl },
379
- { type: "text", text: prompt },
380
- ],
381
- tools: {},
382
- system:
383
- "You are a vision assistant. Describe the image accurately and concisely. Answer with text only.",
384
- },
385
- signal: controller.signal,
386
- })
361
+ res = await Promise.race([
362
+ client.session.prompt({
363
+ path: { id: sessionID },
364
+ body: {
365
+ model: { providerID, modelID },
366
+ parts: [
367
+ { type: "file", mime: mediaType, url: dataUrl },
368
+ { type: "text", text: prompt },
369
+ ],
370
+ tools: {},
371
+ system:
372
+ "You are a vision assistant. Describe the image accurately and concisely. Answer with text only.",
373
+ },
374
+ signal: controller.signal,
375
+ }),
376
+ guard,
377
+ ])
387
378
  } catch (e: any) {
388
- // Aborted by stall/max/external — fall through to whatever we streamed.
379
+ // Stalled / aborted / errored — keep whatever streamed in so far.
389
380
  if (!streamedText) throw e
390
381
  } finally {
391
382
  finished = true
392
383
  if (stallTimer) clearInterval(stallTimer)
393
384
  if (maxTimer) clearTimeout(maxTimer)
394
- try { await stream?.return?.(undefined) } catch {}
385
+ try { await stream.return?.(undefined) } catch {}
395
386
  abort?.removeEventListener("abort", onAbort)
396
- client.session.delete({ path: { id: sessionID } }).catch(() => {})
387
+ cleanupSession()
397
388
  }
398
389
 
399
390
  const finalText = (res?.data?.parts ?? [])
@@ -406,39 +397,87 @@ async function seeImageViaSDK(
406
397
  return finalText || streamedText || null
407
398
  }
408
399
 
400
+ const runViaCLI = async (
401
+ providerID: string,
402
+ modelID: string,
403
+ ): Promise<string | null> => {
404
+ const filePath = ensureTmpFile()
405
+ if (!filePath) return null
406
+ onProgress?.({ chars: 0, preview: "", model: modelID })
407
+
408
+ const proc = Bun.spawn(
409
+ [
410
+ "opencode",
411
+ "run",
412
+ "-f",
413
+ filePath,
414
+ "-m",
415
+ `${providerID}/${modelID}`,
416
+ prompt,
417
+ "--format",
418
+ "json",
419
+ "--dangerously-skip-permissions",
420
+ ],
421
+ { stdout: "pipe", stderr: "ignore" },
422
+ )
423
+ const onAbort = () => proc.kill()
424
+ abort?.addEventListener("abort", onAbort)
425
+ const maxTimer =
426
+ MAX_TIMEOUT > 0 ? setTimeout(() => proc.kill(), MAX_TIMEOUT) : undefined
427
+
428
+ try {
429
+ const out = await new Response(proc.stdout).text()
430
+ await proc.exited
431
+ const parts = new Map<string, string>()
432
+ for (const line of out.split("\n").filter(Boolean)) {
433
+ try {
434
+ const p = JSON.parse(line)?.part
435
+ if (p?.type === "text" && typeof p.text === "string") {
436
+ parts.set(p.id ?? String(parts.size), p.text)
437
+ }
438
+ } catch {}
439
+ }
440
+ return [...parts.values()].join("\n").trim() || null
441
+ } catch {
442
+ return null
443
+ } finally {
444
+ if (maxTimer) clearTimeout(maxTimer)
445
+ abort?.removeEventListener("abort", onAbort)
446
+ }
447
+ }
448
+
409
449
  let result: { text: string; model: string; provider: string } | undefined
410
450
 
411
451
  try {
412
- const candidates: Array<{ providerID: string; modelID: string }> = []
452
+ const candidates: Array<{
453
+ providerID: string
454
+ modelID: string
455
+ mode: "sdk" | "cli"
456
+ }> = []
413
457
  const envProvider = process.env.SEE_IMAGE_PROVIDER
414
458
  const envModel = process.env.SEE_IMAGE_MODEL
415
459
  if (envProvider && envModel) {
416
- candidates.push({ providerID: envProvider, modelID: envModel })
460
+ candidates.push({ providerID: envProvider, modelID: envModel, mode: "sdk" })
417
461
  }
418
- candidates.push({ providerID: "opencode-go", modelID: "minimax-m3" })
419
- candidates.push({ providerID: "opencode", modelID: "mimo-v2.5-free" })
420
-
421
- for (const { providerID, modelID } of candidates) {
422
- if (providerID === "opencode") {
423
- // SDK session.prompt returns empty for free models; use CLI instead
424
- const text = await freeFallback(modelID, prompt)
425
- if (text) {
426
- result = { text, model: modelID, provider: providerID }
427
- break
428
- }
429
- errors.push(`${providerID}/${modelID}: no text from CLI fallback`)
430
- continue
431
- }
462
+ // Prefer streaming minimax (live preview); fall back to the same model via
463
+ // the proven CLI runner; then the free model via CLI.
464
+ candidates.push({ providerID: "opencode-go", modelID: "minimax-m3", mode: "sdk" })
465
+ candidates.push({ providerID: "opencode-go", modelID: "minimax-m3", mode: "cli" })
466
+ candidates.push({ providerID: "opencode", modelID: "mimo-v2.5-free", mode: "cli" })
432
467
 
468
+ for (const { providerID, modelID, mode } of candidates) {
433
469
  try {
434
- const text = await streamCandidate(providerID, modelID)
470
+ const text =
471
+ mode === "sdk"
472
+ ? await streamViaSDK(providerID, modelID)
473
+ : await runViaCLI(providerID, modelID)
435
474
  if (text) {
436
475
  result = { text, model: modelID, provider: providerID }
437
476
  break
438
477
  }
439
- errors.push(`${providerID}/${modelID}: no text in response`)
478
+ errors.push(`${providerID}/${modelID} (${mode}): no text`)
440
479
  } catch (e: any) {
441
- errors.push(`${providerID}/${modelID}: ${e?.message ?? e}`)
480
+ errors.push(`${providerID}/${modelID} (${mode}): ${e?.message ?? e}`)
442
481
  }
443
482
  }
444
483
 
@@ -609,30 +648,57 @@ const SeeImagePlugin: Plugin = async (ctx) => {
609
648
 
610
649
  let result: { text: string; model: string; provider: string }
611
650
 
612
- if (process.env.SEE_IMAGE_API_KEY) {
613
- const b64 = resolved.dataUrl.split(",")[1] || ""
614
- result = await seeImageViaHTTP(b64, resolved.mediaType, prompt, context.abort)
615
- } else {
616
- // Throttle live progress updates so we don't spam the UI while the
617
- // vision model streams a long response.
618
- let lastUpdate = 0
619
- const onProgress: ProgressFn = (info) => {
620
- const now = Date.now()
621
- if (now - lastUpdate < 400) return
622
- lastUpdate = now
623
- context.metadata({
624
- title: `see_image: reading… ${info.chars} chars (${info.model})`,
625
- metadata: { streaming: true, chars: info.chars, preview: info.preview },
626
- })
651
+ // Live feedback while we wait: an animated heartbeat bar plus, once the
652
+ // vision model starts streaming, a growing char count and a preview of
653
+ // the latest text. The timer ticks independently so the bar animates
654
+ // even before any tokens arrive; onProgress feeds it streamed content.
655
+ const started = Date.now()
656
+ let tick = 0
657
+ const live = { chars: 0, preview: "", model: "" }
658
+ const onProgress: ProgressFn = (info) => {
659
+ live.chars = info.chars
660
+ live.preview = info.preview
661
+ if (info.model) live.model = info.model
662
+ }
663
+ const render = () => {
664
+ const secs = Math.round((Date.now() - started) / 1000)
665
+ const bar = heartbeatBar(++tick)
666
+ const label = live.chars > 0 ? `reading… ${live.chars} chars` : "looking…"
667
+ const model = live.model ? ` · ${live.model}` : ""
668
+ context.metadata({
669
+ title: `see_image ${bar} ${label} · ${secs}s${model}`,
670
+ metadata: {
671
+ elapsedSeconds: secs,
672
+ chars: live.chars,
673
+ preview: live.preview,
674
+ model: live.model,
675
+ },
676
+ })
677
+ }
678
+ render()
679
+ const heartbeat = setInterval(render, 500)
680
+
681
+ try {
682
+ if (process.env.SEE_IMAGE_API_KEY) {
683
+ const b64 = resolved.dataUrl.split(",")[1] || ""
684
+ result = await seeImageViaHTTP(
685
+ b64,
686
+ resolved.mediaType,
687
+ prompt,
688
+ context.abort,
689
+ )
690
+ } else {
691
+ result = await seeImageViaSDK(
692
+ client,
693
+ resolved.dataUrl,
694
+ resolved.mediaType,
695
+ prompt,
696
+ context.abort,
697
+ onProgress,
698
+ )
627
699
  }
628
- result = await seeImageViaSDK(
629
- client,
630
- resolved.dataUrl,
631
- resolved.mediaType,
632
- prompt,
633
- context.abort,
634
- onProgress,
635
- )
700
+ } finally {
701
+ clearInterval(heartbeat)
636
702
  }
637
703
 
638
704
  context.metadata({
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "opencode-see-image",
3
- "version": "0.10.0",
3
+ "version": "0.10.1",
4
4
  "description": "Give non-vision opencode models the ability to see images/screenshots by routing them to a vision-capable model (MiniMax M3 via opencode-go by default).",
5
5
  "type": "module",
6
6
  "main": "index.ts",