npm - opencode-see-image - Versions diffs - 0.9.3 → 0.10.1 - Mend

opencode-see-image 0.9.3 → 0.10.1

This diff shows the changes between two publicly released versions of a package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.

Files changed (4)

package/.claude/settings.local.json +19 -1
package/README.md +9 -1
package/index.ts +241 -98
package/package.json +1 -1

package/.claude/settings.local.json CHANGED Viewed

@@ -12,7 +12,25 @@
       "Read(//Users/alfa/Documents/opencodeprojects/opencode-see-image/bun-types/**)",
       "Bash(bun run *)",
       "WebFetch(domain:docs.z.ai)",
-      "Bash(npm publish *)"
+      "Bash(npm publish *)",
+      "Bash(python3 -c ' *)",
+      "Bash(open -a Preview \"/Users/alfa/.claude/image-cache/31fd2007-9418-45bb-a3e5-d273327f5f78/3.png\" \"/Users/alfa/.claude/image-cache/31fd2007-9418-45bb-a3e5-d273327f5f78/4.png\" \"/Users/alfa/.claude/image-cache/31fd2007-9418-45bb-a3e5-d273327f5f78/5.png\")",
+      "Bash(ps -o etime= -p 82196)",
+      "Bash(echo \"STILL RUNNING \\($\\(ps -o etime= -p 82196)",
+      "Bash(awk '/export type TextPart = \\\\{/,/\\\\};/' node_modules/@opencode-ai/sdk/dist/gen/types.gen.d.t)",
+      "Bash(awk '{print $2, $9, $11, $12, $13}')",
+      "Bash(pkill -f \"[o]pencode run\")",
+      "Bash(pkill -f \"[o]pencode-run\")",
+      "Bash(pkill -f \"seq 1 40\")",
+      "Bash(rm -f verify.json)",
+      "Bash(opencode run *)",
+      "Bash(echo \"exit=$? done=$\\(date +%T\\) bytes=$\\(wc -c < /tmp/verify.json\\)\")",
+      "Bash(pkill -9 -f \"opencode run\")",
+      "Bash(pkill -9 -f \"simple.json\\\\|verify.json\\\\|quick.json\\\\|strm\")",
+      "Bash(pkill -9 -f \"14.39.13\")",
+      "Bash(npm dist-tag *)",
+      "Bash(awk '{print $2, $11, $12, $13, $14}')",
+      "Bash(awk '/export type ToolState =/,/^};|^export \\(type|declare\\)/' sdk/dist/gen/types.gen.d.ts)"
     ]
   }
 }

package/README.md CHANGED Viewed

@@ -102,7 +102,15 @@ all settings are env-var overrides. The plugin uses opencode's SDK client by def
 | `SEE_IMAGE_ENDPOINT` | `https://opencode.ai/zen/go/v1/messages` | HTTP endpoint (only used if `SEE_IMAGE_API_KEY` is set) |
 | `SEE_IMAGE_API_VERSION` | `2023-06-01` | `anthropic-version` header (HTTP mode only) |
 | `SEE_IMAGE_USER_AGENT` | _(Chrome UA)_ | User-Agent header (HTTP mode only) |
-| `SEE_IMAGE_TIMEOUT` | `30000` | Per-candidate timeout in ms. Prevents hanging on slow models. |
+| `SEE_IMAGE_TIMEOUT` | `30000` | Timeout in ms for session setup and HTTP-mode calls. |
+| `SEE_IMAGE_STALL_TIMEOUT` | `60000` | Stall timeout in ms (SDK streaming). The call is only aborted if the vision model produces no new tokens for this long — so long transcriptions keep running as long as they're progressing. |
+| `SEE_IMAGE_MAX_TIMEOUT` | `0` | Absolute cap in ms on a single streaming call. `0` = no cap. |
+### live progress
+While the vision model works, the tool call shows an animated heartbeat bar plus live status, e.g. `see_image ░▒▓█▓▒░ reading… 1240 chars · 7s · minimax-m3`. The char count and a preview of the latest text update as tokens stream in, so you can see it's alive and watch the description form.
+The preferred path streams from the vision model via opencode's event stream and uses a **stall timeout** (`SEE_IMAGE_STALL_TIMEOUT`) instead of a hard cutoff: a slow-but-progressing model (e.g. transcribing a huge table) runs to completion, while a genuinely silent/hung call is reaped. If streaming isn't available or a call is cut short, the plugin falls back to a reliable non-streaming CLI call to the same model (full answer, no live preview), then to the free model.
 ### using a different vision model

package/index.ts CHANGED Viewed

@@ -12,6 +12,22 @@ const ENDPOINT =
 const MODEL = process.env.SEE_IMAGE_MODEL || "minimax-m3"
 const PROVIDER_ID = process.env.SEE_IMAGE_PROVIDER || "opencode-go"
 const TIMEOUT = parseInt(process.env.SEE_IMAGE_TIMEOUT || "30000", 10)
+// Stall timeout (SDK streaming path): abort only if the model produces no new
+// tokens for this long. A slow-but-progressing call keeps running.
+const STALL_TIMEOUT = parseInt(process.env.SEE_IMAGE_STALL_TIMEOUT || "60000", 10)
+// Optional absolute cap on a single vision call, in ms (0 = no cap).
+const MAX_TIMEOUT = parseInt(process.env.SEE_IMAGE_MAX_TIMEOUT || "0", 10)
+// Animated heartbeat: a flowing gradient wave shown in the tool title while we
+// wait, so the user can see the call is alive and not frozen.
+const HEARTBEAT_FRAMES = ["░", "▒", "▓", "█", "▓", "▒", "░"]
+function heartbeatBar(tick: number, width = 14): string {
+  let s = ""
+  for (let i = 0; i < width; i++) {
+    s += HEARTBEAT_FRAMES[(i + tick) % HEARTBEAT_FRAMES.length]
+  }
+  return s
+}
 const API_VERSION = process.env.SEE_IMAGE_API_VERSION || "2023-06-01"
 const USER_AGENT =
   process.env.SEE_IMAGE_USER_AGENT ||
@@ -216,12 +232,15 @@ function readProviderKey(providerID: string): string | null {
   }
 }
+type ProgressFn = (info: { chars: number; preview: string; model: string }) => void
 async function seeImageViaSDK(
   client: any,
   dataUrl: string,
   mediaType: string,
   prompt: string,
   abort?: AbortSignal,
+  onProgress?: ProgressFn,
 ): Promise<{ text: string; model: string; provider: string }> {
   const errors: string[] = []
@@ -245,13 +264,147 @@ async function seeImageViaSDK(
     return tmpPath
   }
-  // For free opencode models, use CLI instead of SDK (SDK returns empty).
-  // Use Bun.spawn (not $) so we get a killable handle: Bun's $ ShellPromise
-  // has no .kill(), so racing it against a timeout would leak the process.
-  // We kill the child on both timeout and external abort.
-  const freeFallback = async (modelID: string, userPrompt: string): Promise<string | null> => {
+  // Two runners back the candidate list:
+  //
+  // streamViaSDK — subscribes to opencode's event stream so we get text
+  //   token-by-token. This drives the live content preview AND token-based
+  //   stall detection (abort only after STALL_TIMEOUT of silence). It also
+  //   races the prompt against a stall/max rejection, so a hung call can't
+  //   block past the stall window even if the abort signal is ignored. Only
+  //   used when an event stream is actually available (its whole point).
+  //
+  // runViaCLI — `opencode run -m <provider>/<model>` via Bun.spawn (killable).
+  //   The proven, reliable fallback. It buffers --format json output until
+  //   exit, so it gives no live preview, but it returns the full answer.
+  const streamViaSDK = async (
+    providerID: string,
+    modelID: string,
+  ): Promise<string | null> => {
+    const sessionRes = await Promise.race([
+      client.session.create({ body: {} }),
+      new Promise<never>((_, reject) =>
+        setTimeout(
+          () => reject(new Error(`session.create timed out after ${TIMEOUT}ms`)),
+          TIMEOUT,
+        ),
+      ),
+    ])
+    const sessionID: string | undefined = sessionRes.data?.id
+    if (!sessionID) throw new Error("no session ID")
+    const cleanupSession = () =>
+      client.session.delete({ path: { id: sessionID } }).catch(() => {})
+    // The SDK path exists for the live preview; if we can't get an event
+    // stream there's nothing to preview or to measure stalls against, so bail
+    // and let the loop fall through to the reliable CLI runner.
+    let stream: AsyncGenerator<any> | undefined
+    try {
+      stream = (await client.event.subscribe())?.stream
+    } catch {}
+    if (!stream) {
+      cleanupSession()
+      return null
+    }
+    const controller = new AbortController()
+    const onAbort = () => controller.abort()
+    abort?.addEventListener("abort", onAbort)
+    const partsByID = new Map<string, string>()
+    let streamedText = ""
+    let lastActivity = Date.now()
+    let finished = false
+    const consume = (async () => {
+      try {
+        for await (const ev of stream!) {
+          if (finished) break
+          const p = ev?.properties?.part
+          if (
+            ev?.type === "message.part.updated" &&
+            p?.type === "text" &&
+            p.sessionID === sessionID
+          ) {
+            partsByID.set(p.id, typeof p.text === "string" ? p.text : "")
+            streamedText = [...partsByID.values()].join("\n").trim()
+            lastActivity = Date.now()
+            onProgress?.({
+              chars: streamedText.length,
+              preview: streamedText.slice(-200),
+              model: modelID,
+            })
+          }
+        }
+      } catch {}
+    })()
+    let stallTimer: ReturnType<typeof setInterval> | undefined
+    let maxTimer: ReturnType<typeof setTimeout> | undefined
+    const guard = new Promise<never>((_, reject) => {
+      stallTimer = setInterval(() => {
+        if (Date.now() - lastActivity > STALL_TIMEOUT) {
+          controller.abort()
+          reject(new Error(`stalled: no tokens for ${STALL_TIMEOUT}ms`))
+        }
+      }, 1000)
+      if (MAX_TIMEOUT > 0) {
+        maxTimer = setTimeout(() => {
+          controller.abort()
+          reject(new Error(`exceeded MAX_TIMEOUT ${MAX_TIMEOUT}ms`))
+        }, MAX_TIMEOUT)
+      }
+    })
+    let res: any
+    try {
+      res = await Promise.race([
+        client.session.prompt({
+          path: { id: sessionID },
+          body: {
+            model: { providerID, modelID },
+            parts: [
+              { type: "file", mime: mediaType, url: dataUrl },
+              { type: "text", text: prompt },
+            ],
+            tools: {},
+            system:
+              "You are a vision assistant. Describe the image accurately and concisely. Answer with text only.",
+          },
+          signal: controller.signal,
+        }),
+        guard,
+      ])
+    } catch (e: any) {
+      // Stalled / aborted / errored — keep whatever streamed in so far.
+      if (!streamedText) throw e
+    } finally {
+      finished = true
+      if (stallTimer) clearInterval(stallTimer)
+      if (maxTimer) clearTimeout(maxTimer)
+      try { await stream.return?.(undefined) } catch {}
+      abort?.removeEventListener("abort", onAbort)
+      cleanupSession()
+    }
+    const finalText = (res?.data?.parts ?? [])
+      .filter((p: any) => p.type === "text")
+      .map((p: any) => p.text)
+      .filter((t: any) => typeof t === "string" && t.length > 0)
+      .join("\n")
+      .trim()
+    return finalText || streamedText || null
+  }
+  const runViaCLI = async (
+    providerID: string,
+    modelID: string,
+  ): Promise<string | null> => {
     const filePath = ensureTmpFile()
     if (!filePath) return null
+    onProgress?.({ chars: 0, preview: "", model: modelID })
     const proc = Bun.spawn(
       [
         "opencode",
@@ -259,122 +412,72 @@ async function seeImageViaSDK(
         "-f",
         filePath,
         "-m",
-        `opencode/${modelID}`,
-        userPrompt,
+        `${providerID}/${modelID}`,
+        prompt,
         "--format",
         "json",
         "--dangerously-skip-permissions",
       ],
       { stdout: "pipe", stderr: "ignore" },
     )
-    const timer = setTimeout(() => proc.kill(), TIMEOUT)
     const onAbort = () => proc.kill()
     abort?.addEventListener("abort", onAbort)
+    const maxTimer =
+      MAX_TIMEOUT > 0 ? setTimeout(() => proc.kill(), MAX_TIMEOUT) : undefined
     try {
       const out = await new Response(proc.stdout).text()
       await proc.exited
+      const parts = new Map<string, string>()
       for (const line of out.split("\n").filter(Boolean)) {
         try {
-          const parsed = JSON.parse(line)
-          if (parsed?.part?.type === "text" && parsed?.part?.text) {
-            return parsed.part.text
+          const p = JSON.parse(line)?.part
+          if (p?.type === "text" && typeof p.text === "string") {
+            parts.set(p.id ?? String(parts.size), p.text)
           }
         } catch {}
       }
-    } catch {} finally {
-      clearTimeout(timer)
+      return [...parts.values()].join("\n").trim() || null
+    } catch {
+      return null
+    } finally {
+      if (maxTimer) clearTimeout(maxTimer)
       abort?.removeEventListener("abort", onAbort)
     }
-    return null
   }
   let result: { text: string; model: string; provider: string } | undefined
   try {
-    const candidates: Array<{ providerID: string; modelID: string }> = []
+    const candidates: Array<{
+      providerID: string
+      modelID: string
+      mode: "sdk" | "cli"
+    }> = []
     const envProvider = process.env.SEE_IMAGE_PROVIDER
     const envModel = process.env.SEE_IMAGE_MODEL
     if (envProvider && envModel) {
-      candidates.push({ providerID: envProvider, modelID: envModel })
+      candidates.push({ providerID: envProvider, modelID: envModel, mode: "sdk" })
     }
-    candidates.push({ providerID: "opencode-go", modelID: "minimax-m3" })
-    candidates.push({ providerID: "opencode", modelID: "mimo-v2.5-free" })
-    for (const { providerID, modelID } of candidates) {
-      if (providerID === "opencode") {
-        // SDK session.prompt returns empty for free models; use CLI instead
-        const text = await freeFallback(modelID, prompt)
-        if (text) {
-          result = { text, model: modelID, provider: providerID }
-          break
-        }
-        errors.push(`${providerID}/${modelID}: no text from CLI fallback`)
-        continue
-      }
+    // Prefer streaming minimax (live preview); fall back to the same model via
+    // the proven CLI runner; then the free model via CLI.
+    candidates.push({ providerID: "opencode-go", modelID: "minimax-m3", mode: "sdk" })
+    candidates.push({ providerID: "opencode-go", modelID: "minimax-m3", mode: "cli" })
+    candidates.push({ providerID: "opencode", modelID: "mimo-v2.5-free", mode: "cli" })
-      let sessionID: string | undefined
+    for (const { providerID, modelID, mode } of candidates) {
       try {
-        const sessionRes = await Promise.race([
-          client.session.create({ body: {} }),
-          new Promise<never>((_, reject) =>
-            setTimeout(
-              () => reject(new Error(`session.create timed out after ${TIMEOUT}ms`)),
-              TIMEOUT,
-            ),
-          ),
-        ])
-        sessionID = sessionRes.data?.id
-        if (!sessionID) {
-          errors.push(`${providerID}/${modelID}: no session ID`)
-          continue
-        }
-        const controller = new AbortController()
-        const onAbort = () => controller.abort()
-        abort?.addEventListener("abort", onAbort)
-        const timer = setTimeout(() => controller.abort(), TIMEOUT)
-        let res
-        try {
-          res = await client.session.prompt({
-            path: { id: sessionID },
-            body: {
-              model: { providerID, modelID },
-              parts: [
-                { type: "file", mime: mediaType, url: dataUrl },
-                { type: "text", text: prompt },
-              ],
-              tools: {},
-              system:
-                "You are a vision assistant. Describe the image accurately and concisely. Answer with text only.",
-            },
-            signal: controller.signal,
-          })
-        } finally {
-          clearTimeout(timer)
-          abort?.removeEventListener("abort", onAbort)
-        }
-        const parts = res.data?.parts ?? []
-        const text = (parts as any[])
-          .filter((p: any) => p.type === "text")
-          .map((p: any) => p.text)
-          .filter((t: any) => typeof t === "string" && t.length > 0)
-          .join("\n")
-          .trim()
+        const text =
+          mode === "sdk"
+            ? await streamViaSDK(providerID, modelID)
+            : await runViaCLI(providerID, modelID)
         if (text) {
           result = { text, model: modelID, provider: providerID }
           break
         }
-        errors.push(`${providerID}/${modelID}: no text in response`)
+        errors.push(`${providerID}/${modelID} (${mode}): no text`)
       } catch (e: any) {
-        errors.push(`${providerID}/${modelID}: ${e?.message ?? e}`)
-      } finally {
-        if (sessionID) {
-          await client.session
-            .delete({ path: { id: sessionID } })
-            .catch(() => {})
-        }
+        errors.push(`${providerID}/${modelID} (${mode}): ${e?.message ?? e}`)
       }
     }
@@ -545,17 +648,57 @@ const SeeImagePlugin: Plugin = async (ctx) => {
       let result: { text: string; model: string; provider: string }
-      if (process.env.SEE_IMAGE_API_KEY) {
-        const b64 = resolved.dataUrl.split(",")[1] || ""
-        result = await seeImageViaHTTP(b64, resolved.mediaType, prompt, context.abort)
-      } else {
-        result = await seeImageViaSDK(
-          client,
-          resolved.dataUrl,
-          resolved.mediaType,
-          prompt,
-          context.abort,
-        )
+      // Live feedback while we wait: an animated heartbeat bar plus, once the
+      // vision model starts streaming, a growing char count and a preview of
+      // the latest text. The timer ticks independently so the bar animates
+      // even before any tokens arrive; onProgress feeds it streamed content.
+      const started = Date.now()
+      let tick = 0
+      const live = { chars: 0, preview: "", model: "" }
+      const onProgress: ProgressFn = (info) => {
+        live.chars = info.chars
+        live.preview = info.preview
+        if (info.model) live.model = info.model
+      }
+      const render = () => {
+        const secs = Math.round((Date.now() - started) / 1000)
+        const bar = heartbeatBar(++tick)
+        const label = live.chars > 0 ? `reading… ${live.chars} chars` : "looking…"
+        const model = live.model ? ` · ${live.model}` : ""
+        context.metadata({
+          title: `see_image ${bar} ${label} · ${secs}s${model}`,
+          metadata: {
+            elapsedSeconds: secs,
+            chars: live.chars,
+            preview: live.preview,
+            model: live.model,
+          },
+        })
+      }
+      render()
+      const heartbeat = setInterval(render, 500)
+      try {
+        if (process.env.SEE_IMAGE_API_KEY) {
+          const b64 = resolved.dataUrl.split(",")[1] || ""
+          result = await seeImageViaHTTP(
+            b64,
+            resolved.mediaType,
+            prompt,
+            context.abort,
+          )
+        } else {
+          result = await seeImageViaSDK(
+            client,
+            resolved.dataUrl,
+            resolved.mediaType,
+            prompt,
+            context.abort,
+            onProgress,
+          )
+        }
+      } finally {
+        clearInterval(heartbeat)
       }
       context.metadata({

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "opencode-see-image",
-  "version": "0.9.3",
+  "version": "0.10.1",
   "description": "Give non-vision opencode models the ability to see images/screenshots by routing them to a vision-capable model (MiniMax M3 via opencode-go by default).",
   "type": "module",
   "main": "index.ts",