opencode-see-image 0.8.3 → 0.8.4

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
Files changed (2)
  1. package/index.ts +91 -77
  2. package/package.json +1 -1
package/index.ts CHANGED
@@ -11,7 +11,7 @@ const ENDPOINT =
11
11
  "https://opencode.ai/zen/go/v1/messages"
12
12
  const MODEL = process.env.SEE_IMAGE_MODEL || "minimax-m3"
13
13
  const PROVIDER_ID = process.env.SEE_IMAGE_PROVIDER || "opencode-go"
14
- const TIMEOUT = parseInt(process.env.SEE_IMAGE_TIMEOUT || "30000", 10)
14
+ const TIMEOUT = parseInt(process.env.SEE_IMAGE_TIMEOUT || "10000", 10)
15
15
  const API_VERSION = process.env.SEE_IMAGE_API_VERSION || "2023-06-01"
16
16
  const USER_AGENT =
17
17
  process.env.SEE_IMAGE_USER_AGENT ||
@@ -224,88 +224,102 @@ async function seeImageViaSDK(
224
224
  ): Promise<{ text: string; model: string; provider: string }> {
225
225
  const errors: string[] = []
226
226
 
227
- // Create new sessions with specific provider/model candidates
228
- const envProvider = process.env.SEE_IMAGE_PROVIDER
229
- const envModel = process.env.SEE_IMAGE_MODEL
230
- const candidates: Array<{ providerID: string; modelID: string }> = []
231
- if (envProvider && envModel) {
232
- candidates.push({ providerID: envProvider, modelID: envModel })
233
- }
234
- candidates.push({ providerID: "opencode-go", modelID: "minimax-m3" })
235
- candidates.push({ providerID: "opencode", modelID: "mimo-v2.5-free" })
236
-
237
- for (const { providerID, modelID } of candidates) {
238
- let sessionID: string | undefined
239
- try {
240
- const sessionRes = await client.session.create({ body: {} })
241
- sessionID = sessionRes.data?.id
242
- if (!sessionID) {
243
- errors.push(`${providerID}/${modelID}: no session ID`)
244
- continue
245
- }
227
+ // Write image to a temp file so the server can read it directly
228
+ const b64 = dataUrl.split(",")[1] || ""
229
+ const tmpPath = path.join(os.tmpdir(), `see-image-${Date.now()}.png`)
230
+ try {
231
+ fs.writeFileSync(tmpPath, Buffer.from(b64, "base64"))
232
+ } catch {}
246
233
 
247
- // Per-candidate timeout so a slow model doesn't hang forever
248
- const controller = new AbortController()
249
- const timer = setTimeout(() => controller.abort(), TIMEOUT)
250
-
251
- const result = await client.session.prompt({
252
- path: { id: sessionID },
253
- body: {
254
- model: { providerID, modelID },
255
- parts: [
256
- { type: "file", mime: mediaType, url: dataUrl },
257
- { type: "text", text: prompt },
258
- ],
259
- tools: {},
260
- system:
261
- "You are a vision assistant. Describe the image accurately and concisely. Answer with text only.",
262
- },
263
- signal: controller.signal,
264
- })
265
- clearTimeout(timer)
266
-
267
- const parts = result.data?.parts ?? []
268
- const text = (parts as any[])
269
- .filter((p: any) => p.type === "text")
270
- .map((p: any) => p.text)
271
- .filter((t: any) => typeof t === "string" && t.length > 0)
272
- .join("\n")
273
- .trim()
274
-
275
- if (text) {
276
- return { text, model: modelID, provider: providerID }
277
- }
278
- errors.push(`${providerID}/${modelID}: no text in response`)
279
- } catch (e: any) {
280
- errors.push(`${providerID}/${modelID}: ${e?.message ?? e}`)
281
- } finally {
282
- if (sessionID) {
283
- await client.session
284
- .delete({ path: { id: sessionID } })
285
- .catch(() => {})
234
+ const fileUrl = `file://${tmpPath}`
235
+ let result: { text: string; model: string; provider: string } | undefined
236
+
237
+ try {
238
+ const candidates: Array<{ providerID: string; modelID: string }> = []
239
+ const envProvider = process.env.SEE_IMAGE_PROVIDER
240
+ const envModel = process.env.SEE_IMAGE_MODEL
241
+ if (envProvider && envModel) {
242
+ candidates.push({ providerID: envProvider, modelID: envModel })
243
+ }
244
+ candidates.push({ providerID: "opencode-go", modelID: "minimax-m3" })
245
+ candidates.push({ providerID: "opencode", modelID: "mimo-v2.5-free" })
246
+
247
+ for (const { providerID, modelID } of candidates) {
248
+ let sessionID: string | undefined
249
+ try {
250
+ const sessionRes = await client.session.create({ body: {} })
251
+ sessionID = sessionRes.data?.id
252
+ if (!sessionID) {
253
+ errors.push(`${providerID}/${modelID}: no session ID`)
254
+ continue
255
+ }
256
+
257
+ const controller = new AbortController()
258
+ const timer = setTimeout(() => controller.abort(), TIMEOUT)
259
+ const res = await client.session.prompt({
260
+ path: { id: sessionID },
261
+ body: {
262
+ model: { providerID, modelID },
263
+ parts: [
264
+ { type: "file", mime: mediaType, url: fileUrl },
265
+ { type: "text", text: prompt },
266
+ ],
267
+ tools: {},
268
+ system:
269
+ "You are a vision assistant. Describe the image accurately and concisely. Answer with text only.",
270
+ },
271
+ signal: controller.signal,
272
+ })
273
+ clearTimeout(timer)
274
+
275
+ const parts = res.data?.parts ?? []
276
+ const text = (parts as any[])
277
+ .filter((p: any) => p.type === "text")
278
+ .map((p: any) => p.text)
279
+ .filter((t: any) => typeof t === "string" && t.length > 0)
280
+ .join("\n")
281
+ .trim()
282
+
283
+ if (text) {
284
+ result = { text, model: modelID, provider: providerID }
285
+ break
286
+ }
287
+ errors.push(`${providerID}/${modelID}: no text in response`)
288
+ } catch (e: any) {
289
+ errors.push(`${providerID}/${modelID}: ${e?.message ?? e}`)
290
+ } finally {
291
+ if (sessionID) {
292
+ await client.session
293
+ .delete({ path: { id: sessionID } })
294
+ .catch(() => {})
295
+ }
286
296
  }
287
297
  }
288
- }
289
298
 
290
- // If user has an API key configured (from auth.json or env), try HTTP fallback
291
- const b64 = dataUrl.split(",")[1] || ""
292
- const apiKey =
293
- process.env.SEE_IMAGE_API_KEY || readProviderKey("opencode-go")
294
- if (apiKey) {
295
- try {
296
- return await seeImageViaHTTP(b64, mediaType, prompt, abort, apiKey)
297
- } catch (e: any) {
298
- errors.push(`http-fallback: ${e?.message ?? e}`)
299
+ if (!result) {
300
+ const apiKey =
301
+ process.env.SEE_IMAGE_API_KEY || readProviderKey("opencode-go")
302
+ if (apiKey) {
303
+ try {
304
+ result = await seeImageViaHTTP(b64, mediaType, prompt, abort, apiKey)
305
+ } catch (e: any) {
306
+ errors.push(`http-fallback: ${e?.message ?? e}`)
307
+ }
308
+ }
299
309
  }
300
- }
301
310
 
302
- const errMsg = errors.join("; ")
303
- const hint = errMsg.includes("usage limit")
304
- ? ` Enable usage from your balance at https://opencode.ai/workspace/wrk_01KVARG0A0Y87XV5JYBNJ0WRXB/go`
305
- : ""
306
- throw new Error(
307
- `see_image: SDK vision call failed for all candidates. ${errMsg}.${hint}`,
308
- )
311
+ if (result) return result
312
+
313
+ const errMsg = errors.join("; ")
314
+ const hint = errMsg.includes("usage limit")
315
+ ? ` Enable usage from your balance at https://opencode.ai/workspace/wrk_01KVARG0A0Y87XV5JYBNJ0WRXB/go`
316
+ : ""
317
+ throw new Error(
318
+ `see_image: SDK vision call failed for all candidates. ${errMsg}.${hint}`,
319
+ )
320
+ } finally {
321
+ try { fs.unlinkSync(tmpPath) } catch {}
322
+ }
309
323
  }
310
324
 
311
325
  async function seeImageViaHTTP(
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "opencode-see-image",
3
- "version": "0.8.3",
3
+ "version": "0.8.4",
4
4
  "description": "Give non-vision opencode models the ability to see images/screenshots by routing them to a vision-capable model (MiniMax M3 via opencode-go by default).",
5
5
  "type": "module",
6
6
  "main": "index.ts",