@yandy0725/pi-vision-tools 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +130 -0
- package/README.zh.md +130 -0
- package/index.ts +218 -0
- package/package.json +45 -0
- package/src/compress.ts +67 -0
- package/src/config.ts +79 -0
- package/src/image.ts +77 -0
- package/src/reasoning.ts +22 -0
- package/src/state.ts +20 -0
- package/src/vision.ts +90 -0
package/README.md
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
# pi-vision-tools
|
|
2
|
+
|
|
3
|
+
Lets non-multimodal models analyze images by delegating to a configured vision model. A single `describe_image` tool + `/vision` command.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **One tool** (`describe_image`) that sends an image + prompt to a vision-capable model and returns the text result to the calling model
|
|
8
|
+
- **Calling model controls cost/quality per call**: `compress` (on/off), `reasoning` (off through xhigh), and the prompt itself — no preconfiguration needed
|
|
9
|
+
- **Auto enable/disable** by calling model modality: if the current model already has image input, the tool disables itself; otherwise it's on
|
|
10
|
+
- **Footer indicator** (`👁 provider/model`) visible when the tool is active and a vision model is configured
|
|
11
|
+
- **No `/reload` required**: config changes take effect immediately
|
|
12
|
+
|
|
13
|
+
## How it works
|
|
14
|
+
|
|
15
|
+
```
|
|
16
|
+
calling model → describe_image → vision model → text back to calling model
|
|
17
|
+
(no vision) (image+prompt) (sees image)
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
1. The calling model invokes `describe_image` with an image and a prompt
|
|
21
|
+
2. The tool decodes the image, optionally compresses it with sharp, then calls the configured vision model
|
|
22
|
+
3. The vision model's text answer is returned as the tool result
|
|
23
|
+
|
|
24
|
+
## Install
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
pi install npm:@yandy0725/pi-vision-tools
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
Or add to `~/.pi/agent/settings.json`:
|
|
31
|
+
|
|
32
|
+
```json
|
|
33
|
+
{
|
|
34
|
+
"packages": ["npm:@yandy0725/pi-vision-tools"]
|
|
35
|
+
}
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Configuration
|
|
39
|
+
|
|
40
|
+
### `/vision` command
|
|
41
|
+
|
|
42
|
+
| Command | What it does |
|
|
43
|
+
|---------|-------------|
|
|
44
|
+
| `/vision` or `/vision status` | Show current config: provider/model, enabled state, effective on/off, whether the calling model has vision |
|
|
45
|
+
| `/vision config provider <p>` | Set the vision model provider (e.g. `openai`, `anthropic`) |
|
|
46
|
+
| `/vision config model <m>` | Set the vision model ID (e.g. `gpt-4o`, `claude-sonnet-4-20250514`) |
|
|
47
|
+
| `/vision config default-reasoning <level>` | Set default reasoning depth: `off`, `minimal`, `low`, `medium`, `high`, `xhigh` |
|
|
48
|
+
| `/vision on` | Force-enable the tool (even if the calling model has vision) |
|
|
49
|
+
| `/vision off` | Force-disable the tool |
|
|
50
|
+
| `/vision auto` | Auto mode: tool enabled only when the calling model lacks image input (default) |
|
|
51
|
+
|
|
52
|
+
Config is persisted to `~/.pi/agent/vision-tools.json` and takes effect immediately — no `/reload` needed.
|
|
53
|
+
|
|
54
|
+
### Optional: sharp
|
|
55
|
+
|
|
56
|
+
Install sharp for automatic image compression before sending:
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
npm install sharp
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
sharp is optional. Without it, images are sent as-is (no error). Compression downsamples the longest edge to ≤1568px, removes alpha, and converts to JPEG.
|
|
63
|
+
|
|
64
|
+
#### Environment variables
|
|
65
|
+
|
|
66
|
+
| Variable | Default | Description |
|
|
67
|
+
|----------|---------|-------------|
|
|
68
|
+
| `PI_VISION_MAX_DIM` | `1568` | Longest-edge pixel limit (1–10000) |
|
|
69
|
+
| `PI_VISION_JPEG_QUALITY` | `85` | JPEG quality (1–100) |
|
|
70
|
+
|
|
71
|
+
Set `compress: false` on any call to skip compression for pixel-perfect needs (reading coordinates, inspecting tiny UI elements).
|
|
72
|
+
|
|
73
|
+
## Tool reference
|
|
74
|
+
|
|
75
|
+
```
|
|
76
|
+
describe_image(image_path: string, prompt: string, compress?: boolean, reasoning?: string)
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
| Parameter | Required | Default | Description |
|
|
80
|
+
|-----------|:--------:|---------|-------------|
|
|
81
|
+
| `image_path` | yes | — | File path, `data:` URL, or raw base64 (>100 chars) |
|
|
82
|
+
| `prompt` | yes | — | Instruction for the vision model |
|
|
83
|
+
| `compress` | no | `true` | Compress before sending; set `false` for pixel-perfect |
|
|
84
|
+
| `reasoning` | no | configured `default-reasoning`, else `off` | Reasoning effort: `off`, `minimal`, `low`, `medium`, `high`, `xhigh` |
|
|
85
|
+
|
|
86
|
+
### Example tool call
|
|
87
|
+
|
|
88
|
+
```json
|
|
89
|
+
{
|
|
90
|
+
"image_path": "/home/user/screenshot.png",
|
|
91
|
+
"prompt": "Describe what you see in this screenshot.",
|
|
92
|
+
"compress": true,
|
|
93
|
+
"reasoning": "high"
|
|
94
|
+
}
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### Example prompts
|
|
98
|
+
|
|
99
|
+
| Goal | prompt |
|
|
100
|
+
|------|--------|
|
|
101
|
+
| Describe an image | `"Describe this image in detail."` |
|
|
102
|
+
| Read coordinates/position | `"What are the pixel coordinates of the submit button?"` |
|
|
103
|
+
| Extract text (OCR) | `"Extract all visible text from this image."` |
|
|
104
|
+
| Find UI bugs | `"Inspect this screenshot for layout, alignment, or text overflow issues."` |
|
|
105
|
+
| Explain a diagram | `"Explain this architecture diagram step by step."` |
|
|
106
|
+
| Analyze an error | `"What does this error message mean and how can it be fixed?"` |
|
|
107
|
+
|
|
108
|
+
### Reasoning levels
|
|
109
|
+
|
|
110
|
+
| Level | When to use |
|
|
111
|
+
|-------|-------------|
|
|
112
|
+
| `off` | Simple description, text extraction, basic Q&A |
|
|
113
|
+
| `minimal` | Quick glance, "what is this?" |
|
|
114
|
+
| `low` | Slightly more thought, moderate detail |
|
|
115
|
+
| `medium` | Detailed description, UI inspection |
|
|
116
|
+
| `high` | Complex analysis, architecture diagrams, code screenshots |
|
|
117
|
+
| `xhigh` | Deep reasoning, bug hunting, multi-step visual puzzles |
|
|
118
|
+
|
|
119
|
+
## Image formats
|
|
120
|
+
|
|
121
|
+
Supported: PNG, JPEG, GIF, WebP, BMP.
|
|
122
|
+
|
|
123
|
+
Input can be:
|
|
124
|
+
- A file path (`/path/to/image.png`, `./relative.png`, `~`-prefixed)
|
|
125
|
+
- A `data:` URL (`data:image/png;base64,...`)
|
|
126
|
+
- Raw base64 (string >100 characters, auto-detected)
|
|
127
|
+
|
|
128
|
+
## License
|
|
129
|
+
|
|
130
|
+
MIT
|
package/README.zh.md
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
# pi-vision-tools
|
|
2
|
+
|
|
3
|
+
让非多模态模型通过委托给配置好的视觉模型来分析图像。提供一个 `describe_image` 工具 + `/vision` 命令。
|
|
4
|
+
|
|
5
|
+
## 功能
|
|
6
|
+
|
|
7
|
+
- **一个工具**(`describe_image`),将图像 + 提示词发送给支持视觉的模型,并将文本结果返回给调用方模型
|
|
8
|
+
- **调用方模型按需控制成本/质量**:`compress`(开关)、`reasoning`(从 off 到 xhigh)以及提示词本身——无需预配置
|
|
9
|
+
- **按调用方模型模态自动开关**:如果当前模型已支持图像输入,工具自动禁用;否则启用
|
|
10
|
+
- **页脚指示器**(`👁 provider/model`),在工具激活且视觉模型已配置时可见
|
|
11
|
+
- **无需 `/reload`**:配置更改即时生效
|
|
12
|
+
|
|
13
|
+
## 工作原理
|
|
14
|
+
|
|
15
|
+
```
|
|
16
|
+
调用方模型 → describe_image → 视觉模型 → 文本返回给调用方模型
|
|
17
|
+
(无视觉能力) (图像+提示词) (能看图像)
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
1. 调用方模型调用 `describe_image`,传入图像和提示词
|
|
21
|
+
2. 工具解码图像,可选使用 sharp 压缩,然后调用配置好的视觉模型
|
|
22
|
+
3. 视觉模型的文本回答作为工具结果返回
|
|
23
|
+
|
|
24
|
+
## 安装
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
pi install npm:@yandy0725/pi-vision-tools
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
或在 `~/.pi/agent/settings.json` 中添加:
|
|
31
|
+
|
|
32
|
+
```json
|
|
33
|
+
{
|
|
34
|
+
"packages": ["npm:@yandy0725/pi-vision-tools"]
|
|
35
|
+
}
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## 配置
|
|
39
|
+
|
|
40
|
+
### `/vision` 命令
|
|
41
|
+
|
|
42
|
+
| 命令 | 功能 |
|
|
43
|
+
|---------|------|
|
|
44
|
+
| `/vision` 或 `/vision status` | 显示当前配置:provider/model、启用状态、生效开关、调用方模型是否支持视觉 |
|
|
45
|
+
| `/vision config provider <p>` | 设置视觉模型提供商(如 `openai`、`anthropic`) |
|
|
46
|
+
| `/vision config model <m>` | 设置视觉模型 ID(如 `gpt-4o`、`claude-sonnet-4-20250514`) |
|
|
47
|
+
| `/vision config default-reasoning <level>` | 设置默认推理深度:`off`、`minimal`、`low`、`medium`、`high`、`xhigh` |
|
|
48
|
+
| `/vision on` | 强制启用工具(即使调用方模型支持视觉) |
|
|
49
|
+
| `/vision off` | 强制禁用工具 |
|
|
50
|
+
| `/vision auto` | 自动模式:仅当调用方模型不支持图像输入时启用(默认) |
|
|
51
|
+
|
|
52
|
+
配置持久化到 `~/.pi/agent/vision-tools.json`,即时生效——无需 `/reload`。
|
|
53
|
+
|
|
54
|
+
### 可选:sharp
|
|
55
|
+
|
|
56
|
+
安装 sharp 以在发送前自动压缩图像:
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
npm install sharp
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
sharp 是可选的。没有它,图像将原样发送(不报错)。压缩会将最长边降采样至 ≤1568px,去除 alpha 通道,并转换为 JPEG 格式。
|
|
63
|
+
|
|
64
|
+
#### 环境变量
|
|
65
|
+
|
|
66
|
+
| 变量 | 默认值 | 说明 |
|
|
67
|
+
|----------|---------|------|
|
|
68
|
+
| `PI_VISION_MAX_DIM` | `1568` | 最长边像素上限(1–10000) |
|
|
69
|
+
| `PI_VISION_JPEG_QUALITY` | `85` | JPEG 质量(1–100) |
|
|
70
|
+
|
|
71
|
+
在任何调用中设置 `compress: false` 可跳过压缩,用于需要像素级精度的场景(读取坐标、检查微小 UI 元素)。
|
|
72
|
+
|
|
73
|
+
## 工具参考
|
|
74
|
+
|
|
75
|
+
```
|
|
76
|
+
describe_image(image_path: string, prompt: string, compress?: boolean, reasoning?: string)
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
| 参数 | 必填 | 默认值 | 说明 |
|
|
80
|
+
|-----------|:--:|---------|------|
|
|
81
|
+
| `image_path` | 是 | — | 文件路径、`data:` URL 或原始 base64(>100 字符) |
|
|
82
|
+
| `prompt` | 是 | — | 给视觉模型的指令 |
|
|
83
|
+
| `compress` | 否 | `true` | 发送前是否压缩;设为 `false` 以获取像素级精度 |
|
|
84
|
+
| `reasoning` | 否 | 已配置的 `default-reasoning`,否则为 `off` | 推理力度:`off`、`minimal`、`low`、`medium`、`high`、`xhigh` |
|
|
85
|
+
|
|
86
|
+
### 示例工具调用
|
|
87
|
+
|
|
88
|
+
```json
|
|
89
|
+
{
|
|
90
|
+
"image_path": "/home/user/screenshot.png",
|
|
91
|
+
"prompt": "请描述这张截图中看到的内容。",
|
|
92
|
+
"compress": true,
|
|
93
|
+
"reasoning": "high"
|
|
94
|
+
}
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### 示例提示词
|
|
98
|
+
|
|
99
|
+
| 用途 | 提示词 |
|
|
100
|
+
|------|--------|
|
|
101
|
+
| 描述图像 | `"请详细描述这张图片。"` |
|
|
102
|
+
| 读取坐标/位置 | `"提交按钮的像素坐标是多少?"` |
|
|
103
|
+
| 提取文字(OCR) | `"提取这张图片中所有可见的文字。"` |
|
|
104
|
+
| 查找 UI 缺陷 | `"检查这张截图中是否存在布局、对齐或文字溢出问题。"` |
|
|
105
|
+
| 解释图表 | `"请逐步解释这个架构图。"` |
|
|
106
|
+
| 分析错误 | `"这个错误信息是什么意思,如何修复?"` |
|
|
107
|
+
|
|
108
|
+
### 推理级别
|
|
109
|
+
|
|
110
|
+
| 级别 | 适用场景 |
|
|
111
|
+
|-------|------|
|
|
112
|
+
| `off` | 简单描述、文字提取、基础问答 |
|
|
113
|
+
| `minimal` | 快速浏览,"这是什么?" |
|
|
114
|
+
| `low` | 略多思考、中等细节 |
|
|
115
|
+
| `medium` | 详细描述、UI 检查 |
|
|
116
|
+
| `high` | 复杂分析、架构图、代码截图 |
|
|
117
|
+
| `xhigh` | 深度推理、bug 排查、多步骤视觉谜题 |
|
|
118
|
+
|
|
119
|
+
## 支持的图像格式
|
|
120
|
+
|
|
121
|
+
支持:PNG、JPEG、GIF、WebP、BMP。
|
|
122
|
+
|
|
123
|
+
输入可以是:
|
|
124
|
+
- 文件路径(`/path/to/image.png`、`./relative.png`,支持 `~` 前缀)
|
|
125
|
+
- `data:` URL(`data:image/png;base64,...`)
|
|
126
|
+
- 原始 base64(字符串 >100 字符,自动检测)
|
|
127
|
+
|
|
128
|
+
## 许可证
|
|
129
|
+
|
|
130
|
+
MIT
|
package/index.ts
ADDED
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
import { StringEnum } from "@earendil-works/pi-ai";
|
|
2
|
+
import { complete } from "@earendil-works/pi-ai/compat";
|
|
3
|
+
import { type ExtensionAPI, type ExtensionContext, getAgentDir } from "@earendil-works/pi-coding-agent";
|
|
4
|
+
import { Text } from "@earendil-works/pi-tui";
|
|
5
|
+
import { Type } from "typebox";
|
|
6
|
+
import { compressImage, readCompressionSettings } from "./src/compress.js";
|
|
7
|
+
import { loadConfig, saveConfig, type VisionConfig } from "./src/config.js";
|
|
8
|
+
import { type DecodedImage, decodeImage } from "./src/image.js";
|
|
9
|
+
import { effectiveReasoning, reasoningToOptions, type VisionReasoning } from "./src/reasoning.js";
|
|
10
|
+
import { callingModelHasVision, effectiveEnabled, footerLabel } from "./src/state.js";
|
|
11
|
+
import { callVision, resolveVisionModel } from "./src/vision.js";
|
|
12
|
+
|
|
13
|
+
const TOOL_NAME = "describe_image";
|
|
14
|
+
const STATUS_KEY = "pi-vision";
|
|
15
|
+
|
|
16
|
+
export default function (pi: ExtensionAPI) {
|
|
17
|
+
let config: VisionConfig = { enabled: "auto" };
|
|
18
|
+
let enabled = false;
|
|
19
|
+
|
|
20
|
+
/**
 * Recompute whether `describe_image` should be active for the current calling
 * model, then bring the active-tool list and (when a UI exists) the footer
 * status in line with that decision.
 */
const refresh = (ctx: ExtensionContext) => {
  enabled = effectiveEnabled(config, ctx.model);

  // Only touch the tool list when its current state disagrees with `enabled`.
  const activeTools = pi.getActiveTools();
  const isListed = activeTools.includes(TOOL_NAME);
  if (enabled !== isListed) {
    pi.setActiveTools(enabled ? [...activeTools, TOOL_NAME] : activeTools.filter((name) => name !== TOOL_NAME));
  }

  if (!ctx.hasUI) return;
  ctx.ui.setStatus(STATUS_KEY, footerLabel(config, enabled));
};
|
|
33
|
+
|
|
34
|
+
// On session start: read the persisted configuration, then sync tool/footer state.
pi.on("session_start", async (_event, ctx) => {
  const loaded = await loadConfig(getAgentDir());
  config = loaded;
  refresh(ctx);
});
// On model change: re-evaluate the tool state against the newly selected model.
pi.on("model_select", async (_event, ctx) => refresh(ctx));
|
|
41
|
+
|
|
42
|
+
// Register the `describe_image` tool: metadata, parameter schema, TUI renderers,
// and the execute pipeline (resolve model -> auth -> decode -> compress -> call vision).
pi.registerTool({
  name: TOOL_NAME,
  label: "Describe Image",
  description: [
    "Analyze an image by delegating to a vision-capable model. Lets non-multimodal models understand images. ",
    "`image_path` is a file path, data: URL, or raw base64 (>100 chars). ",
    "`compress` (default true) downscales/strips to speed up; set false for pixel-perfect needs. ",
    "`reasoning` controls the vision model's thinking effort (off/minimal/low/medium/high/xhigh).",
  ].join(""),
  promptSnippet: "describe_image: delegate image analysis to a vision model (non-multimodal models).",
  promptGuidelines: [
    "Use describe_image when you need to understand an image you cannot see (the calling model lacks vision).",
    "Set compress:false when you need pixel-perfect accuracy (reading coordinates, tiny UI elements).",
    "Set reasoning:'high'/'xhigh' for complex visual analysis (architecture diagrams, bug hunting).",
  ],
  parameters: Type.Object({
    image_path: Type.String({ description: "File path, data: URL, or raw base64 (>100 chars)." }),
    prompt: Type.String({
      description: "Instruction for the vision model, e.g. 'describe', 'extract text', 'find the bug'.",
    }),
    compress: Type.Optional(
      Type.Boolean({ default: true, description: "Compress image before sending (default true)." }),
    ),
    reasoning: Type.Optional(
      StringEnum(["off", "minimal", "low", "medium", "high", "xhigh"] as const, {
        description: "Vision model reasoning effort. Default off.",
      }),
    ),
  }),

  /** One-line call summary: tool title, image target cut to 40 chars, first 30 chars of the prompt. */
  renderCall(args, theme) {
    const { image_path: imagePath, prompt } = args as { image_path?: string; prompt?: string };
    let target = "...";
    if (imagePath) {
      target = imagePath.length > 40 ? `${imagePath.slice(0, 37)}...` : imagePath;
    }
    const title = theme.fg("toolTitle", theme.bold("describe_image "));
    const subject = theme.fg("accent", target);
    const hint = theme.fg("dim", ` · ${prompt?.slice(0, 30) ?? ""}`);
    return new Text(title + subject + hint, 0, 0);
  },

  /** Result view: the full text when expanded, otherwise the first six lines plus a "more lines" hint. */
  renderResult(result, { expanded }, theme) {
    const first = result.content?.[0];
    const body = first?.type === "text" ? first.text : "";
    if (expanded) return new Text(body, 0, 0);

    const allLines = body.split("\n");
    const shown = allLines.slice(0, 6);
    if (allLines.length > 6) {
      shown.push(theme.fg("dim", `... ${allLines.length - 6} more lines · ctrl+o to expand`));
    }
    return new Text(shown.join("\n"), 0, 0);
  },

  /** Run one image analysis; every failure is returned as an `isError` tool result. */
  async execute(_toolCallId, params, signal, onUpdate, ctx) {
    if (!enabled) {
      return {
        content: [{ type: "text", text: "describe_image is disabled. Run /vision on to enable." }],
        details: { error: "disabled" },
        isError: true,
      };
    }

    const {
      image_path: imagePath,
      prompt,
      compress,
      reasoning: requestedReasoning,
    } = params as { image_path: string; prompt: string; compress?: boolean; reasoning?: VisionReasoning };

    // Streams a short progress line to the caller when an update callback was supplied.
    const progress = (text: string) => onUpdate?.({ content: [{ type: "text", text }], details: {} });

    const resolved = resolveVisionModel(ctx.modelRegistry, config);
    if (!resolved.ok) {
      return { content: [{ type: "text", text: resolved.error }], details: { error: resolved.error }, isError: true };
    }

    const auth = await ctx.modelRegistry.getApiKeyAndHeaders(resolved.model);
    if (!auth.ok || !auth.apiKey) {
      const msg = auth.ok ? `No API key for ${config.provider}/${config.model}` : auth.error;
      return { content: [{ type: "text", text: msg }], details: { error: msg }, isError: true };
    }

    progress("Decoding image...");

    let image: DecodedImage;
    try {
      image = await decodeImage(imagePath);
    } catch (cause) {
      const msg = cause instanceof Error ? cause.message : String(cause);
      return { content: [{ type: "text", text: `Image decode failed: ${msg}` }], details: { error: msg }, isError: true };
    }

    // compressImage hands back the very same object when it did nothing,
    // so identity comparison tells us whether compression really happened.
    let compressed = false;
    if (compress !== false) {
      progress("Compressing...");
      const shrunk = await compressImage(image, readCompressionSettings());
      compressed = shrunk !== image;
      image = shrunk;
    }
    const mimeType = image.mimeType;

    progress("Analyzing image...");

    const reasoningLevel = effectiveReasoning(requestedReasoning, config.defaultReasoning);
    const result = await callVision(
      {
        model: resolved.model,
        auth: { apiKey: auth.apiKey, headers: auth.headers },
        prompt,
        images: [image],
        reasoning: reasoningToOptions(reasoningLevel),
        signal: signal ?? undefined,
      },
      complete,
    );

    if (result.errorMessage) {
      return {
        content: [{ type: "text", text: `Vision model error: ${result.errorMessage}` }],
        details: { error: result.errorMessage, model: `${config.provider}/${config.model}` },
        isError: true,
      };
    }

    return {
      content: [{ type: "text", text: result.text }],
      details: {
        model: `${config.provider}/${config.model}`,
        usage: result.usage,
        compressed,
        mimeType,
        reasoning: reasoningLevel,
      },
    };
  },
});
|
|
170
|
+
|
|
171
|
+
// Register the `/vision` slash command.
//   /vision | /vision status                  -> show the current configuration
//   /vision on | off | auto                   -> set the enabled mode
//   /vision config provider <p>               -> set the vision provider
//   /vision config model <m>                  -> set the vision model id
//   /vision config default-reasoning <level>  -> set the default reasoning level
// Every successful change is persisted with saveConfig and applied through refresh().
pi.registerCommand("vision", {
  description: "Configure the vision model for describe_image (/vision config | on | off | status)",
  handler: async (args, ctx) => {
    const parts = args.trim().split(/\s+/).filter(Boolean);
    const sub = parts[0];

    // Levels accepted for `default-reasoning`. The config parser rejects anything
    // else, and a rejected file makes the loader fall back to defaults, which
    // would silently drop the saved provider/model. Validate before saving.
    const reasoningLevels = ["off", "minimal", "low", "medium", "high", "xhigh"] as const;
    const isReasoningLevel = (value: string): value is VisionReasoning =>
      (reasoningLevels as readonly string[]).includes(value);

    const notifyConfig = () => {
      const target = config.provider && config.model ? `${config.provider}/${config.model}` : "(unconfigured)";
      const visionCap = callingModelHasVision(ctx.model) ? "yes" : "no";
      ctx.ui.notify(
        `vision: ${target}\nenabled: ${config.enabled} (effective: ${enabled ? "on" : "off"})\ncalling model has vision: ${visionCap}`,
        "info",
      );
    };

    if (!sub || sub === "status") {
      notifyConfig();
      return;
    }

    if (sub === "on" || sub === "off" || sub === "auto") {
      config = { ...config, enabled: sub as VisionConfig["enabled"] };
      await saveConfig(getAgentDir(), config);
      refresh(ctx);
      ctx.ui.notify(`vision ${sub}`, "info");
      return;
    }

    if (sub === "config") {
      const key = parts[1];
      const val = parts[2];
      if (key === "provider" && val) {
        config = { ...config, provider: val };
      } else if (key === "model" && val) {
        config = { ...config, model: val };
      } else if (key === "default-reasoning" && val) {
        if (!isReasoningLevel(val)) {
          ctx.ui.notify(
            `Invalid reasoning level "${val}". Expected one of: ${reasoningLevels.join(", ")}`,
            "warning",
          );
          return;
        }
        config = { ...config, defaultReasoning: val };
      } else {
        ctx.ui.notify("Usage: /vision config provider <p> | model <m> | default-reasoning <level>", "warning");
        return;
      }
      await saveConfig(getAgentDir(), config);
      refresh(ctx);
      ctx.ui.notify(`vision ${key} = ${val}`, "info");
      return;
    }

    ctx.ui.notify(
      "Usage: /vision [config provider <p> | config model <m> | config default-reasoning <level> | on | off | auto | status]",
      "warning",
    );
  },
});
|
|
218
|
+
}
|
package/package.json
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@yandy0725/pi-vision-tools",
|
|
3
|
+
"publishConfig": {
|
|
4
|
+
"access": "public"
|
|
5
|
+
},
|
|
6
|
+
"version": "0.1.0",
|
|
7
|
+
"description": "pi package adding a describe_image tool that lets non-multimodal models delegate image analysis to a vision model",
|
|
8
|
+
"license": "MIT",
|
|
9
|
+
"repository": {
|
|
10
|
+
"type": "git",
|
|
11
|
+
"url": "https://github.com/yandy/pi-packages",
|
|
12
|
+
"directory": "pi-vision-tools"
|
|
13
|
+
},
|
|
14
|
+
"type": "module",
|
|
15
|
+
"keywords": [
|
|
16
|
+
"pi-package"
|
|
17
|
+
],
|
|
18
|
+
"files": [
|
|
19
|
+
"index.ts",
|
|
20
|
+
"src/"
|
|
21
|
+
],
|
|
22
|
+
"scripts": {
|
|
23
|
+
"test": "vitest run",
|
|
24
|
+
"test:watch": "vitest",
|
|
25
|
+
"typecheck": "tsc --noEmit",
|
|
26
|
+
"lint": "biome lint .",
|
|
27
|
+
"format": "biome format --write .",
|
|
28
|
+
"check": "biome check ."
|
|
29
|
+
},
|
|
30
|
+
"pi": {
|
|
31
|
+
"extensions": [
|
|
32
|
+
"./index.ts"
|
|
33
|
+
]
|
|
34
|
+
},
|
|
35
|
+
"peerDependencies": {
|
|
36
|
+
"@earendil-works/pi-coding-agent": ">=0.74.0"
|
|
37
|
+
},
|
|
38
|
+
"dependencies": {
|
|
39
|
+
"@earendil-works/pi-ai": "^0.80.2"
|
|
40
|
+
},
|
|
41
|
+
"devDependencies": {
|
|
42
|
+
"@earendil-works/pi-tui": "^0.79.9",
|
|
43
|
+
"typebox": "^1.1.38"
|
|
44
|
+
}
|
|
45
|
+
}
|
package/src/compress.ts
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
import type { DecodedImage } from "./image.js";
|
|
2
|
+
|
|
3
|
+
export interface CompressionSettings {
|
|
4
|
+
maxDim: number;
|
|
5
|
+
jpegQuality: number;
|
|
6
|
+
}
|
|
7
|
+
|
|
8
|
+
const DEFAULT_MAX_DIM = 1568;
|
|
9
|
+
const DEFAULT_JPEG_QUALITY = 85;
|
|
10
|
+
|
|
11
|
+
/**
 * Parse an integer setting from an environment-variable string.
 *
 * The value must be a plain base-10 integer (optional sign, surrounding
 * whitespace allowed) inside `[min, max]`. Anything else yields `dflt`.
 * Strings such as `"1e3"` or `"85abc"` are rejected outright instead of being
 * truncated to their leading digits, which would otherwise turn `"1e3"` into `1`.
 *
 * @param v - Raw string value, or `undefined` when the variable is unset.
 * @param dflt - Value returned when `v` is missing, malformed, or out of range.
 * @param min - Smallest accepted value (inclusive).
 * @param max - Largest accepted value (inclusive).
 * @returns The parsed integer, or `dflt`.
 */
function parseIntOrDefault(v: string | undefined, dflt: number, min: number, max: number): number {
  if (v == null) return dflt;
  const trimmed = v.trim();
  // Require the whole string to be an integer; parseInt alone stops at the first non-digit.
  if (!/^[+-]?\d+$/.test(trimmed)) return dflt;
  const n = Number.parseInt(trimmed, 10);
  // A very long digit string parses to Infinity, so the finiteness check stays.
  if (!Number.isFinite(n) || n < min || n > max) return dflt;
  return n;
}
|
|
17
|
+
|
|
18
|
+
/**
 * Build the compression settings from environment variables.
 *
 * Reads `PI_VISION_MAX_DIM` (accepted range 1-10000) and
 * `PI_VISION_JPEG_QUALITY` (accepted range 1-100); each falls back to its
 * built-in default when it is unset or rejected by `parseIntOrDefault`.
 *
 * @param env - Environment to read from; defaults to `process.env`.
 */
export function readCompressionSettings(env: NodeJS.ProcessEnv = process.env): CompressionSettings {
  const { PI_VISION_MAX_DIM: rawMaxDim, PI_VISION_JPEG_QUALITY: rawQuality } = env;
  const maxDim = parseIntOrDefault(rawMaxDim, DEFAULT_MAX_DIM, 1, 10_000);
  const jpegQuality = parseIntOrDefault(rawQuality, DEFAULT_JPEG_QUALITY, 1, 100);
  return { maxDim, jpegQuality };
}
|
|
24
|
+
|
|
25
|
+
// Minimal structural type for the bits of sharp we use.
|
|
26
|
+
interface SharpPipeline {
|
|
27
|
+
resize(opts: { width?: number; height?: number; withoutEnlargement: boolean; fit: string }): SharpPipeline;
|
|
28
|
+
removeAlpha(): SharpPipeline;
|
|
29
|
+
jpeg(opts: { quality: number }): SharpPipeline;
|
|
30
|
+
toBuffer(): Promise<Buffer>;
|
|
31
|
+
}
|
|
32
|
+
type SharpModule = (data: Buffer) => SharpPipeline;
|
|
33
|
+
|
|
34
|
+
/**
 * Try to load the optional `sharp` module at runtime.
 *
 * Resolves to the module's default export when it has one, otherwise to the
 * imported namespace itself. Resolves to `null` when the import fails for any reason.
 */
const defaultSharpLoader = async (): Promise<SharpModule | null> => {
  try {
    // @ts-ignore sharp is an optional dependency, may not be installed
    const imported = (await import("sharp")) as { default?: SharpModule } & SharpModule;
    const { default: viaDefault } = imported;
    return viaDefault ?? (imported as unknown as SharpModule);
  } catch {
    return null;
  }
};
|
|
43
|
+
|
|
44
|
+
/**
 * Best-effort image compression.
 *
 * When a sharp module is available, the image is resized to fit inside a
 * `maxDim` x `maxDim` box (never enlarged), stripped of alpha, and re-encoded
 * as JPEG at `jpegQuality`. If the loader throws, yields no module, or the
 * pipeline fails, the original `image` object is returned unchanged, so callers
 * can detect "nothing happened" by identity.
 *
 * @param image - Decoded image to compress.
 * @param settings - Size limit and JPEG quality.
 * @param _sharpLoader - Injectable loader for the sharp module.
 * @returns A new JPEG image, or `image` itself when compression did not run.
 */
export async function compressImage(
  image: DecodedImage,
  settings: CompressionSettings,
  _sharpLoader: () => Promise<SharpModule | null> = defaultSharpLoader,
): Promise<DecodedImage> {
  try {
    const sharp = await _sharpLoader();
    if (!sharp) return image;

    const resized = sharp(image.data).resize({
      width: settings.maxDim,
      height: settings.maxDim,
      withoutEnlargement: true,
      fit: "inside",
    });
    const encoded = resized.removeAlpha().jpeg({ quality: settings.jpegQuality });
    const data = await encoded.toBuffer();
    return { data, mimeType: "image/jpeg" };
  } catch {
    // Loader or pipeline failure: fall back to the untouched input.
    return image;
  }
}
|
package/src/config.ts
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
import { mkdir, readFile, rename, writeFile } from "node:fs/promises";
|
|
2
|
+
import { join } from "node:path";
|
|
3
|
+
import type { ThinkingLevel } from "@earendil-works/pi-ai";
|
|
4
|
+
|
|
5
|
+
export type VisionEnabledState = "auto" | "on" | "off";
|
|
6
|
+
|
|
7
|
+
export interface VisionConfig {
|
|
8
|
+
provider?: string;
|
|
9
|
+
model?: string;
|
|
10
|
+
enabled: VisionEnabledState;
|
|
11
|
+
defaultReasoning?: ThinkingLevel | "off";
|
|
12
|
+
}
|
|
13
|
+
|
|
14
|
+
export const DEFAULT_CONFIG: VisionConfig = { enabled: "auto" };
|
|
15
|
+
|
|
16
|
+
const ENABLED_STATES = ["auto", "on", "off"] as const;
|
|
17
|
+
const REASONING_LEVELS = ["off", "minimal", "low", "medium", "high", "xhigh"] as const;
|
|
18
|
+
|
|
19
|
+
/**
 * Location of the persisted vision-tools configuration.
 *
 * @param agentDir - Directory that holds the config file.
 * @returns `agentDir` joined with `vision-tools.json`.
 */
export function configPath(agentDir: string): string {
  const fileName = "vision-tools.json";
  return join(agentDir, fileName);
}
|
|
22
|
+
|
|
23
|
+
/**
 * Validate an untrusted value (typically the result of `JSON.parse`) and turn
 * it into a `VisionConfig`.
 *
 * Non-object input yields a copy of `DEFAULT_CONFIG`. Each known field is
 * optional; when present it must be valid. Fields are checked in the order
 * provider, model, enabled, defaultReasoning, and the first invalid one throws.
 *
 * @param raw - Value to validate.
 * @returns A config with `enabled` defaulting to `"auto"`.
 * @throws Error when a present field has the wrong type or an unknown value.
 */
export function parseConfig(raw: unknown): VisionConfig {
  if (typeof raw !== "object" || raw === null) return { ...DEFAULT_CONFIG };
  const { provider, model, enabled, defaultReasoning } = raw as Record<string, unknown>;

  const requireNonEmptyString = (value: unknown, field: string): string => {
    if (typeof value === "string" && value.length > 0) return value;
    throw new Error(`vision-tools config: ${field} must be a non-empty string`);
  };
  const isOneOf = (allowed: readonly string[], value: unknown): boolean =>
    typeof value === "string" && allowed.includes(value);

  const parsed: VisionConfig = { enabled: "auto" };

  if (provider !== undefined) parsed.provider = requireNonEmptyString(provider, "provider");
  if (model !== undefined) parsed.model = requireNonEmptyString(model, "model");

  if (enabled !== undefined) {
    if (!isOneOf(ENABLED_STATES, enabled)) {
      throw new Error(`vision-tools config: enabled must be one of ${ENABLED_STATES.join(", ")}`);
    }
    parsed.enabled = enabled as VisionEnabledState;
  }

  if (defaultReasoning !== undefined) {
    if (!isOneOf(REASONING_LEVELS, defaultReasoning)) {
      throw new Error(`vision-tools config: defaultReasoning must be one of ${REASONING_LEVELS.join(", ")}`);
    }
    parsed.defaultReasoning = defaultReasoning as ThinkingLevel | "off";
  }

  return parsed;
}
|
|
62
|
+
|
|
63
|
+
/**
 * Read and validate the persisted configuration.
 *
 * Never rejects: if the file cannot be read, is not valid JSON, or fails
 * `parseConfig` validation, a fresh copy of `DEFAULT_CONFIG` is returned.
 *
 * @param agentDir - Directory that holds the config file.
 */
export async function loadConfig(agentDir: string): Promise<VisionConfig> {
  try {
    const fileText = await readFile(configPath(agentDir), "utf8");
    const decoded: unknown = JSON.parse(fileText);
    return parseConfig(decoded);
  } catch {
    // Best effort by design: any failure means "use defaults".
    return { ...DEFAULT_CONFIG };
  }
}
|
|
71
|
+
|
|
72
|
+
/**
 * Persist the configuration as JSON.
 *
 * The data is written to a temporary sibling file and then renamed over the
 * target, so the target is only replaced once the new content is fully written.
 * The temporary name includes the process id so that two processes saving at
 * the same time do not write to the same temporary file. If writing or
 * renaming fails, the temporary file is removed and the original error is rethrown.
 *
 * @param agentDir - Directory that holds the config file; created if missing.
 * @param config - Configuration to store.
 * @throws Whatever `mkdir`, `writeFile`, or `rename` rejects with.
 */
export async function saveConfig(agentDir: string, config: VisionConfig): Promise<void> {
  await mkdir(agentDir, { recursive: true });
  const target = configPath(agentDir);
  const tmp = `${target}.${process.pid}.tmp`;
  try {
    await writeFile(tmp, JSON.stringify(config), "utf8");
    await rename(tmp, target);
  } catch (err) {
    // Do not leave a stray temp file behind; cleanup errors must not mask `err`.
    const { rm } = await import("node:fs/promises");
    await rm(tmp, { force: true }).catch(() => undefined);
    throw err;
  }
}
|
package/src/image.ts
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
import { readFile } from "node:fs/promises";
|
|
2
|
+
import { extname } from "node:path";
|
|
3
|
+
|
|
4
|
+
export interface DecodedImage {
|
|
5
|
+
data: Buffer;
|
|
6
|
+
mimeType: string;
|
|
7
|
+
}
|
|
8
|
+
|
|
9
|
+
export interface ImageReadOptions {
|
|
10
|
+
readFile?: (path: string) => Promise<Buffer>;
|
|
11
|
+
}
|
|
12
|
+
|
|
13
|
+
const EXT_TO_MIME: Record<string, string> = {
|
|
14
|
+
".png": "image/png",
|
|
15
|
+
".jpg": "image/jpeg",
|
|
16
|
+
".jpeg": "image/jpeg",
|
|
17
|
+
".gif": "image/gif",
|
|
18
|
+
".webp": "image/webp",
|
|
19
|
+
".bmp": "image/bmp",
|
|
20
|
+
};
|
|
21
|
+
|
|
22
|
+
const SUPPORTED_MIMES = new Set(Object.values(EXT_TO_MIME));
|
|
23
|
+
|
|
24
|
+
const DATA_URL_RE = /^data:([^;]+)?;base64,(.*)$/s;
|
|
25
|
+
|
|
26
|
+
/**
 * Heuristic: does `s` look like a file-system path rather than raw base64?
 *
 * A long string made only of base64 characters is treated as base64 even when
 * it starts with "/". Raw base64 JPEG data always begins with "/9j/", so the
 * leading-slash rule alone would misclassify it. Such a string cannot contain
 * a "." and therefore can never carry one of the supported image extensions,
 * so no usable path is lost by this rule.
 *
 * @param s - Trimmed user input.
 * @returns `true` when `s` should be read from disk.
 */
function looksLikePath(s: string): boolean {
  // Pure base64 payload (>100 chars, optional padding, possibly line-wrapped).
  if (s.length > 100 && /^[A-Za-z0-9+/\s]+={0,2}$/.test(s)) return false;
  if (s.startsWith("/")) return true;
  if (s.startsWith("./") || s.startsWith("../")) return true;
  if (s.startsWith("~")) return true;
  // has a dot-extension and is short enough not to be base64
  const ext = extname(s).toLowerCase();
  if (ext && EXT_TO_MIME[ext] && s.length <= 100) return true;
  return false;
}
|
|
35
|
+
|
|
36
|
+
/**
 * Identify an image format from its leading bytes.
 *
 * @param data - Raw image bytes.
 * @returns The MIME type for PNG, JPEG, GIF, WebP, or BMP signatures, otherwise `undefined`.
 */
function sniffImageMime(data: Buffer): string | undefined {
  if (data.length >= 4 && data[0] === 0x89 && data[1] === 0x50 && data[2] === 0x4e && data[3] === 0x47) {
    return "image/png";
  }
  if (data.length >= 3 && data[0] === 0xff && data[1] === 0xd8 && data[2] === 0xff) return "image/jpeg";
  const head = data.subarray(0, 12).toString("latin1");
  if (head.startsWith("GIF87a") || head.startsWith("GIF89a")) return "image/gif";
  if (head.startsWith("RIFF") && head.slice(8, 12) === "WEBP") return "image/webp";
  if (head.startsWith("BM")) return "image/bmp";
  return undefined;
}

/**
 * Turn the tool's `image_path` argument into raw bytes plus a MIME type.
 *
 * The input is tried in this order:
 * 1. `data:` URL with a supported image MIME type and base64 payload.
 * 2. File path (as judged by `looksLikePath`). The MIME type comes from the
 *    extension, and a leading `~/` is expanded to the user's home directory
 *    before reading, because the file-system API does not expand `~` itself.
 * 3. Raw base64 longer than 100 characters. The MIME type is taken from the
 *    decoded bytes' signature, falling back to `image/png` when unrecognised.
 *
 * @param imagePath - File path, `data:` URL, or raw base64.
 * @param opts - Optional overrides; `readFile` replaces the file reader.
 * @returns The decoded bytes and their MIME type.
 * @throws Error when the input is empty, has an unsupported type or extension,
 *   cannot be read, decodes to zero bytes, or matches none of the three forms.
 */
export async function decodeImage(imagePath: string, opts?: ImageReadOptions): Promise<DecodedImage> {
  const src = imagePath?.trim();
  if (!src) throw new Error("image_path is required");

  // 1. data URL
  const m = DATA_URL_RE.exec(src);
  if (m) {
    const mime = (m[1] || "").toLowerCase();
    if (!SUPPORTED_MIMES.has(mime)) {
      throw new Error(`Unsupported image mime type: ${mime || "(missing)"}`);
    }
    const data = Buffer.from(m[2], "base64");
    if (data.length === 0) throw new Error("Empty image data in data URL");
    return { data, mimeType: mime };
  }

  // 2. file path
  if (looksLikePath(src)) {
    const ext = extname(src).toLowerCase();
    const mime = EXT_TO_MIME[ext];
    if (!mime) throw new Error(`Unsupported image extension: ${ext || "(none)"}`);

    // Expand "~/..." (or "~\..." on Windows); "~user/..." is left untouched.
    let filePath = src;
    if (src.startsWith("~/") || src.startsWith("~\\")) {
      const { homedir } = await import("node:os");
      filePath = `${homedir()}${src.slice(1)}`;
    }

    const read = opts?.readFile ?? readFile;
    let data: Buffer;
    try {
      data = await read(filePath);
    } catch (e) {
      const msg = e instanceof Error ? e.message : String(e);
      throw new Error(`Failed to read image file ${src}: ${msg}`);
    }
    if (data.length === 0) throw new Error(`Empty image file: ${src}`);
    return { data, mimeType: mime };
  }

  // 3. raw base64 (>100 chars)
  if (src.length > 100) {
    const data = Buffer.from(src, "base64");
    if (data.length === 0) throw new Error("Invalid base64 image data");
    // Label the payload by its actual signature; keep the old PNG default otherwise.
    return { data, mimeType: sniffImageMime(data) ?? "image/png" };
  }

  throw new Error("image_path must be a file path, a data: URL, or raw base64 (>100 chars)");
}
|
package/src/reasoning.ts
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import type { ThinkingLevel } from "@earendil-works/pi-ai";
|
|
2
|
+
|
|
3
|
+
/** Reasoning level for a vision call: any pi-ai `ThinkingLevel`, or `"off"`. */
export type VisionReasoning = ThinkingLevel | "off";
|
|
4
|
+
|
|
5
|
+
/** Completion-option fragment describing the requested reasoning effort. */
export interface ReasoningOptions {
  /** Requested effort level; omitted when no reasoning is requested. */
  reasoningEffort?: ThinkingLevel;
}
|
|
8
|
+
|
|
9
|
+
/**
 * Translate a vision reasoning level into completion options.
 *
 * @param level - requested level; `undefined` and `"off"` both mean "no reasoning".
 * @returns an empty object when reasoning is disabled, otherwise `{ reasoningEffort: level }`.
 */
export function reasoningToOptions(level: VisionReasoning | undefined): ReasoningOptions {
  const options: ReasoningOptions = {};
  if (level && level !== "off") {
    options.reasoningEffort = level;
  }
  return options;
}
|
|
13
|
+
|
|
14
|
+
/**
 * Pick the reasoning level that applies to a call.
 *
 * Precedence: the explicit per-call `param`, then `configDefault`, then `"off"`.
 * Only `undefined`/`null` count as "not provided"; an explicit `"off"` is honored.
 *
 * @param param - level supplied for this call, if any.
 * @param configDefault - level taken from configuration, if any.
 * @returns the first provided level, or `"off"` when neither is provided.
 */
export function effectiveReasoning(
  param: VisionReasoning | undefined,
  configDefault: VisionReasoning | undefined,
): VisionReasoning {
  if (param != null) return param;
  if (configDefault != null) return configDefault;
  return "off";
}
|
package/src/state.ts
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import type { Model } from "@earendil-works/pi-ai";
|
|
2
|
+
import type { VisionConfig } from "./config.js";
|
|
3
|
+
|
|
4
|
+
/**
 * Report whether the calling model accepts image input itself.
 *
 * @param model - the currently active model, if any.
 * @returns `true` only when `model` is set and its `input` array contains `"image"`.
 */
// biome-ignore lint/suspicious/noExplicitAny: generic Model type parameter
export function callingModelHasVision(model: Model<any> | undefined): boolean {
  if (!model) return false;
  const modalities = model.input;
  return Array.isArray(modalities) ? modalities.includes("image") : false;
}
|
|
8
|
+
|
|
9
|
+
/**
 * Decide whether the vision tool should be active.
 *
 * `config.enabled` of `"on"` or `"off"` forces the answer; any other value defers to the
 * calling model, enabling the tool only when that model lacks image input.
 *
 * @param config - vision configuration carrying the `enabled` setting.
 * @param model - the currently active model, if any.
 * @returns `true` when the tool should be enabled.
 */
// biome-ignore lint/suspicious/noExplicitAny: generic Model type parameter
export function effectiveEnabled(config: VisionConfig, model: Model<any> | undefined): boolean {
  switch (config.enabled) {
    case "on":
      return true;
    case "off":
      return false;
    default:
      return !callingModelHasVision(model);
  }
}
|
|
15
|
+
|
|
16
|
+
/**
 * Build the footer indicator text for the configured vision model.
 *
 * @param config - vision configuration supplying `provider` and `model`.
 * @param enabled - whether the tool is currently active.
 * @returns the `provider/model` label prefixed with the eye glyph, or `undefined` when the
 *   tool is disabled or either `provider` or `model` is not set.
 */
export function footerLabel(config: VisionConfig, enabled: boolean): string | undefined {
  if (enabled && config.provider && config.model) {
    return `👁 ${config.provider}/${config.model}`;
  }
  return undefined;
}
|
package/src/vision.ts
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
import type { AssistantMessage, Context, Model, UserMessage } from "@earendil-works/pi-ai";
|
|
2
|
+
import type { VisionConfig } from "./config.js";
|
|
3
|
+
import type { DecodedImage } from "./image.js";
|
|
4
|
+
import type { ReasoningOptions } from "./reasoning.js";
|
|
5
|
+
|
|
6
|
+
/**
 * Signature of the completion function injected into `callVision`.
 *
 * It receives the target model, the conversation context and an optional bag of
 * provider options, and resolves to the assistant's reply.
 */
export type CompleteFn = (
  model: Model<any>,
  context: Context,
  options?: Record<string, unknown>,
) => Promise<AssistantMessage>;
|
|
11
|
+
|
|
12
|
+
/** Everything `callVision` needs to issue one request to the vision model. */
export interface VisionCallInput {
  /** Model handed to the completion function. */
  model: Model<any>;
  /** Credentials forwarded as the `apiKey` and `headers` completion options. */
  auth: { apiKey?: string; headers?: Record<string, string> };
  /** Text part of the user message. */
  prompt: string;
  /** Images attached to the user message; each is sent base64-encoded with its MIME type. */
  images: DecodedImage[];
  /** Reasoning options merged into the completion options. */
  reasoning: ReasoningOptions;
  /** Optional abort signal, forwarded as the `signal` option when present. */
  signal?: AbortSignal;
}
|
|
20
|
+
|
|
21
|
+
/** Outcome of `callVision`; failures are reported through `errorMessage` rather than thrown. */
export interface VisionCallResult {
  /** Text parts of the reply joined with newlines; empty when the call threw. */
  text: string;
  /** `input`/`output` usage figures copied from the reply, when it reported them. */
  usage?: { input?: number; output?: number };
  /** Error reported by the reply, or the message of an error thrown while completing. */
  errorMessage?: string;
  /** Stop reason reported by the reply. */
  stopReason?: string;
}
|
|
27
|
+
|
|
28
|
+
/** Result of `resolveVisionModel`: the resolved model on success, or an error message. */
export type ResolveResult =
  | { ok: true; model: Model<any> }
  | { ok: false; error: string };
|
|
29
|
+
|
|
30
|
+
/** Minimal registry surface used by `resolveVisionModel`. */
interface ModelLookup {
  /** Look up a model by provider name and model id; `undefined` when there is no match. */
  find(provider: string, id: string): Model<any> | undefined;
}
|
|
33
|
+
|
|
34
|
+
/**
 * Resolve the configured vision model from the registry.
 *
 * @param registry - lookup used to find the model by provider and id.
 * @param config - vision configuration supplying `provider` and `model`.
 * @returns `{ ok: true, model }` when the model exists and lists `"image"` in its `input`;
 *   otherwise `{ ok: false, error }` explaining whether the model is unconfigured,
 *   unknown to the registry, or lacks image input.
 */
export function resolveVisionModel(registry: ModelLookup, config: VisionConfig): ResolveResult {
  const { provider, model: modelId } = config;
  if (!provider || !modelId) {
    const error = "Vision model not configured. Run: /vision config provider <p> ; /vision config model <m>";
    return { ok: false, error };
  }

  const found = registry.find(provider, modelId);
  const qualifiedName = `${provider}/${modelId}`;
  if (!found) {
    return { ok: false, error: `Vision model not found: ${qualifiedName}` };
  }

  const acceptsImages = Array.isArray(found.input) && found.input.includes("image");
  if (!acceptsImages) {
    return { ok: false, error: `Model ${qualifiedName} does not support image input` };
  }

  return { ok: true, model: found };
}
|
|
50
|
+
|
|
51
|
+
/**
 * Send one prompt plus images to the vision model and collect its text reply.
 *
 * A single user message is built (the prompt as a text part followed by one base64 image
 * part per entry in `input.images`) and passed to `completeFn` together with options made
 * of the auth fields, the reasoning options and, when present, the abort signal.
 *
 * @param input - model, credentials, prompt, images, reasoning options and optional signal.
 * @param completeFn - completion function that performs the actual model call.
 * @returns the joined text parts of the reply with its usage, stop reason and error message.
 *   If completing or reading the reply throws, resolves to `{ text: "", errorMessage }`
 *   instead of rejecting.
 */
export async function callVision(input: VisionCallInput, completeFn: CompleteFn): Promise<VisionCallResult> {
  const imageParts = input.images.map((image) => ({
    type: "image" as const,
    data: image.data.toString("base64"),
    mimeType: image.mimeType,
  }));
  const request: UserMessage = {
    role: "user",
    content: [{ type: "text", text: input.prompt }, ...imageParts],
    timestamp: Date.now(),
  };
  const context: Context = { messages: [request] };

  const { apiKey, headers } = input.auth;
  const options: Record<string, unknown> = { apiKey, headers, ...input.reasoning };
  if (input.signal) {
    options.signal = input.signal;
  }

  try {
    const reply = await completeFn(input.model, context, options);

    // Keep only the text parts of the reply, in order.
    const pieces = reply.content.reduce<string[]>((acc, part) => {
      if (part.type === "text") acc.push(part.text);
      return acc;
    }, []);

    const usage = { input: reply.usage?.input, output: reply.usage?.output };
    return {
      text: pieces.join("\n"),
      usage,
      stopReason: reply.stopReason,
      errorMessage: reply.errorMessage,
    };
  } catch (e) {
    // Failures are surfaced as data so the caller can report them.
    return { text: "", errorMessage: e instanceof Error ? e.message : String(e) };
  }
}
|