pi-glm-image-summary 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +54 -0
- package/package.json +32 -0
- package/src/index.ts +310 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 kaofelix
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# pi-glm-image-summary
|
|
2
|
+
|
|
3
|
+
A [pi](https://github.com/badlogic/pi-mono) extension that intercepts image reads when using glm-4.7 and sends them to glm-4.6v for detailed analysis.
|
|
4
|
+
|
|
5
|
+
## Why?
|
|
6
|
+
|
|
7
|
+
GLM-4.7 is a powerful text model but has limited vision capabilities. GLM-4.6v, on the other hand, has stronger vision support. This extension automatically detects when you're using glm-4.7 and intercepts image reads, sending them to glm-4.6v for comprehensive analysis.
|
|
8
|
+
|
|
9
|
+
## Features
|
|
10
|
+
|
|
11
|
+
- **Automatic image interception**: When using glm-4.7/glm-4.7-long, image file reads are automatically redirected to glm-4.6v for analysis
|
|
12
|
+
- **Comprehensive analysis**: Extracts text content, visual elements, technical details, and more
|
|
13
|
+
- **Manual analysis command**: `/analyze-image <path>` to manually analyze any image
|
|
14
|
+
|
|
15
|
+
## Installation
|
|
16
|
+
|
|
17
|
+
This guide assumes the extension is installed in `~/.pi/agent/extensions/pi-glm-image-summary/`.
|
|
18
|
+
|
|
19
|
+
## Usage
|
|
20
|
+
|
|
21
|
+
Load the extension when starting pi:
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
pi -e ~/.pi/agent/extensions/pi-glm-image-summary --provider zai --model glm-4.7
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
Or add it to your pi config for automatic loading.
|
|
28
|
+
|
|
29
|
+
### Automatic Mode
|
|
30
|
+
|
|
31
|
+
The extension checks two conditions:
|
|
32
|
+
1. Current model is `glm-4.7` or `glm-4.7-long`
|
|
33
|
+
2. A file being read is an image (jpg, jpeg, png, gif, webp)
|
|
34
|
+
|
|
35
|
+
When both conditions hold, it automatically spawns a `pi` subprocess with glm-4.6v to analyze the image and returns a detailed summary.
|
|
36
|
+
|
|
37
|
+
### Manual Analysis
|
|
38
|
+
|
|
39
|
+
Use the `/analyze-image` command to analyze any image:
|
|
40
|
+
|
|
41
|
+
```
|
|
42
|
+
/analyze-image ./screenshot.png
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## Supported Image Formats
|
|
46
|
+
|
|
47
|
+
- JPEG (.jpg, .jpeg)
|
|
48
|
+
- PNG (.png)
|
|
49
|
+
- GIF (.gif)
|
|
50
|
+
- WebP (.webp)
|
|
51
|
+
|
|
52
|
+
## Configuration
|
|
53
|
+
|
|
54
|
+
The extension uses the ZAI provider for the vision model. Make sure your ZAI API credentials are configured before using it.
|
package/package.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "pi-glm-image-summary",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"type": "module",
|
|
5
|
+
"description": "Pi extension that intercepts image reads when using glm-4.7 and sends them to glm-4.6v for detailed analysis",
|
|
6
|
+
"keywords": [
|
|
7
|
+
"pi-package",
|
|
8
|
+
"glm",
|
|
9
|
+
"image",
|
|
10
|
+
"vision",
|
|
11
|
+
"ai"
|
|
12
|
+
],
|
|
13
|
+
"author": "kaofelix",
|
|
14
|
+
"license": "MIT",
|
|
15
|
+
"repository": {
|
|
16
|
+
"type": "git",
|
|
17
|
+
"url": "git+https://github.com/kaofelix/pi-glm-image-summary.git"
|
|
18
|
+
},
|
|
19
|
+
"files": [
|
|
20
|
+
"src"
|
|
21
|
+
],
|
|
22
|
+
"scripts": {
|
|
23
|
+
"clean": "echo 'nothing to clean'",
|
|
24
|
+
"build": "echo 'nothing to build'",
|
|
25
|
+
"check": "echo 'nothing to check'"
|
|
26
|
+
},
|
|
27
|
+
"pi": {
|
|
28
|
+
"extensions": [
|
|
29
|
+
"./src/index.ts"
|
|
30
|
+
]
|
|
31
|
+
}
|
|
32
|
+
}
|
package/src/index.ts
ADDED
|
@@ -0,0 +1,310 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* GLM Image Summary Extension
|
|
3
|
+
*
|
|
4
|
+
* When using glm-4.7, this extension intercepts image reads and sends them
|
|
5
|
+
* to glm-4.6v for detailed analysis using a subprocess. This provides better
|
|
6
|
+
* image understanding since glm-4.6v has stronger vision capabilities.
|
|
7
|
+
*
|
|
8
|
+
* Usage:
|
|
9
|
+
* pi -e ~/.pi/agent/extensions/pi-glm-image-summary --provider zai --model glm-4.7
|
|
10
|
+
*
|
|
11
|
+
* The extension will:
|
|
12
|
+
* 1. Detect when glm-4.7 is the current model
|
|
13
|
+
* 2. Check if the file being read is an image
|
|
14
|
+
* 3. Call pi subprocess with glm-4.6v to analyze the image
|
|
15
|
+
* 4. Return the summary text to glm-4.7
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
import { spawn } from "node:child_process";
|
|
19
|
+
import type { TextContent } from "@mariozechner/pi-ai";
|
|
20
|
+
import type { ExtensionAPI } from "@mariozechner/pi-coding-agent";
|
|
21
|
+
import {
|
|
22
|
+
BorderedLoader,
|
|
23
|
+
createReadTool,
|
|
24
|
+
type ReadOperations,
|
|
25
|
+
type ReadToolDetails,
|
|
26
|
+
} from "@mariozechner/pi-coding-agent";
|
|
27
|
+
import { constants } from "fs";
|
|
28
|
+
import { access as fsAccess, readFile as fsReadFile } from "fs/promises";
|
|
29
|
+
import { resolve } from "path";
|
|
30
|
+
|
|
31
|
+
/**
 * Instruction text handed to the vision-model subprocess (as the argument that
 * follows `-p`) for every image that is analyzed.
 *
 * The literal must contain only the prompt itself: an introductory sentence, a
 * blank line, the seven numbered extraction categories, a blank line, and the
 * closing formatting instruction.
 */
const SUMMARY_PROMPT = `Please analyze this image comprehensively. Extract ALL information from the image including:

1. **Overall Description**: What type of content is this? (screenshot, diagram, document, photograph, UI, code, etc.)
2. **Text Content**: ALL visible text in the image, preserving structure and formatting. Include labels, buttons, error messages, file paths, code snippets, etc. Be exhaustive.
3. **Visual Elements**: Colors, layout, components, icons, graphical elements
4. **Technical Details**: For code, UI, diagrams - include exact values, class names, IDs, parameters, configurations
5. **Contextual Information**: Window titles, terminal prompts, file names, timestamps, status indicators
6. **Structure**: How elements are organized, relationships between components
7. **Actionable Information**: Any visible commands, settings, configurations, or parameters that could be useful

Format your response clearly with sections and bullet points. Be extremely thorough - the user needs to understand everything visible in this image to perform their task.`;
|
|
42
|
+
|
|
43
|
+
/** Model id of the vision model that performs the actual image analysis. */
const VISION_MODEL_ID = "glm-4.6v";

/** Lower-case file extensions (without the dot) that are treated as images. */
const IMAGE_EXTENSIONS = ["jpg", "jpeg", "png", "gif", "webp"];

/** Result of one `/analyze-image` run, as reported back from the loader UI. */
type AnalysisOutcome =
  | { kind: "done"; text: string }
  | { kind: "cancelled" }
  | { kind: "failed"; reason: string };

/**
 * Raised when the `pi` subprocess exits with a non-zero (or signal-terminated,
 * i.e. `null`) exit code. Keeps the captured stderr available separately so
 * callers can show it without the "pi subprocess failed" prefix.
 */
class VisionSubprocessError extends Error {
  constructor(
    readonly exitCode: number | null,
    readonly stderr: string,
  ) {
    super(`pi subprocess failed (${exitCode}): ${stderr}`);
    this.name = "VisionSubprocessError";
  }
}

/** Narrows an unknown value to a non-null object whose properties can be inspected. */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null;
}

/** Returns a printable message for anything that can land in a `catch` clause. */
function describeError(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Derives an image MIME type purely from the text after the last "." of the path.
 *
 * @param filePath Path of the file being read.
 * @returns `image/<ext>` (with `jpg` mapped to `jpeg`) for a supported
 *   extension, otherwise `null`.
 */
function imageMimeTypeFor(filePath: string): string | null {
  const extension = filePath.split(".").pop()?.toLowerCase();
  if (!extension || !IMAGE_EXTENSIONS.includes(extension)) {
    return null;
  }
  return `image/${extension === "jpg" ? "jpeg" : extension}`;
}

/**
 * Pulls the summary text out of the subprocess output.
 *
 * If the output is a JSON object with a `messages` array, the text blocks of
 * the last message whose role is "assistant" are joined with newlines (a
 * non-empty plain-string `content` is returned as-is). In every other case -
 * not JSON, no messages, no assistant message, empty content - the trimmed raw
 * output is returned unchanged.
 *
 * @param rawOutput Everything the subprocess wrote to stdout.
 * @returns The extracted summary, or the trimmed raw output as a fallback.
 */
function extractSummaryText(rawOutput: string): string {
  const trimmed = rawOutput.trim();

  let parsed: unknown;
  try {
    parsed = JSON.parse(trimmed);
  } catch {
    // Not JSON, use as-is.
    return trimmed;
  }

  if (!isRecord(parsed) || !Array.isArray(parsed.messages)) {
    return trimmed;
  }

  const messages: unknown[] = parsed.messages;
  for (let index = messages.length - 1; index >= 0; index--) {
    const message = messages[index];
    if (!isRecord(message) || message.role !== "assistant") {
      continue;
    }

    // Only the last assistant message is considered.
    const content = message.content;
    if (typeof content === "string" && content !== "") {
      return content;
    }
    if (Array.isArray(content)) {
      const blocks: unknown[] = content;
      const texts: string[] = [];
      for (const block of blocks) {
        if (isRecord(block) && block.type === "text" && typeof block.text === "string") {
          texts.push(block.text);
        }
      }
      return texts.join("\n");
    }
    return trimmed;
  }

  return trimmed;
}

/**
 * Runs `pi` as a subprocess against the vision model and resolves with the
 * extracted summary text.
 *
 * @param imagePath Absolute path of the image; passed as an `@`-prefixed argument.
 * @param signal Optional abort signal. If it is already aborted nothing is
 *   spawned; if it fires later the child is killed.
 * @returns The summary text extracted from the subprocess stdout.
 * @throws Error("Operation aborted") when `signal` is or becomes aborted.
 * @throws VisionSubprocessError when the subprocess exits with a code other than 0.
 * @throws The spawn error when the subprocess could not be started.
 */
function runVisionAnalysis(imagePath: string, signal?: AbortSignal): Promise<string> {
  return new Promise<string>((resolveSummary, rejectAnalysis) => {
    // A listener added to an already-aborted signal never fires, so this case
    // has to be checked before any work is started.
    if (signal?.aborted) {
      rejectAnalysis(new Error("Operation aborted"));
      return;
    }

    const child = spawn(
      "pi",
      [`@${imagePath}`, "--provider", "zai", "--model", VISION_MODEL_ID, "-p", SUMMARY_PROMPT, "--json"],
      {
        stdio: ["ignore", "pipe", "pipe"],
        env: process.env,
      },
    );

    // Decode at the stream level so multi-byte characters that straddle two
    // chunks are not corrupted by per-chunk toString() calls.
    child.stdout.setEncoding("utf8");
    child.stderr.setEncoding("utf8");

    let stdout = "";
    let stderr = "";
    let settled = false;

    // "error", "close" and "abort" can all arrive for one run; only the first
    // one decides the outcome.
    function settle(finish: () => void): void {
      if (settled) {
        return;
      }
      settled = true;
      signal?.removeEventListener("abort", onAbort);
      finish();
    }

    function onAbort(): void {
      child.kill();
      settle(() => rejectAnalysis(new Error("Operation aborted")));
    }

    child.stdout.on("data", (chunk: string) => {
      stdout += chunk;
    });
    child.stderr.on("data", (chunk: string) => {
      stderr += chunk;
    });

    child.on("error", (err) => {
      settle(() => rejectAnalysis(err));
    });

    child.on("close", (code) => {
      settle(() => {
        if (code !== 0) {
          rejectAnalysis(new VisionSubprocessError(code, stderr));
        } else {
          resolveSummary(extractSummaryText(stdout));
        }
      });
    });

    signal?.addEventListener("abort", onAbort, { once: true });
  });
}

/**
 * Extension entry point.
 *
 * Registers a replacement for the read tool that, while the current model is
 * glm-4.7 / glm-4.7-long, sends image files to a `pi` subprocess running
 * glm-4.6v and returns the resulting text summary instead of the image. All
 * other reads are delegated to the standard read tool. Also registers the
 * `/analyze-image <path>` command, which runs the same analysis on demand and
 * shows the result in an editor.
 *
 * @param pi Extension API used to register the tool and the command.
 */
export default function (pi: ExtensionAPI) {
  const localCwd = process.cwd();
  const localRead = createReadTool(localCwd);

  // Custom read operations that detect images
  const readOps: ReadOperations = {
    readFile: (path) => fsReadFile(path),
    access: (path) => fsAccess(path, constants.R_OK),
    detectImageMimeType: async (absolutePath: string) => imageMimeTypeFor(absolutePath),
  };

  // Override the read tool
  pi.registerTool({
    ...localRead,
    async execute(toolCallId, params, signal, onUpdate, ctx) {
      const { path } = params;
      const absolutePath = resolve(ctx.cwd, path);

      // Only image reads made while glm-4.7 is the current model are intercepted.
      const modelId = ctx.model?.id;
      const isGlm47 = modelId === "glm-4.7" || modelId === "glm-4.7-long";
      const mimeType = isGlm47 ? await readOps.detectImageMimeType?.(absolutePath) : null;
      if (!mimeType) {
        // Different model or not an image: use the standard read.
        return localRead.execute(toolCallId, params, signal, onUpdate);
      }

      onUpdate?.({
        content: [{ type: "text", text: `[Analyzing image with ${VISION_MODEL_ID}...]` }],
      });

      try {
        const summaryText = await runVisionAnalysis(absolutePath, signal);

        const readResult = {
          content: [
            {
              type: "text",
              text: `[Image analyzed with ${VISION_MODEL_ID}]\n\n${summaryText}`,
            } as TextContent,
          ],
          details: { summaryModel: VISION_MODEL_ID } as ReadToolDetails,
        };

        onUpdate?.(readResult);
        return readResult;
      } catch (error: unknown) {
        // Throw an error so it shows as red in the UI
        const errorMsg = `Image analysis failed with ${VISION_MODEL_ID}: ${describeError(error)}. The image may not be supported (e.g., animated GIFs) or there was a connection issue.`;
        // Mark as a tool error for better handling
        throw Object.assign(new Error(errorMsg), { isToolError: true });
      }
    },
  });

  // Add a command to manually trigger image analysis
  pi.registerCommand("analyze-image", {
    description: "Analyze an image file using glm-4.6v",
    handler: async (args, ctx) => {
      if (!ctx.hasUI) {
        ctx.ui.notify("analyze-image requires interactive mode", "error");
        return;
      }

      const imagePath = args.trim();
      if (!imagePath) {
        ctx.ui.notify("Usage: /analyze-image <path-to-image>", "error");
        return;
      }

      const absolutePath = resolve(ctx.cwd, imagePath);

      // Check if file is an image
      const mimeType = await readOps.detectImageMimeType?.(absolutePath);
      if (!mimeType) {
        ctx.ui.notify("Not a supported image file", "error");
        return;
      }

      // Show a loader while the subprocess runs; the loader's signal cancels it.
      const outcome = await ctx.ui.custom<AnalysisOutcome>((tui, theme, _kb, done) => {
        const loader = new BorderedLoader(tui, theme, `Analyzing ${imagePath}...`);

        // Cancel and subprocess completion can both fire; report only the first.
        let reported = false;
        const report = (result: AnalysisOutcome): void => {
          if (reported) {
            return;
          }
          reported = true;
          done(result);
        };

        loader.onAbort = () => report({ kind: "cancelled" });

        runVisionAnalysis(absolutePath, loader.signal).then(
          (text) => report({ kind: "done", text }),
          (error: unknown) => {
            if (loader.signal.aborted) {
              // The child was killed because the user cancelled; not a failure.
              report({ kind: "cancelled" });
              return;
            }
            const detail = error instanceof VisionSubprocessError ? error.stderr : error;
            console.error("Image analysis failed:", detail);
            report({
              kind: "failed",
              reason: error instanceof VisionSubprocessError ? error.stderr : describeError(error),
            });
          },
        );

        return loader;
      });

      switch (outcome.kind) {
        case "cancelled":
          ctx.ui.notify("Cancelled", "info");
          return;
        case "failed":
          ctx.ui.notify(`Analysis failed: ${outcome.reason}`, "error");
          return;
        case "done":
          // Show the analysis
          await ctx.ui.editor("Image Analysis", outcome.text);
          return;
      }
    },
  });
}
|