pi-glm-image-summary 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +54 -0
  3. package/package.json +32 -0
  4. package/src/index.ts +310 -0
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 kaofelix
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,54 @@
1
+ # pi-glm-image-summary
2
+
3
+ A [pi](https://github.com/badlogic/pi-mono) extension that intercepts image reads when using glm-4.7 and sends them to glm-4.6v for detailed analysis.
4
+
5
+ ## Why?
6
+
7
+ GLM-4.7 is a powerful text model but has limited vision capabilities. GLM-4.6v, on the other hand, has stronger vision support. This extension automatically detects when you're using glm-4.7 and intercepts image reads, sending them to glm-4.6v for comprehensive analysis.
8
+
9
+ ## Features
10
+
11
+ - **Automatic image interception**: When using glm-4.7/glm-4.7-long, image file reads are automatically redirected to glm-4.6v for analysis
12
+ - **Comprehensive analysis**: Extracts text content, visual elements, technical details, and more
13
+ - **Manual analysis command**: `/analyze-image <path>` to manually analyze any image
14
+
15
+ ## Installation
16
+
17
+ Install the extension into `~/.pi/agent/extensions/pi-glm-image-summary/`.
18
+
19
+ ## Usage
20
+
21
+ Load the extension when starting pi:
22
+
23
+ ```bash
24
+ pi -e ~/.pi/agent/extensions/pi-glm-image-summary --provider zai --model glm-4.7
25
+ ```
26
+
27
+ Or add it to your pi config for automatic loading.
28
+
29
+ ### Automatic Mode
30
+
31
+ When the extension detects:
32
+ 1. Current model is `glm-4.7` or `glm-4.7-long`
33
+ 2. A file being read is an image (jpg, jpeg, png, gif, webp)
34
+
35
+ It will automatically spawn a subprocess with glm-4.6v to analyze the image and return a detailed summary.
36
+
37
+ ### Manual Analysis
38
+
39
+ Use the `/analyze-image` command to analyze any image:
40
+
41
+ ```
42
+ /analyze-image ./screenshot.png
43
+ ```
44
+
45
+ ## Supported Image Formats
46
+
47
+ - JPEG (.jpg, .jpeg)
48
+ - PNG (.png)
49
+ - GIF (.gif)
50
+ - WebP (.webp)
51
+
52
+ ## Configuration
53
+
54
+ The extension uses the ZAI provider for the vision model. Make sure you have proper API credentials configured.
package/package.json ADDED
@@ -0,0 +1,32 @@
1
+ {
2
+ "name": "pi-glm-image-summary",
3
+ "version": "0.1.0",
4
+ "type": "module",
5
+ "description": "Pi extension that intercepts image reads when using glm-4.7 and sends them to glm-4.6v for detailed analysis",
6
+ "keywords": [
7
+ "pi-package",
8
+ "glm",
9
+ "image",
10
+ "vision",
11
+ "ai"
12
+ ],
13
+ "author": "kaofelix",
14
+ "license": "MIT",
15
+ "repository": {
16
+ "type": "git",
17
+ "url": "git+https://github.com/kaofelix/pi-glm-image-summary.git"
18
+ },
19
+ "files": [
20
+ "src"
21
+ ],
22
+ "scripts": {
23
+ "clean": "echo 'nothing to clean'",
24
+ "build": "echo 'nothing to build'",
25
+ "check": "echo 'nothing to check'"
26
+ },
27
+ "pi": {
28
+ "extensions": [
29
+ "./src/index.ts"
30
+ ]
31
+ }
32
+ }
package/src/index.ts ADDED
@@ -0,0 +1,310 @@
1
+ /**
2
+ * GLM Image Summary Extension
3
+ *
4
+ * When using glm-4.7, this extension intercepts image reads and sends them
5
+ * to glm-4.6v for detailed analysis using a subprocess. This provides better
6
+ * image understanding since glm-4.6v has stronger vision capabilities.
7
+ *
8
+ * Usage:
9
+ * pi -e ~/.pi/agent/extensions/pi-glm-image-summary --provider zai --model glm-4.7
10
+ *
11
+ * The extension will:
12
+ * 1. Detect when glm-4.7 is the current model
13
+ * 2. Check if the file being read is an image
14
+ * 3. Call pi subprocess with glm-4.6v to analyze the image
15
+ * 4. Return the summary text to glm-4.7
16
+ */
17
+
18
+ import { spawn } from "node:child_process";
19
+ import type { TextContent } from "@mariozechner/pi-ai";
20
+ import type { ExtensionAPI } from "@mariozechner/pi-coding-agent";
21
+ import {
22
+ BorderedLoader,
23
+ createReadTool,
24
+ type ReadOperations,
25
+ type ReadToolDetails,
26
+ } from "@mariozechner/pi-coding-agent";
27
+ import { constants } from "fs";
28
+ import { access as fsAccess, readFile as fsReadFile } from "fs/promises";
29
+ import { resolve } from "path";
30
+
31
+ const SUMMARY_PROMPT = `Please analyze this image comprehensively. Extract ALL information from the image including:
32
+
33
+ 1. **Overall Description**: What type of content is this? (screenshot, diagram, document, photograph, UI, code, etc.)
34
+ 2. **Text Content**: ALL visible text in the image, preserving structure and formatting. Include labels, buttons, error messages, file paths, code snippets, etc. Be exhaustive.
35
+ 3. **Visual Elements**: Colors, layout, components, icons, graphical elements
36
+ 4. **Technical Details**: For code, UI, diagrams - include exact values, class names, IDs, parameters, configurations
37
+ 5. **Contextual Information**: Window titles, terminal prompts, file names, timestamps, status indicators
38
+ 6. **Structure**: How elements are organized, relationships between components
39
+ 7. **Actionable Information**: Any visible commands, settings, configurations, or parameters that could be useful
40
+
41
+ Format your response clearly with sections and bullet points. Be extremely thorough - the user needs to understand everything visible in this image to perform their task.`;
42
+
43
+ export default function (pi: ExtensionAPI) {
44
+ const localCwd = process.cwd();
45
+ const localRead = createReadTool(localCwd);
46
+
47
+ // Custom read operations that detect images
48
+ const readOps: ReadOperations = {
49
+ readFile: (path) => fsReadFile(path),
50
+ access: (path) => fsAccess(path, constants.R_OK),
51
+ detectImageMimeType: async (absolutePath: string) => {
52
+ // Simple MIME type detection
53
+ const ext = absolutePath.split(".").pop()?.toLowerCase();
54
+ const supported = ["jpg", "jpeg", "png", "gif", "webp"];
55
+ if (ext && supported.includes(ext)) {
56
+ return `image/${ext === "jpg" ? "jpeg" : ext}`;
57
+ }
58
+ return null;
59
+ },
60
+ };
61
+
62
+ // Override the read tool
63
+ pi.registerTool({
64
+ ...localRead,
65
+ async execute(toolCallId, params, signal, onUpdate, ctx) {
66
+ const { path } = params;
67
+ const absolutePath = resolve(ctx.cwd, path);
68
+
69
+ // Check if current model is glm-4.7
70
+ const currentModel = ctx.model;
71
+ const isGlm4_7 = currentModel?.id === "glm-4.7" || currentModel?.id === "glm-4.7-long";
72
+
73
+ // If not glm-4.7, use standard read
74
+ if (!isGlm4_7) {
75
+ return localRead.execute(toolCallId, params, signal, onUpdate);
76
+ }
77
+
78
+ // Check if file is an image
79
+ const mimeType = await readOps.detectImageMimeType?.(absolutePath);
80
+ if (!mimeType) {
81
+ // Not an image, use standard read
82
+ return localRead.execute(toolCallId, params, signal, onUpdate);
83
+ }
84
+
85
+ // Call pi subprocess with glm-4.6v to analyze the image
86
+ onUpdate?.({
87
+ content: [{ type: "text", text: `[Analyzing image with glm-4.6v...]` }],
88
+ });
89
+
90
+ try {
91
+ const result = await new Promise<{ text: string }>((resolveResult, reject) => {
92
+ // Use @ prefix to indicate image attachment, and absolute path
93
+ const args = [
94
+ `@${absolutePath}`,
95
+ "--provider",
96
+ "zai",
97
+ "--model",
98
+ "glm-4.6v",
99
+ "-p",
100
+ SUMMARY_PROMPT,
101
+ "--json", // Get structured output
102
+ ];
103
+
104
+ const child = spawn("pi", args, {
105
+ stdio: ["ignore", "pipe", "pipe"],
106
+ env: process.env,
107
+ });
108
+
109
+ let stdout = "";
110
+ let stderr = "";
111
+
112
+ child.stdout.on("data", (data) => {
113
+ stdout += data.toString();
114
+ });
115
+
116
+ child.stderr.on("data", (data) => {
117
+ stderr += data.toString();
118
+ });
119
+
120
+ child.on("error", (err) => {
121
+ reject(err);
122
+ });
123
+
124
+ child.on("close", (code) => {
125
+ if (code !== 0) {
126
+ reject(new Error(`pi subprocess failed (${code}): ${stderr}`));
127
+ } else {
128
+ resolveResult({ text: stdout.trim() });
129
+ }
130
+ });
131
+
132
+ // Handle abort signal
133
+ if (signal) {
134
+ const onAbort = () => {
135
+ child.kill();
136
+ reject(new Error("Operation aborted"));
137
+ };
138
+ signal.addEventListener("abort", onAbort, { once: true });
139
+ child.on("close", () => {
140
+ signal.removeEventListener("abort", onAbort);
141
+ });
142
+ }
143
+ });
144
+
145
+ if (signal?.aborted) {
146
+ throw new Error("Operation aborted");
147
+ }
148
+
149
+ // Parse the result
150
+ let summaryText: string;
151
+ try {
152
+ // Try to parse as JSON first
153
+ const json = JSON.parse(result.text);
154
+ // Extract message content from the response
155
+ if (json.messages && Array.isArray(json.messages)) {
156
+ // Get the last assistant message
157
+ const assistantMsg = json.messages.findLast((m: any) => m.role === "assistant");
158
+ if (assistantMsg?.content) {
159
+ summaryText = assistantMsg.content
160
+ .filter((c: any) => c.type === "text")
161
+ .map((c: any) => c.text)
162
+ .join("\n");
163
+ } else {
164
+ summaryText = result.text;
165
+ }
166
+ } else {
167
+ summaryText = result.text;
168
+ }
169
+ } catch {
170
+ // Not JSON, use as-is
171
+ summaryText = result.text;
172
+ }
173
+
174
+ const readResult = {
175
+ content: [
176
+ {
177
+ type: "text",
178
+ text: `[Image analyzed with glm-4.6v]\n\n${summaryText}`,
179
+ } as TextContent,
180
+ ],
181
+ details: { summaryModel: "glm-4.6v" } as ReadToolDetails,
182
+ };
183
+
184
+ onUpdate?.(readResult);
185
+ return readResult;
186
+ } catch (error: any) {
187
+ // Throw an error so it shows as red in the UI
188
+ const errorMsg = `Image analysis failed with glm-4.6v: ${error.message}. The image may not be supported (e.g., animated GIFs) or there was a connection issue.`;
189
+ const err = new Error(errorMsg);
190
+ (err as any).isToolError = true; // Mark as a tool error for better handling
191
+ throw err;
192
+ }
193
+ },
194
+ });
195
+
196
+ // Add a command to manually trigger image analysis
197
+ pi.registerCommand("analyze-image", {
198
+ description: "Analyze an image file using glm-4.6v",
199
+ handler: async (args, ctx) => {
200
+ if (!ctx.hasUI) {
201
+ ctx.ui.notify("analyze-image requires interactive mode", "error");
202
+ return;
203
+ }
204
+
205
+ const imagePath = args.trim();
206
+ if (!imagePath) {
207
+ ctx.ui.notify("Usage: /analyze-image <path-to-image>", "error");
208
+ return;
209
+ }
210
+
211
+ const absolutePath = resolve(ctx.cwd, imagePath);
212
+
213
+ // Check if file is an image
214
+ const mimeType = await readOps.detectImageMimeType?.(absolutePath);
215
+ if (!mimeType) {
216
+ ctx.ui.notify("Not a supported image file", "error");
217
+ return;
218
+ }
219
+
220
+ // Call pi subprocess with glm-4.6v to analyze the image
221
+ const result = await ctx.ui.custom<string | null>((tui, theme, _kb, done) => {
222
+ const loader = new BorderedLoader(tui, theme, `Analyzing ${imagePath}...`);
223
+ loader.onAbort = () => done(null);
224
+
225
+ // Use @ prefix to indicate image attachment, and absolute path
226
+ const args = [
227
+ `@${absolutePath}`,
228
+ "--provider",
229
+ "zai",
230
+ "--model",
231
+ "glm-4.6v",
232
+ "-p",
233
+ SUMMARY_PROMPT,
234
+ "--json",
235
+ ];
236
+
237
+ const child = spawn("pi", args, {
238
+ stdio: ["ignore", "pipe", "pipe"],
239
+ env: process.env,
240
+ });
241
+
242
+ let stdout = "";
243
+ let stderr = "";
244
+
245
+ child.stdout.on("data", (data) => {
246
+ stdout += data.toString();
247
+ });
248
+
249
+ child.stderr.on("data", (data) => {
250
+ stderr += data.toString();
251
+ });
252
+
253
+ child.on("error", (err) => {
254
+ console.error("Image analysis failed:", err);
255
+ ctx.ui.notify(`Analysis failed: ${err.message}`, "error");
256
+ done(null);
257
+ });
258
+
259
+ child.on("close", (code) => {
260
+ if (code !== 0) {
261
+ console.error("Image analysis failed:", stderr);
262
+ ctx.ui.notify(`Analysis failed: ${stderr}`, "error");
263
+ done(null);
264
+ return;
265
+ }
266
+
267
+ let summaryText: string;
268
+ try {
269
+ const json = JSON.parse(stdout);
270
+ if (json.messages && Array.isArray(json.messages)) {
271
+ const assistantMsg = json.messages.findLast((m: any) => m.role === "assistant");
272
+ if (assistantMsg?.content) {
273
+ summaryText = assistantMsg.content
274
+ .filter((c: any) => c.type === "text")
275
+ .map((c: any) => c.text)
276
+ .join("\n");
277
+ } else {
278
+ summaryText = stdout;
279
+ }
280
+ } else {
281
+ summaryText = stdout;
282
+ }
283
+ } catch {
284
+ summaryText = stdout;
285
+ }
286
+
287
+ done(summaryText);
288
+ });
289
+
290
+ if (loader.signal.aborted) {
291
+ child.kill();
292
+ }
293
+
294
+ loader.signal.addEventListener("abort", () => {
295
+ child.kill();
296
+ });
297
+
298
+ return loader;
299
+ });
300
+
301
+ if (result === null) {
302
+ ctx.ui.notify("Cancelled", "info");
303
+ return;
304
+ }
305
+
306
+ // Show the analysis
307
+ await ctx.ui.editor("Image Analysis", result);
308
+ },
309
+ });
310
+ }