oh-my-opencode 1.0.2 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.ko.md +193 -187
- package/README.md +249 -245
- package/dist/agents/multimodal-looker.d.ts +2 -0
- package/dist/agents/types.d.ts +1 -1
- package/dist/index.js +145 -2
- package/dist/tools/index.d.ts +1 -0
- package/dist/tools/look-at/constants.d.ts +2 -0
- package/dist/tools/look-at/index.d.ts +3 -0
- package/dist/tools/look-at/tools.d.ts +12 -0
- package/dist/tools/look-at/types.d.ts +4 -0
- package/package.json +1 -1
package/dist/agents/types.d.ts
CHANGED

```diff
@@ -1,4 +1,4 @@
 import type { AgentConfig } from "@opencode-ai/sdk";
-export type AgentName = "oracle" | "librarian" | "explore" | "frontend-ui-ux-engineer" | "document-writer";
+export type AgentName = "oracle" | "librarian" | "explore" | "frontend-ui-ux-engineer" | "document-writer" | "multimodal-looker";
 export type AgentOverrideConfig = Partial<AgentConfig>;
 export type AgentOverrides = Partial<Record<AgentName, AgentOverrideConfig>>;
```
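Because `AgentName` now includes `"multimodal-looker"`, the new agent can be tuned through `AgentOverrides` like any other built-in. A minimal sketch, assuming the types above are importable from the package's dist output (the import path and the override value are illustrative, not prescribed by this diff):

```ts
import type { AgentOverrides } from "oh-my-opencode/dist/agents/types";

// AgentOverrideConfig is Partial<AgentConfig>, so only the fields named here
// change; everything else keeps the built-in agent's defaults.
const overrides: AgentOverrides = {
  "multimodal-looker": {
    temperature: 0, // hypothetical: fully deterministic extraction
  },
};
```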
package/dist/index.js
CHANGED

```diff
@@ -2358,6 +2358,47 @@ STOP HERE - DO NOT CONTINUE TO NEXT TASK
 You are a technical writer who creates documentation that developers actually want to read.
 </guide>`
 };
+
+// src/agents/multimodal-looker.ts
+var multimodalLookerAgent = {
+  description: "Analyze media files (PDFs, images, diagrams) that require interpretation beyond raw text. Extracts specific information or summaries from documents, describes visual content. Use when you need analyzed/extracted data rather than literal file contents.",
+  mode: "subagent",
+  model: "google/gemini-2.5-flash",
+  temperature: 0.1,
+  tools: { Read: true },
+  prompt: `You interpret media files that cannot be read as plain text.
+
+Your job: examine the attached file and extract ONLY what was requested.
+
+When to use you:
+- Media files the Read tool cannot interpret
+- Extracting specific information or summaries from documents
+- Describing visual content in images or diagrams
+- When analyzed/extracted data is needed, not raw file contents
+
+When NOT to use you:
+- Source code or plain text files needing exact contents (use Read)
+- Files that need editing afterward (need literal content from Read)
+- Simple file reading where no interpretation is needed
+
+How you work:
+1. Receive a file path and a goal describing what to extract
+2. Read and analyze the file deeply
+3. Return ONLY the relevant extracted information
+4. The main agent never processes the raw file - you save context tokens
+
+For PDFs: extract text, structure, tables, data from specific sections
+For images: describe layouts, UI elements, text, diagrams, charts
+For diagrams: explain relationships, flows, architecture depicted
+
+Response rules:
+- Return extracted information directly, no preamble
+- If info not found, state clearly what's missing
+- Match the language of the request
+- Be thorough on the goal, concise on everything else
+
+Your output goes straight to the main agent for continued work.`
+};
 // src/shared/frontmatter.ts
 function parseFrontmatter(content) {
   const frontmatterRegex = /^---\r?\n([\s\S]*?)\r?\n---\r?\n([\s\S]*)$/;
```
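The new agent is plain data in the same shape as the other built-ins: a routing description, a pinned model, a low temperature suited to extraction, and a tool allowlist limited to Read. A hedged sketch of another entry following the same pattern (the name, model choice, and prompt are invented for illustration, not part of this release):

```ts
// Hypothetical sibling agent mirroring the multimodalLookerAgent shape above.
const audioTranscriberAgent = {
  description: "Transcribe and summarize audio files the Read tool cannot interpret.",
  mode: "subagent",                 // runs in a child session, not the main loop
  model: "google/gemini-2.5-flash", // same default model as multimodal-looker
  temperature: 0.1,                 // low temperature favors faithful extraction
  tools: { Read: true },            // Read only; no delegation tools
  prompt: "You transcribe audio files. Return only what was requested.",
};
```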
```diff
@@ -2717,7 +2758,8 @@ var allBuiltinAgents = {
   librarian: librarianAgent,
   explore: exploreAgent,
   "frontend-ui-ux-engineer": frontendUiUxEngineerAgent,
-  "document-writer": documentWriterAgent
+  "document-writer": documentWriterAgent,
+  "multimodal-looker": multimodalLookerAgent
 };
 function mergeAgentConfig(base, override) {
   return deepMerge(base, override);
```
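`mergeAgentConfig` delegates to `deepMerge`, so a user override merges into the built-in entry recursively rather than replacing it wholesale. A sketch of the expected behavior under that assumption (the bundle's own `deepMerge` is defined elsewhere; this re-implementation is purely illustrative):

```ts
// Illustrative recursive merge in the style mergeAgentConfig appears to rely on.
function deepMergeSketch<T extends Record<string, unknown>>(
  base: T,
  override: Partial<T>
): T {
  const out: Record<string, unknown> = { ...base };
  for (const [key, value] of Object.entries(override)) {
    const prev = out[key];
    if (
      prev && value &&
      typeof prev === "object" && typeof value === "object" &&
      !Array.isArray(prev) && !Array.isArray(value)
    ) {
      // Nested objects (e.g. tools) merge key-by-key.
      out[key] = deepMergeSketch(
        prev as Record<string, unknown>,
        value as Record<string, unknown>
      );
    } else if (value !== undefined) {
      out[key] = value;
    }
  }
  return out as T;
}

// e.g. merging { tools: { task: false } } into multimodalLookerAgent would
// keep tools.Read: true while adding tools.task: false.
```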
```diff
@@ -24168,6 +24210,97 @@ session_id: ${sessionID}
 `);
   return output;
 }
+// src/tools/look-at/constants.ts
+var MULTIMODAL_LOOKER_AGENT = "multimodal-looker";
+var LOOK_AT_DESCRIPTION = `Analyze media files (PDFs, images, diagrams) that require visual interpretation.
+
+Use this tool to extract specific information from files that cannot be processed as plain text:
+- PDF documents: extract text, tables, structure, specific sections
+- Images: describe layouts, UI elements, text content, diagrams
+- Charts/Graphs: explain data, trends, relationships
+- Screenshots: identify UI components, text, visual elements
+- Architecture diagrams: explain flows, connections, components
+
+Parameters:
+- file_path: Absolute path to the file to analyze
+- goal: What specific information to extract (be specific for better results)
+
+Examples:
+- "Extract all API endpoints from this OpenAPI spec PDF"
+- "Describe the UI layout and components in this screenshot"
+- "Explain the data flow in this architecture diagram"
+- "List all table data from page 3 of this PDF"
+
+This tool uses a separate context window with Gemini 2.5 Flash for multimodal analysis,
+saving tokens in the main conversation while providing accurate visual interpretation.`;
+// src/tools/look-at/tools.ts
+function createLookAt(ctx) {
+  return tool({
+    description: LOOK_AT_DESCRIPTION,
+    args: {
+      file_path: tool.schema.string().describe("Absolute path to the file to analyze"),
+      goal: tool.schema.string().describe("What specific information to extract from the file")
+    },
+    async execute(args, toolContext) {
+      log(`[look_at] Analyzing file: ${args.file_path}, goal: ${args.goal}`);
+      const prompt = `Analyze this file and extract the requested information.
+
+File path: ${args.file_path}
+Goal: ${args.goal}
+
+Read the file using the Read tool, then provide ONLY the extracted information that matches the goal.
+Be thorough on what was requested, concise on everything else.
+If the requested information is not found, clearly state what is missing.`;
+      log(`[look_at] Creating session with parent: ${toolContext.sessionID}`);
+      const createResult = await ctx.client.session.create({
+        body: {
+          parentID: toolContext.sessionID,
+          title: `look_at: ${args.goal.substring(0, 50)}`
+        }
+      });
+      if (createResult.error) {
+        log(`[look_at] Session create error:`, createResult.error);
+        return `Error: Failed to create session: ${createResult.error}`;
+      }
+      const sessionID = createResult.data.id;
+      log(`[look_at] Created session: ${sessionID}`);
+      log(`[look_at] Sending prompt to session ${sessionID}`);
+      await ctx.client.session.prompt({
+        path: { id: sessionID },
+        body: {
+          agent: MULTIMODAL_LOOKER_AGENT,
+          tools: {
+            task: false,
+            call_omo_agent: false,
+            look_at: false
+          },
+          parts: [{ type: "text", text: prompt }]
+        }
+      });
+      log(`[look_at] Prompt sent, fetching messages...`);
+      const messagesResult = await ctx.client.session.messages({
+        path: { id: sessionID }
+      });
+      if (messagesResult.error) {
+        log(`[look_at] Messages error:`, messagesResult.error);
+        return `Error: Failed to get messages: ${messagesResult.error}`;
+      }
+      const messages = messagesResult.data;
+      log(`[look_at] Got ${messages.length} messages`);
+      const lastAssistantMessage = messages.filter((m) => m.info.role === "assistant").sort((a, b) => (b.info.time?.created || 0) - (a.info.time?.created || 0))[0];
+      if (!lastAssistantMessage) {
+        log(`[look_at] No assistant message found`);
+        return `Error: No response from multimodal-looker agent`;
+      }
+      log(`[look_at] Found assistant message with ${lastAssistantMessage.parts.length} parts`);
+      const textParts = lastAssistantMessage.parts.filter((p) => p.type === "text");
+      const responseText = textParts.map((p) => p.text).join(`
+`);
+      log(`[look_at] Got response, length: ${responseText.length}`);
+      return responseText;
+    }
+  });
+}
 // src/tools/index.ts
 function createBackgroundTools(manager, client2) {
   return {
```
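Seen from the caller's side, the factory returns a tool whose `execute` takes the two string args and resolves to the sub-session's final assistant text. A minimal invocation sketch, assuming an already-initialized plugin input `ctx`; the file path, goal, and the `ToolContext` cast are illustrative stand-ins:

```ts
import type { PluginInput, ToolContext } from "@opencode-ai/plugin";

declare const ctx: PluginInput; // provided by opencode at plugin init

const lookAt = createLookAt(ctx);

// A specific goal matters: the sub-agent returns only what matches it.
const flows = await lookAt.execute(
  {
    file_path: "/absolute/path/to/architecture.png", // hypothetical file
    goal: "Explain the data flow between the services shown",
  },
  { sessionID: "parent-session-id" } as ToolContext // stands in for the real context
);
console.log(flows);
```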
```diff
@@ -24685,13 +24818,15 @@ var OhMyOpenCodePlugin = async (ctx) => {
   const backgroundNotificationHook = isHookEnabled("background-notification") ? createBackgroundNotificationHook(backgroundManager) : null;
   const backgroundTools = createBackgroundTools(backgroundManager, ctx.client);
   const callOmoAgent = createCallOmoAgent(ctx, backgroundManager);
+  const lookAt = createLookAt(ctx);
   const googleAuthHooks = pluginConfig.google_auth ? await createGoogleAntigravityAuthPlugin(ctx) : null;
   return {
     ...googleAuthHooks ? { auth: googleAuthHooks.auth } : {},
     tool: {
       ...builtinTools,
       ...backgroundTools,
-      call_omo_agent: callOmoAgent
+      call_omo_agent: callOmoAgent,
+      look_at: lookAt
     },
     "chat.message": async (input, output) => {
       await claudeCodeHooks["chat.message"]?.(input, output);
```
```diff
@@ -24722,6 +24857,14 @@ var OhMyOpenCodePlugin = async (ctx) => {
         call_omo_agent: false
       };
     }
+    if (config3.agent["multimodal-looker"]) {
+      config3.agent["multimodal-looker"].tools = {
+        ...config3.agent["multimodal-looker"].tools,
+        task: false,
+        call_omo_agent: false,
+        look_at: false
+      };
+    }
     const mcpResult = pluginConfig.claude_code?.mcp ?? true ? await loadMcpConfigs() : { servers: {} };
     config3.mcp = {
       ...config3.mcp,
```
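This hook appears to mirror the guard applied to `call_omo_agent` just above it: whatever tools the multimodal-looker config already grants are kept, and the three delegation tools are forced off so the sub-agent cannot spawn further sub-agents. Roughly, for a config that started with only Read enabled:

```ts
// Hypothetical before/after of config3.agent["multimodal-looker"].tools
const before = { Read: true };
const after = { ...before, task: false, call_omo_agent: false, look_at: false };
// => { Read: true, task: false, call_omo_agent: false, look_at: false }
```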
package/dist/tools/index.d.ts
CHANGED

```diff
@@ -2,6 +2,7 @@ import type { PluginInput } from "@opencode-ai/plugin";
 import type { BackgroundManager } from "../features/background-agent";
 type OpencodeClient = PluginInput["client"];
 export { createCallOmoAgent } from "./call-omo-agent";
+export { createLookAt } from "./look-at";
 export declare function createBackgroundTools(manager: BackgroundManager, client: OpencodeClient): {
     background_task: {
         description: string;
```
package/dist/tools/look-at/constants.d.ts
ADDED

```diff
@@ -0,0 +1,2 @@
+export declare const MULTIMODAL_LOOKER_AGENT: "multimodal-looker";
+export declare const LOOK_AT_DESCRIPTION = "Analyze media files (PDFs, images, diagrams) that require visual interpretation.\n\nUse this tool to extract specific information from files that cannot be processed as plain text:\n- PDF documents: extract text, tables, structure, specific sections\n- Images: describe layouts, UI elements, text content, diagrams\n- Charts/Graphs: explain data, trends, relationships\n- Screenshots: identify UI components, text, visual elements\n- Architecture diagrams: explain flows, connections, components\n\nParameters:\n- file_path: Absolute path to the file to analyze\n- goal: What specific information to extract (be specific for better results)\n\nExamples:\n- \"Extract all API endpoints from this OpenAPI spec PDF\"\n- \"Describe the UI layout and components in this screenshot\"\n- \"Explain the data flow in this architecture diagram\"\n- \"List all table data from page 3 of this PDF\"\n\nThis tool uses a separate context window with Gemini 2.5 Flash for multimodal analysis,\nsaving tokens in the main conversation while providing accurate visual interpretation.";
```
package/dist/tools/look-at/tools.d.ts
ADDED

```diff
@@ -0,0 +1,12 @@
+import { type PluginInput } from "@opencode-ai/plugin";
+export declare function createLookAt(ctx: PluginInput): {
+    description: string;
+    args: {
+        file_path: import("zod").ZodString;
+        goal: import("zod").ZodString;
+    };
+    execute(args: {
+        file_path: string;
+        goal: string;
+    }, context: import("@opencode-ai/plugin").ToolContext): Promise<string>;
+};
```