oh-my-opencode 1.0.2 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2 @@
+ import type { AgentConfig } from "@opencode-ai/sdk";
+ export declare const multimodalLookerAgent: AgentConfig;
@@ -1,4 +1,4 @@
  import type { AgentConfig } from "@opencode-ai/sdk";
- export type AgentName = "oracle" | "librarian" | "explore" | "frontend-ui-ux-engineer" | "document-writer";
+ export type AgentName = "oracle" | "librarian" | "explore" | "frontend-ui-ux-engineer" | "document-writer" | "multimodal-looker";
  export type AgentOverrideConfig = Partial<AgentConfig>;
  export type AgentOverrides = Partial<Record<AgentName, AgentOverrideConfig>>;
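
With "multimodal-looker" added to AgentName, the new agent can be targeted through AgentOverrides like any other builtin agent. A minimal sketch of the override shape, assuming these types are re-exported from the package's public entry point and that the plugin accepts such an override object (both assumptions, not shown in this diff):

import type { AgentOverrides } from "oh-my-opencode"; // assumed export path

// Partial override for the new agent; unlisted agents keep their builtin defaults.
const overrides: AgentOverrides = {
  "multimodal-looker": {
    model: "google/gemini-2.5-flash", // same model the diff assigns below
    temperature: 0.2,                 // hypothetical tweak for illustration
  },
};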
package/dist/index.js CHANGED
@@ -1534,7 +1534,7 @@ IMPORTANT: Only your last message is returned to the main agent and displayed to
  var librarianAgent = {
  description: "Specialized codebase understanding agent for multi-repository analysis, searching remote codebases, retrieving official documentation, and finding implementation examples using GitHub CLI, Context7, and Web Search. MUST BE USED when users ask to look up code in remote repositories, explain library internals, or find usage examples in open source.",
  mode: "subagent",
- model: "anthropic/claude-haiku-4-5",
+ model: "anthropic/claude-sonnet-4",
  temperature: 0.1,
  tools: { write: false, edit: false },
  prompt: `# THE LIBRARIAN
@@ -2358,6 +2358,47 @@ STOP HERE - DO NOT CONTINUE TO NEXT TASK
  You are a technical writer who creates documentation that developers actually want to read.
  </guide>`
  };
+
+ // src/agents/multimodal-looker.ts
+ var multimodalLookerAgent = {
+ description: "Analyze media files (PDFs, images, diagrams) that require interpretation beyond raw text. Extracts specific information or summaries from documents, describes visual content. Use when you need analyzed/extracted data rather than literal file contents.",
+ mode: "subagent",
+ model: "google/gemini-2.5-flash",
+ temperature: 0.1,
+ tools: { Read: true },
+ prompt: `You interpret media files that cannot be read as plain text.
+
+ Your job: examine the attached file and extract ONLY what was requested.
+
+ When to use you:
+ - Media files the Read tool cannot interpret
+ - Extracting specific information or summaries from documents
+ - Describing visual content in images or diagrams
+ - When analyzed/extracted data is needed, not raw file contents
+
+ When NOT to use you:
+ - Source code or plain text files needing exact contents (use Read)
+ - Files that need editing afterward (need literal content from Read)
+ - Simple file reading where no interpretation is needed
+
+ How you work:
+ 1. Receive a file path and a goal describing what to extract
+ 2. Read and analyze the file deeply
+ 3. Return ONLY the relevant extracted information
+ 4. The main agent never processes the raw file - you save context tokens
+
+ For PDFs: extract text, structure, tables, data from specific sections
+ For images: describe layouts, UI elements, text, diagrams, charts
+ For diagrams: explain relationships, flows, architecture depicted
+
+ Response rules:
+ - Return extracted information directly, no preamble
+ - If info not found, state clearly what's missing
+ - Match the language of the request
+ - Be thorough on the goal, concise on everything else
+
+ Your output goes straight to the main agent for continued work.`
+ };
  // src/shared/frontmatter.ts
  function parseFrontmatter(content) {
  const frontmatterRegex = /^---\r?\n([\s\S]*?)\r?\n---\r?\n([\s\S]*)$/;
@@ -2717,7 +2758,8 @@ var allBuiltinAgents = {
  librarian: librarianAgent,
  explore: exploreAgent,
  "frontend-ui-ux-engineer": frontendUiUxEngineerAgent,
- "document-writer": documentWriterAgent
+ "document-writer": documentWriterAgent,
+ "multimodal-looker": multimodalLookerAgent
  };
  function mergeAgentConfig(base, override) {
  return deepMerge(base, override);
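
mergeAgentConfig deep-merges a user override onto a builtin agent definition. A minimal sketch of the expected merge semantics, assuming deepMerge overlays override keys recursively (the deepMerge implementation itself is not part of this diff, and the values below are illustrative only):

const base = { model: "google/gemini-2.5-flash", temperature: 0.1, tools: { Read: true } };
const override = { temperature: 0.3, tools: { webfetch: false } };
const merged = mergeAgentConfig(base, override);
// merged -> { model: "google/gemini-2.5-flash", temperature: 0.3, tools: { Read: true, webfetch: false } }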
@@ -24168,6 +24210,97 @@ session_id: ${sessionID}
  `);
  return output;
  }
+ // src/tools/look-at/constants.ts
+ var MULTIMODAL_LOOKER_AGENT = "multimodal-looker";
+ var LOOK_AT_DESCRIPTION = `Analyze media files (PDFs, images, diagrams) that require visual interpretation.
+
+ Use this tool to extract specific information from files that cannot be processed as plain text:
+ - PDF documents: extract text, tables, structure, specific sections
+ - Images: describe layouts, UI elements, text content, diagrams
+ - Charts/Graphs: explain data, trends, relationships
+ - Screenshots: identify UI components, text, visual elements
+ - Architecture diagrams: explain flows, connections, components
+
+ Parameters:
+ - file_path: Absolute path to the file to analyze
+ - goal: What specific information to extract (be specific for better results)
+
+ Examples:
+ - "Extract all API endpoints from this OpenAPI spec PDF"
+ - "Describe the UI layout and components in this screenshot"
+ - "Explain the data flow in this architecture diagram"
+ - "List all table data from page 3 of this PDF"
+
+ This tool uses a separate context window with Gemini 2.5 Flash for multimodal analysis,
+ saving tokens in the main conversation while providing accurate visual interpretation.`;
+ // src/tools/look-at/tools.ts
+ function createLookAt(ctx) {
+ return tool({
+ description: LOOK_AT_DESCRIPTION,
+ args: {
+ file_path: tool.schema.string().describe("Absolute path to the file to analyze"),
+ goal: tool.schema.string().describe("What specific information to extract from the file")
+ },
+ async execute(args, toolContext) {
+ log(`[look_at] Analyzing file: ${args.file_path}, goal: ${args.goal}`);
+ const prompt = `Analyze this file and extract the requested information.
+
+ File path: ${args.file_path}
+ Goal: ${args.goal}
+
+ Read the file using the Read tool, then provide ONLY the extracted information that matches the goal.
+ Be thorough on what was requested, concise on everything else.
+ If the requested information is not found, clearly state what is missing.`;
+ log(`[look_at] Creating session with parent: ${toolContext.sessionID}`);
+ const createResult = await ctx.client.session.create({
+ body: {
+ parentID: toolContext.sessionID,
+ title: `look_at: ${args.goal.substring(0, 50)}`
+ }
+ });
+ if (createResult.error) {
+ log(`[look_at] Session create error:`, createResult.error);
+ return `Error: Failed to create session: ${createResult.error}`;
+ }
+ const sessionID = createResult.data.id;
+ log(`[look_at] Created session: ${sessionID}`);
+ log(`[look_at] Sending prompt to session ${sessionID}`);
+ await ctx.client.session.prompt({
+ path: { id: sessionID },
+ body: {
+ agent: MULTIMODAL_LOOKER_AGENT,
+ tools: {
+ task: false,
+ call_omo_agent: false,
+ look_at: false
+ },
+ parts: [{ type: "text", text: prompt }]
+ }
+ });
+ log(`[look_at] Prompt sent, fetching messages...`);
+ const messagesResult = await ctx.client.session.messages({
+ path: { id: sessionID }
+ });
+ if (messagesResult.error) {
+ log(`[look_at] Messages error:`, messagesResult.error);
+ return `Error: Failed to get messages: ${messagesResult.error}`;
+ }
+ const messages = messagesResult.data;
+ log(`[look_at] Got ${messages.length} messages`);
+ const lastAssistantMessage = messages.filter((m) => m.info.role === "assistant").sort((a, b) => (b.info.time?.created || 0) - (a.info.time?.created || 0))[0];
+ if (!lastAssistantMessage) {
+ log(`[look_at] No assistant message found`);
+ return `Error: No response from multimodal-looker agent`;
+ }
+ log(`[look_at] Found assistant message with ${lastAssistantMessage.parts.length} parts`);
+ const textParts = lastAssistantMessage.parts.filter((p) => p.type === "text");
+ const responseText = textParts.map((p) => p.text).join(`
+ `);
+ log(`[look_at] Got response, length: ${responseText.length}`);
+ return responseText;
+ }
+ });
+ }
  // src/tools/index.ts
  function createBackgroundTools(manager, client2) {
  return {
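
The look_at tool wraps the multimodal-looker agent in a child session: it creates a session under the caller's sessionID, sends a prompt built from file_path and goal, and returns the text of the last assistant message. A minimal sketch of exercising it directly, assuming a PluginInput-shaped ctx and a ToolContext carrying the parent session id (the path and goal below are illustrative, not taken from this diff):

const lookAt = createLookAt(ctx); // ctx: PluginInput with an opencode client
const extracted = await lookAt.execute(
  {
    file_path: "/absolute/path/to/report.pdf",             // hypothetical file
    goal: "Extract the totals table from the summary page", // hypothetical goal
  },
  toolContext // ToolContext; toolContext.sessionID becomes the parent of the spawned session
);
// extracted: text returned by the multimodal-looker agent, or an "Error: ..." string on failure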
@@ -24685,13 +24818,15 @@ var OhMyOpenCodePlugin = async (ctx) => {
  const backgroundNotificationHook = isHookEnabled("background-notification") ? createBackgroundNotificationHook(backgroundManager) : null;
  const backgroundTools = createBackgroundTools(backgroundManager, ctx.client);
  const callOmoAgent = createCallOmoAgent(ctx, backgroundManager);
+ const lookAt = createLookAt(ctx);
  const googleAuthHooks = pluginConfig.google_auth ? await createGoogleAntigravityAuthPlugin(ctx) : null;
  return {
  ...googleAuthHooks ? { auth: googleAuthHooks.auth } : {},
  tool: {
  ...builtinTools,
  ...backgroundTools,
- call_omo_agent: callOmoAgent
+ call_omo_agent: callOmoAgent,
+ look_at: lookAt
  },
  "chat.message": async (input, output) => {
  await claudeCodeHooks["chat.message"]?.(input, output);
@@ -24722,6 +24857,14 @@ var OhMyOpenCodePlugin = async (ctx) => {
  call_omo_agent: false
  };
  }
+ if (config3.agent["multimodal-looker"]) {
+ config3.agent["multimodal-looker"].tools = {
+ ...config3.agent["multimodal-looker"].tools,
+ task: false,
+ call_omo_agent: false,
+ look_at: false
+ };
+ }
  const mcpResult = pluginConfig.claude_code?.mcp ?? true ? await loadMcpConfigs() : { servers: {} };
  config3.mcp = {
  ...config3.mcp,
@@ -2,6 +2,7 @@ import type { PluginInput } from "@opencode-ai/plugin";
  import type { BackgroundManager } from "../features/background-agent";
  type OpencodeClient = PluginInput["client"];
  export { createCallOmoAgent } from "./call-omo-agent";
+ export { createLookAt } from "./look-at";
  export declare function createBackgroundTools(manager: BackgroundManager, client: OpencodeClient): {
  background_task: {
  description: string;
@@ -0,0 +1,2 @@
+ export declare const MULTIMODAL_LOOKER_AGENT: "multimodal-looker";
+ export declare const LOOK_AT_DESCRIPTION = "Analyze media files (PDFs, images, diagrams) that require visual interpretation.\n\nUse this tool to extract specific information from files that cannot be processed as plain text:\n- PDF documents: extract text, tables, structure, specific sections\n- Images: describe layouts, UI elements, text content, diagrams\n- Charts/Graphs: explain data, trends, relationships\n- Screenshots: identify UI components, text, visual elements\n- Architecture diagrams: explain flows, connections, components\n\nParameters:\n- file_path: Absolute path to the file to analyze\n- goal: What specific information to extract (be specific for better results)\n\nExamples:\n- \"Extract all API endpoints from this OpenAPI spec PDF\"\n- \"Describe the UI layout and components in this screenshot\"\n- \"Explain the data flow in this architecture diagram\"\n- \"List all table data from page 3 of this PDF\"\n\nThis tool uses a separate context window with Gemini 2.5 Flash for multimodal analysis,\nsaving tokens in the main conversation while providing accurate visual interpretation.";
@@ -0,0 +1,3 @@
+ export * from "./types";
+ export * from "./constants";
+ export { createLookAt } from "./tools";
@@ -0,0 +1,12 @@
+ import { type PluginInput } from "@opencode-ai/plugin";
+ export declare function createLookAt(ctx: PluginInput): {
+ description: string;
+ args: {
+ file_path: import("zod").ZodString;
+ goal: import("zod").ZodString;
+ };
+ execute(args: {
+ file_path: string;
+ goal: string;
+ }, context: import("@opencode-ai/plugin").ToolContext): Promise<string>;
+ };
@@ -0,0 +1,4 @@
+ export interface LookAtArgs {
+ file_path: string;
+ goal: string;
+ }
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "oh-my-opencode",
- "version": "1.0.2",
+ "version": "1.1.1",
  "description": "OpenCode plugin - custom agents (oracle, librarian) and enhanced features",
  "main": "dist/index.js",
  "types": "dist/index.d.ts",