oh-my-opencode 1.0.2 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2 @@
+ import type { AgentConfig } from "@opencode-ai/sdk";
+ export declare const multimodalLookerAgent: AgentConfig;
@@ -1,4 +1,4 @@
  import type { AgentConfig } from "@opencode-ai/sdk";
- export type AgentName = "oracle" | "librarian" | "explore" | "frontend-ui-ux-engineer" | "document-writer";
+ export type AgentName = "oracle" | "librarian" | "explore" | "frontend-ui-ux-engineer" | "document-writer" | "multimodal-looker";
  export type AgentOverrideConfig = Partial<AgentConfig>;
  export type AgentOverrides = Partial<Record<AgentName, AgentOverrideConfig>>;
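
With "multimodal-looker" added to AgentName, the new agent can be targeted through AgentOverrides like any other builtin agent. A minimal sketch of the override shape, assuming these types are re-exported from the package's public entry point and that the plugin accepts such an override object (both assumptions, not shown in this diff):

import type { AgentOverrides } from "oh-my-opencode"; // assumed export path

// Partial override for the new agent; unlisted agents keep their builtin defaults.
const overrides: AgentOverrides = {
  "multimodal-looker": {
    model: "google/gemini-2.5-flash", // same model the diff assigns below
    temperature: 0.2,                 // hypothetical tweak for illustration
  },
};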
package/dist/index.js CHANGED
@@ -1534,7 +1534,7 @@ IMPORTANT: Only your last message is returned to the main agent and displayed to
  var librarianAgent = {
  description: "Specialized codebase understanding agent for multi-repository analysis, searching remote codebases, retrieving official documentation, and finding implementation examples using GitHub CLI, Context7, and Web Search. MUST BE USED when users ask to look up code in remote repositories, explain library internals, or find usage examples in open source.",
  mode: "subagent",
- model: "anthropic/claude-haiku-4-5",
+ model: "anthropic/claude-sonnet-4",
  temperature: 0.1,
  tools: { write: false, edit: false },
  prompt: `# THE LIBRARIAN
@@ -2358,6 +2358,47 @@ STOP HERE - DO NOT CONTINUE TO NEXT TASK
  You are a technical writer who creates documentation that developers actually want to read.
  </guide>`
  };
+
+ // src/agents/multimodal-looker.ts
+ var multimodalLookerAgent = {
+ description: "Analyze media files (PDFs, images, diagrams) that require interpretation beyond raw text. Extracts specific information or summaries from documents, describes visual content. Use when you need analyzed/extracted data rather than literal file contents.",
+ mode: "subagent",
+ model: "google/gemini-2.5-flash",
+ temperature: 0.1,
+ tools: { Read: true },
+ prompt: `You interpret media files that cannot be read as plain text.
+
+ Your job: examine the attached file and extract ONLY what was requested.
+
+ When to use you:
+ - Media files the Read tool cannot interpret
+ - Extracting specific information or summaries from documents
+ - Describing visual content in images or diagrams
+ - When analyzed/extracted data is needed, not raw file contents
+
+ When NOT to use you:
+ - Source code or plain text files needing exact contents (use Read)
+ - Files that need editing afterward (need literal content from Read)
+ - Simple file reading where no interpretation is needed
+
+ How you work:
+ 1. Receive a file path and a goal describing what to extract
+ 2. Read and analyze the file deeply
+ 3. Return ONLY the relevant extracted information
+ 4. The main agent never processes the raw file - you save context tokens
+
+ For PDFs: extract text, structure, tables, data from specific sections
+ For images: describe layouts, UI elements, text, diagrams, charts
+ For diagrams: explain relationships, flows, architecture depicted
+
+ Response rules:
+ - Return extracted information directly, no preamble
+ - If info not found, state clearly what's missing
+ - Match the language of the request
+ - Be thorough on the goal, concise on everything else
+
+ Your output goes straight to the main agent for continued work.`
+ };
  // src/shared/frontmatter.ts
  function parseFrontmatter(content) {
  const frontmatterRegex = /^---\r?\n([\s\S]*?)\r?\n---\r?\n([\s\S]*)$/;
@@ -2717,7 +2758,8 @@ var allBuiltinAgents = {
  librarian: librarianAgent,
  explore: exploreAgent,
  "frontend-ui-ux-engineer": frontendUiUxEngineerAgent,
- "document-writer": documentWriterAgent
+ "document-writer": documentWriterAgent,
+ "multimodal-looker": multimodalLookerAgent
  };
  function mergeAgentConfig(base, override) {
  return deepMerge(base, override);
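
mergeAgentConfig deep-merges a user override onto a builtin agent definition. A minimal sketch of the expected merge semantics, assuming deepMerge overlays override keys recursively (the deepMerge implementation itself is not part of this diff, and the values below are illustrative only):

const base = { model: "google/gemini-2.5-flash", temperature: 0.1, tools: { Read: true } };
const override = { temperature: 0.3, tools: { webfetch: false } };
const merged = mergeAgentConfig(base, override);
// merged -> { model: "google/gemini-2.5-flash", temperature: 0.3, tools: { Read: true, webfetch: false } }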
@@ -24168,6 +24210,97 @@ session_id: ${sessionID}
  `);
  return output;
  }
+ // src/tools/look-at/constants.ts
+ var MULTIMODAL_LOOKER_AGENT = "multimodal-looker";
+ var LOOK_AT_DESCRIPTION = `Analyze media files (PDFs, images, diagrams) that require visual interpretation.
+
+ Use this tool to extract specific information from files that cannot be processed as plain text:
+ - PDF documents: extract text, tables, structure, specific sections
+ - Images: describe layouts, UI elements, text content, diagrams
+ - Charts/Graphs: explain data, trends, relationships
+ - Screenshots: identify UI components, text, visual elements
+ - Architecture diagrams: explain flows, connections, components
+
+ Parameters:
+ - file_path: Absolute path to the file to analyze
+ - goal: What specific information to extract (be specific for better results)
+
+ Examples:
+ - "Extract all API endpoints from this OpenAPI spec PDF"
+ - "Describe the UI layout and components in this screenshot"
+ - "Explain the data flow in this architecture diagram"
+ - "List all table data from page 3 of this PDF"
+
+ This tool uses a separate context window with Gemini 2.5 Flash for multimodal analysis,
+ saving tokens in the main conversation while providing accurate visual interpretation.`;
+ // src/tools/look-at/tools.ts
+ function createLookAt(ctx) {
+ return tool({
+ description: LOOK_AT_DESCRIPTION,
+ args: {
+ file_path: tool.schema.string().describe("Absolute path to the file to analyze"),
+ goal: tool.schema.string().describe("What specific information to extract from the file")
+ },
+ async execute(args, toolContext) {
+ log(`[look_at] Analyzing file: ${args.file_path}, goal: ${args.goal}`);
+ const prompt = `Analyze this file and extract the requested information.
+
+ File path: ${args.file_path}
+ Goal: ${args.goal}
+
+ Read the file using the Read tool, then provide ONLY the extracted information that matches the goal.
+ Be thorough on what was requested, concise on everything else.
+ If the requested information is not found, clearly state what is missing.`;
+ log(`[look_at] Creating session with parent: ${toolContext.sessionID}`);
+ const createResult = await ctx.client.session.create({
+ body: {
+ parentID: toolContext.sessionID,
+ title: `look_at: ${args.goal.substring(0, 50)}`
+ }
+ });
+ if (createResult.error) {
+ log(`[look_at] Session create error:`, createResult.error);
+ return `Error: Failed to create session: ${createResult.error}`;
+ }
+ const sessionID = createResult.data.id;
+ log(`[look_at] Created session: ${sessionID}`);
+ log(`[look_at] Sending prompt to session ${sessionID}`);
+ await ctx.client.session.prompt({
+ path: { id: sessionID },
+ body: {
+ agent: MULTIMODAL_LOOKER_AGENT,
+ tools: {
+ task: false,
+ call_omo_agent: false,
+ look_at: false
+ },
+ parts: [{ type: "text", text: prompt }]
+ }
+ });
+ log(`[look_at] Prompt sent, fetching messages...`);
+ const messagesResult = await ctx.client.session.messages({
+ path: { id: sessionID }
+ });
+ if (messagesResult.error) {
+ log(`[look_at] Messages error:`, messagesResult.error);
+ return `Error: Failed to get messages: ${messagesResult.error}`;
+ }
+ const messages = messagesResult.data;
+ log(`[look_at] Got ${messages.length} messages`);
+ const lastAssistantMessage = messages.filter((m) => m.info.role === "assistant").sort((a, b) => (b.info.time?.created || 0) - (a.info.time?.created || 0))[0];
+ if (!lastAssistantMessage) {
+ log(`[look_at] No assistant message found`);
+ return `Error: No response from multimodal-looker agent`;
+ }
+ log(`[look_at] Found assistant message with ${lastAssistantMessage.parts.length} parts`);
+ const textParts = lastAssistantMessage.parts.filter((p) => p.type === "text");
+ const responseText = textParts.map((p) => p.text).join(`
+ `);
+ log(`[look_at] Got response, length: ${responseText.length}`);
+ return responseText;
+ }
+ });
+ }
  // src/tools/index.ts
  function createBackgroundTools(manager, client2) {
  return {
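
The look_at tool wraps the multimodal-looker agent in a child session: it creates a session under the caller's sessionID, sends a prompt built from file_path and goal, and returns the text of the last assistant message. A minimal sketch of exercising it directly, assuming a PluginInput-shaped ctx and a ToolContext carrying the parent session id (the path and goal below are illustrative, not taken from this diff):

const lookAt = createLookAt(ctx); // ctx: PluginInput with an opencode client
const extracted = await lookAt.execute(
  {
    file_path: "/absolute/path/to/report.pdf",             // hypothetical file
    goal: "Extract the totals table from the summary page", // hypothetical goal
  },
  toolContext // ToolContext; toolContext.sessionID becomes the parent of the spawned session
);
// extracted: text returned by the multimodal-looker agent, or an "Error: ..." string on failure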
@@ -24685,13 +24818,15 @@ var OhMyOpenCodePlugin = async (ctx) => {
  const backgroundNotificationHook = isHookEnabled("background-notification") ? createBackgroundNotificationHook(backgroundManager) : null;
  const backgroundTools = createBackgroundTools(backgroundManager, ctx.client);
  const callOmoAgent = createCallOmoAgent(ctx, backgroundManager);
+ const lookAt = createLookAt(ctx);
  const googleAuthHooks = pluginConfig.google_auth ? await createGoogleAntigravityAuthPlugin(ctx) : null;
  return {
  ...googleAuthHooks ? { auth: googleAuthHooks.auth } : {},
  tool: {
  ...builtinTools,
  ...backgroundTools,
- call_omo_agent: callOmoAgent
+ call_omo_agent: callOmoAgent,
+ look_at: lookAt
  },
  "chat.message": async (input, output) => {
  await claudeCodeHooks["chat.message"]?.(input, output);
@@ -24722,6 +24857,14 @@ var OhMyOpenCodePlugin = async (ctx) => {
  call_omo_agent: false
  };
  }
+ if (config3.agent["multimodal-looker"]) {
+ config3.agent["multimodal-looker"].tools = {
+ ...config3.agent["multimodal-looker"].tools,
+ task: false,
+ call_omo_agent: false,
+ look_at: false
+ };
+ }
  const mcpResult = pluginConfig.claude_code?.mcp ?? true ? await loadMcpConfigs() : { servers: {} };
  config3.mcp = {
  ...config3.mcp,
@@ -2,6 +2,7 @@ import type { PluginInput } from "@opencode-ai/plugin";
  import type { BackgroundManager } from "../features/background-agent";
  type OpencodeClient = PluginInput["client"];
  export { createCallOmoAgent } from "./call-omo-agent";
+ export { createLookAt } from "./look-at";
  export declare function createBackgroundTools(manager: BackgroundManager, client: OpencodeClient): {
  background_task: {
  description: string;
@@ -0,0 +1,2 @@
+ export declare const MULTIMODAL_LOOKER_AGENT: "multimodal-looker";
+ export declare const LOOK_AT_DESCRIPTION = "Analyze media files (PDFs, images, diagrams) that require visual interpretation.\n\nUse this tool to extract specific information from files that cannot be processed as plain text:\n- PDF documents: extract text, tables, structure, specific sections\n- Images: describe layouts, UI elements, text content, diagrams\n- Charts/Graphs: explain data, trends, relationships\n- Screenshots: identify UI components, text, visual elements\n- Architecture diagrams: explain flows, connections, components\n\nParameters:\n- file_path: Absolute path to the file to analyze\n- goal: What specific information to extract (be specific for better results)\n\nExamples:\n- \"Extract all API endpoints from this OpenAPI spec PDF\"\n- \"Describe the UI layout and components in this screenshot\"\n- \"Explain the data flow in this architecture diagram\"\n- \"List all table data from page 3 of this PDF\"\n\nThis tool uses a separate context window with Gemini 2.5 Flash for multimodal analysis,\nsaving tokens in the main conversation while providing accurate visual interpretation.";
@@ -0,0 +1,3 @@
+ export * from "./types";
+ export * from "./constants";
+ export { createLookAt } from "./tools";
@@ -0,0 +1,12 @@
+ import { type PluginInput } from "@opencode-ai/plugin";
+ export declare function createLookAt(ctx: PluginInput): {
+ description: string;
+ args: {
+ file_path: import("zod").ZodString;
+ goal: import("zod").ZodString;
+ };
+ execute(args: {
+ file_path: string;
+ goal: string;
+ }, context: import("@opencode-ai/plugin").ToolContext): Promise<string>;
+ };
@@ -0,0 +1,4 @@
+ export interface LookAtArgs {
+ file_path: string;
+ goal: string;
+ }
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "oh-my-opencode",
- "version": "1.0.2",
+ "version": "1.1.1",
  "description": "OpenCode plugin - custom agents (oracle, librarian) and enhanced features",
  "main": "dist/index.js",
  "types": "dist/index.d.ts",