llmist 2.3.0 → 2.5.0

package/dist/cli.cjs CHANGED
@@ -46,6 +46,137 @@ var init_constants = __esm({
46
46
  }
47
47
  });
48
48
 
49
+ // src/core/input-content.ts
50
+ function text(content) {
51
+ return { type: "text", text: content };
52
+ }
53
+ function imageFromUrl(url) {
54
+ return {
55
+ type: "image",
56
+ source: { type: "url", url }
57
+ };
58
+ }
59
+ function detectImageMimeType(data) {
60
+ const bytes = data instanceof Buffer ? data : Buffer.from(data);
61
+ for (const { bytes: magic, mimeType } of IMAGE_MAGIC_BYTES) {
62
+ if (bytes.length >= magic.length) {
63
+ let matches = true;
64
+ for (let i = 0; i < magic.length; i++) {
65
+ if (bytes[i] !== magic[i]) {
66
+ matches = false;
67
+ break;
68
+ }
69
+ }
70
+ if (matches) {
71
+ if (mimeType === "image/webp") {
72
+ if (bytes.length >= 12) {
73
+ const webpMarker = bytes[8] === 87 && bytes[9] === 69 && bytes[10] === 66 && bytes[11] === 80;
74
+ if (!webpMarker) continue;
75
+ }
76
+ }
77
+ return mimeType;
78
+ }
79
+ }
80
+ }
81
+ return null;
82
+ }
83
+ function detectAudioMimeType(data) {
84
+ const bytes = data instanceof Buffer ? data : Buffer.from(data);
85
+ for (const { bytes: magic, mimeType } of AUDIO_MAGIC_BYTES) {
86
+ if (bytes.length >= magic.length) {
87
+ let matches = true;
88
+ for (let i = 0; i < magic.length; i++) {
89
+ if (bytes[i] !== magic[i]) {
90
+ matches = false;
91
+ break;
92
+ }
93
+ }
94
+ if (matches) {
95
+ if (mimeType === "audio/wav") {
96
+ if (bytes.length >= 12) {
97
+ const waveMarker = bytes[8] === 87 && bytes[9] === 65 && bytes[10] === 86 && bytes[11] === 69;
98
+ if (!waveMarker) continue;
99
+ }
100
+ }
101
+ return mimeType;
102
+ }
103
+ }
104
+ }
105
+ return null;
106
+ }
107
+ function toBase64(data) {
108
+ if (typeof data === "string") {
109
+ return data;
110
+ }
111
+ return Buffer.from(data).toString("base64");
112
+ }
113
+ function imageFromBuffer(buffer, mediaType) {
114
+ const detectedType = mediaType ?? detectImageMimeType(buffer);
115
+ if (!detectedType) {
116
+ throw new Error(
117
+ "Could not detect image MIME type. Please provide the mediaType parameter explicitly."
118
+ );
119
+ }
120
+ return {
121
+ type: "image",
122
+ source: {
123
+ type: "base64",
124
+ mediaType: detectedType,
125
+ data: toBase64(buffer)
126
+ }
127
+ };
128
+ }
129
+ function audioFromBuffer(buffer, mediaType) {
130
+ const detectedType = mediaType ?? detectAudioMimeType(buffer);
131
+ if (!detectedType) {
132
+ throw new Error(
133
+ "Could not detect audio MIME type. Please provide the mediaType parameter explicitly."
134
+ );
135
+ }
136
+ return {
137
+ type: "audio",
138
+ source: {
139
+ type: "base64",
140
+ mediaType: detectedType,
141
+ data: toBase64(buffer)
142
+ }
143
+ };
144
+ }
145
+ function isDataUrl(input) {
146
+ return input.startsWith("data:");
147
+ }
148
+ function parseDataUrl(url) {
149
+ const match = url.match(/^data:([^;]+);base64,(.+)$/);
150
+ if (!match) return null;
151
+ return { mimeType: match[1], data: match[2] };
152
+ }
153
+ var IMAGE_MAGIC_BYTES, AUDIO_MAGIC_BYTES;
154
+ var init_input_content = __esm({
155
+ "src/core/input-content.ts"() {
156
+ "use strict";
157
+ IMAGE_MAGIC_BYTES = [
158
+ { bytes: [255, 216, 255], mimeType: "image/jpeg" },
159
+ { bytes: [137, 80, 78, 71], mimeType: "image/png" },
160
+ { bytes: [71, 73, 70, 56], mimeType: "image/gif" },
161
+ // WebP starts with RIFF....WEBP
162
+ { bytes: [82, 73, 70, 70], mimeType: "image/webp" }
163
+ ];
164
+ AUDIO_MAGIC_BYTES = [
165
+ // MP3 frame sync
166
+ { bytes: [255, 251], mimeType: "audio/mp3" },
167
+ { bytes: [255, 250], mimeType: "audio/mp3" },
168
+ // ID3 tag (MP3)
169
+ { bytes: [73, 68, 51], mimeType: "audio/mp3" },
170
+ // OGG
171
+ { bytes: [79, 103, 103, 83], mimeType: "audio/ogg" },
172
+ // WAV (RIFF)
173
+ { bytes: [82, 73, 70, 70], mimeType: "audio/wav" },
174
+ // WebM
175
+ { bytes: [26, 69, 223, 163], mimeType: "audio/webm" }
176
+ ];
177
+ }
178
+ });
179
+
49
180
  // src/core/model-shortcuts.ts
50
181
  function isKnownModelPattern(model) {
51
182
  const normalized = model.toLowerCase();
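A minimal sketch of how the input-content helpers added above might be combined into one multimodal content array. The helper names and behavior come from the code in this hunk; importing them from the package root is an assumption.

```typescript
import { readFile } from "node:fs/promises";
// Assumed re-export path; the helpers are defined in src/core/input-content.ts.
import { text, imageFromUrl, imageFromBuffer, audioFromBuffer } from "llmist";

const photo = await readFile("photo.jpg"); // JPEG magic bytes -> "image/jpeg"
const clip = await readFile("note.ogg");   // "OggS" magic bytes -> "audio/ogg"

const parts = [
  text("Describe the photo, then transcribe the clip."),
  imageFromBuffer(photo),                          // MIME type auto-detected from magic bytes
  imageFromUrl("https://example.com/diagram.png"), // URL-sourced image part
  audioFromBuffer(clip),                           // throws if detection fails and no mediaType is given
];
```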
@@ -375,7 +506,9 @@ var init_prompt_config = __esm({
375
506
  rules: () => [
376
507
  "Output ONLY plain text with the exact markers - never use function/tool calling",
377
508
  "You can invoke multiple gadgets in a single response",
378
- "For dependent gadgets, invoke the first one and wait for the result"
509
+ "Gadgets without dependencies execute immediately (in parallel if multiple)",
510
+ "Use :invocation_id:dep1,dep2 syntax when a gadget needs results from prior gadgets",
511
+ "If any dependency fails, dependent gadgets are automatically skipped"
379
512
  ],
380
513
  customExamples: null
381
514
  };
@@ -383,11 +516,24 @@ var init_prompt_config = __esm({
383
516
  });
384
517
 
385
518
  // src/core/messages.ts
519
+ function normalizeContent(content) {
520
+ if (typeof content === "string") {
521
+ return [{ type: "text", text: content }];
522
+ }
523
+ return content;
524
+ }
525
+ function extractText(content) {
526
+ if (typeof content === "string") {
527
+ return content;
528
+ }
529
+ return content.filter((part) => part.type === "text").map((part) => part.text).join("");
530
+ }
386
531
  var LLMMessageBuilder;
387
532
  var init_messages = __esm({
388
533
  "src/core/messages.ts"() {
389
534
  "use strict";
390
535
  init_constants();
536
+ init_input_content();
391
537
  init_prompt_config();
392
538
  LLMMessageBuilder = class {
393
539
  messages = [];
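In short, normalizeContent lifts a plain string into a one-element text-part array, and extractText flattens mixed content back down to its text parts. A small sketch of the behavior defined above:

```typescript
normalizeContent("Hello");
// -> [{ type: "text", text: "Hello" }]

extractText([
  { type: "text", text: "Caption: " },
  { type: "image", source: { type: "url", url: "https://example.com/cat.png" } },
  { type: "text", text: "a cat" },
]);
// -> "Caption: a cat"   (non-text parts are dropped, text parts are concatenated)
```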
@@ -489,6 +635,10 @@ CRITICAL: ${criticalUsage}
489
635
  parts.push(`
490
636
  1. Start marker: ${this.startPrefix}gadget_name`);
491
637
  parts.push(`
638
+ With ID: ${this.startPrefix}gadget_name:my_id`);
639
+ parts.push(`
640
+ With dependencies: ${this.startPrefix}gadget_name:my_id:dep1,dep2`);
641
+ parts.push(`
492
642
  2. ${formatDescription}`);
493
643
  parts.push(`
494
644
  3. End marker: ${this.endPrefix}`);
@@ -538,6 +688,25 @@ ${this.endPrefix}`;
538
688
  EXAMPLE (Multiple Gadgets):
539
689
 
540
690
  ${multipleExample}`);
691
+ const dependencyExample = `${this.startPrefix}fetch_data:fetch_1
692
+ ${this.argPrefix}url
693
+ https://api.example.com/users
694
+ ${this.endPrefix}
695
+ ${this.startPrefix}fetch_data:fetch_2
696
+ ${this.argPrefix}url
697
+ https://api.example.com/orders
698
+ ${this.endPrefix}
699
+ ${this.startPrefix}merge_data:merge_1:fetch_1,fetch_2
700
+ ${this.argPrefix}format
701
+ json
702
+ ${this.endPrefix}`;
703
+ parts.push(`
704
+
705
+ EXAMPLE (With Dependencies):
706
+ merge_1 waits for fetch_1 AND fetch_2 to complete.
707
+ If either fails, merge_1 is automatically skipped.
708
+
709
+ ${dependencyExample}`);
541
710
  parts.push(`
542
711
 
543
712
  BLOCK FORMAT SYNTAX:
@@ -588,6 +757,25 @@ Produces: { "items": ["first", "second"] }`);
588
757
  }
589
758
  return parts.join("");
590
759
  }
760
+ /**
761
+ * Add a user message.
762
+ * Content can be a string (text only) or an array of content parts (multimodal).
763
+ *
764
+ * @param content - Message content
765
+ * @param metadata - Optional metadata
766
+ *
767
+ * @example
768
+ * ```typescript
769
+ * // Text only
770
+ * builder.addUser("Hello!");
771
+ *
772
+ * // Multimodal
773
+ * builder.addUser([
774
+ * text("What's in this image?"),
775
+ * imageFromBuffer(imageData),
776
+ * ]);
777
+ * ```
778
+ */
591
779
  addUser(content, metadata) {
592
780
  this.messages.push({ role: "user", content, metadata });
593
781
  return this;
@@ -596,6 +784,104 @@ Produces: { "items": ["first", "second"] }`);
596
784
  this.messages.push({ role: "assistant", content, metadata });
597
785
  return this;
598
786
  }
787
+ /**
788
+ * Add a user message with an image attachment.
789
+ *
790
+ * @param textContent - Text prompt
791
+ * @param imageData - Image data (Buffer, Uint8Array, or base64 string)
792
+ * @param mimeType - Optional MIME type (auto-detected if not provided)
793
+ *
794
+ * @example
795
+ * ```typescript
796
+ * builder.addUserWithImage(
797
+ * "What's in this image?",
798
+ * await fs.readFile("photo.jpg"),
799
+ * "image/jpeg" // Optional - auto-detected
800
+ * );
801
+ * ```
802
+ */
803
+ addUserWithImage(textContent, imageData, mimeType) {
804
+ const imageBuffer = typeof imageData === "string" ? Buffer.from(imageData, "base64") : imageData;
805
+ const detectedMime = mimeType ?? detectImageMimeType(imageBuffer);
806
+ if (!detectedMime) {
807
+ throw new Error(
808
+ "Could not detect image MIME type. Please provide the mimeType parameter explicitly."
809
+ );
810
+ }
811
+ const content = [
812
+ text(textContent),
813
+ {
814
+ type: "image",
815
+ source: {
816
+ type: "base64",
817
+ mediaType: detectedMime,
818
+ data: toBase64(imageBuffer)
819
+ }
820
+ }
821
+ ];
822
+ this.messages.push({ role: "user", content });
823
+ return this;
824
+ }
825
+ /**
826
+ * Add a user message with an image URL (OpenAI only).
827
+ *
828
+ * @param textContent - Text prompt
829
+ * @param imageUrl - URL to the image
830
+ *
831
+ * @example
832
+ * ```typescript
833
+ * builder.addUserWithImageUrl(
834
+ * "What's in this image?",
835
+ * "https://example.com/image.jpg"
836
+ * );
837
+ * ```
838
+ */
839
+ addUserWithImageUrl(textContent, imageUrl) {
840
+ const content = [text(textContent), imageFromUrl(imageUrl)];
841
+ this.messages.push({ role: "user", content });
842
+ return this;
843
+ }
844
+ /**
845
+ * Add a user message with an audio attachment (Gemini only).
846
+ *
847
+ * @param textContent - Text prompt
848
+ * @param audioData - Audio data (Buffer, Uint8Array, or base64 string)
849
+ * @param mimeType - Optional MIME type (auto-detected if not provided)
850
+ *
851
+ * @example
852
+ * ```typescript
853
+ * builder.addUserWithAudio(
854
+ * "Transcribe this audio",
855
+ * await fs.readFile("recording.mp3"),
856
+ * "audio/mp3" // Optional - auto-detected
857
+ * );
858
+ * ```
859
+ */
860
+ addUserWithAudio(textContent, audioData, mimeType) {
861
+ const audioBuffer = typeof audioData === "string" ? Buffer.from(audioData, "base64") : audioData;
862
+ const content = [text(textContent), audioFromBuffer(audioBuffer, mimeType)];
863
+ this.messages.push({ role: "user", content });
864
+ return this;
865
+ }
866
+ /**
867
+ * Add a user message with multiple content parts.
868
+ * Provides full flexibility for complex multimodal messages.
869
+ *
870
+ * @param parts - Array of content parts
871
+ *
872
+ * @example
873
+ * ```typescript
874
+ * builder.addUserMultimodal([
875
+ * text("Compare these images:"),
876
+ * imageFromBuffer(image1),
877
+ * imageFromBuffer(image2),
878
+ * ]);
879
+ * ```
880
+ */
881
+ addUserMultimodal(parts) {
882
+ this.messages.push({ role: "user", content: parts });
883
+ return this;
884
+ }
599
885
  addGadgetCall(gadget, parameters, result) {
600
886
  const paramStr = this.formatBlockParameters(parameters, "");
601
887
  this.messages.push({
@@ -1914,7 +2200,7 @@ var init_conversation_manager = __esm({
1914
2200
  if (msg.role === "user") {
1915
2201
  this.historyBuilder.addUser(msg.content);
1916
2202
  } else if (msg.role === "assistant") {
1917
- this.historyBuilder.addAssistant(msg.content);
2203
+ this.historyBuilder.addAssistant(extractText(msg.content));
1918
2204
  }
1919
2205
  }
1920
2206
  }
@@ -1935,8 +2221,10 @@ async function runWithHandlers(agentGenerator, handlers) {
1935
2221
  if (handlers.onGadgetCall) {
1936
2222
  await handlers.onGadgetCall({
1937
2223
  gadgetName: event.call.gadgetName,
2224
+ invocationId: event.call.invocationId,
1938
2225
  parameters: event.call.parameters,
1939
- parametersRaw: event.call.parametersRaw
2226
+ parametersRaw: event.call.parametersRaw,
2227
+ dependencies: event.call.dependencies
1940
2228
  });
1941
2229
  }
1942
2230
  break;
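A hedged sketch of a handler consuming the new fields; only the payload shape (gadgetName, invocationId, parameters, parametersRaw, dependencies) comes from the code above, and how the agent generator is obtained is assumed.

```typescript
await runWithHandlers(agentGenerator, {
  onGadgetCall: async ({ gadgetName, invocationId, parameters, dependencies }) => {
    console.log(
      `[${invocationId}] ${gadgetName}`,
      parameters,
      dependencies.length ? `waits on ${dependencies.join(", ")}` : "runs immediately",
    );
  },
});
```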
@@ -2498,7 +2786,27 @@ var init_cost_reporting_client = __esm({
2498
2786
  constructor(client, reportCost) {
2499
2787
  this.client = client;
2500
2788
  this.reportCost = reportCost;
2789
+ this.image = {
2790
+ generate: async (options) => {
2791
+ const result = await this.client.image.generate(options);
2792
+ if (result.cost !== void 0 && result.cost > 0) {
2793
+ this.reportCost(result.cost);
2794
+ }
2795
+ return result;
2796
+ }
2797
+ };
2798
+ this.speech = {
2799
+ generate: async (options) => {
2800
+ const result = await this.client.speech.generate(options);
2801
+ if (result.cost !== void 0 && result.cost > 0) {
2802
+ this.reportCost(result.cost);
2803
+ }
2804
+ return result;
2805
+ }
2806
+ };
2501
2807
  }
2808
+ image;
2809
+ speech;
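The wrapper above forwards image.generate and speech.generate to the underlying client and reports any non-zero cost it returns. A sketch of how it might be wired up; the CostReportingClient name and the base client are assumptions, only the constructor shape and forwarding behavior are shown in this hunk.

```typescript
let spent = 0;
const tracked = new CostReportingClient(baseClient, (cost) => {
  spent += cost; // accumulate per-call spend reported by the wrapper
});

const img = await tracked.image.generate({ model: "gpt-image-1", prompt: "a red cube" });
// img.cost (when defined and > 0) has now been added to `spent`
```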
2502
2810
  /**
2503
2811
  * Access to model registry for cost estimation.
2504
2812
  */
@@ -2763,15 +3071,37 @@ var init_parser = __esm({
2763
3071
  return segment.trim().length > 0 ? segment : void 0;
2764
3072
  }
2765
3073
  /**
2766
- * Parse gadget name, handling both old format (name:invocationId) and new format (just name).
2767
- * For new format, generates a unique invocation ID.
3074
+ * Parse gadget name with optional invocation ID and dependencies.
3075
+ *
3076
+ * Supported formats:
3077
+ * - `GadgetName` - Auto-generate ID, no dependencies
3078
+ * - `GadgetName:my_id` - Explicit ID, no dependencies
3079
+ * - `GadgetName:my_id:dep1,dep2` - Explicit ID with dependencies
3080
+ *
3081
+ * Dependencies must be comma-separated invocation IDs.
2768
3082
  */
2769
3083
  parseGadgetName(gadgetName) {
2770
- if (gadgetName.includes(":")) {
2771
- const parts = gadgetName.split(":");
2772
- return { actualName: parts[0], invocationId: parts[1] };
3084
+ const parts = gadgetName.split(":");
3085
+ if (parts.length === 1) {
3086
+ return {
3087
+ actualName: parts[0],
3088
+ invocationId: `gadget_${++globalInvocationCounter}`,
3089
+ dependencies: []
3090
+ };
3091
+ } else if (parts.length === 2) {
3092
+ return {
3093
+ actualName: parts[0],
3094
+ invocationId: parts[1].trim(),
3095
+ dependencies: []
3096
+ };
3097
+ } else {
3098
+ const deps = parts[2].split(",").map((d) => d.trim()).filter((d) => d.length > 0);
3099
+ return {
3100
+ actualName: parts[0],
3101
+ invocationId: parts[1].trim(),
3102
+ dependencies: deps
3103
+ };
2773
3104
  }
2774
- return { actualName: gadgetName, invocationId: `gadget_${++globalInvocationCounter}` };
2775
3105
  }
2776
3106
  /**
2777
3107
  * Extract the error message from a parse error.
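Worked examples of the three header formats accepted above; the parser instance and the auto-generated counter value are illustrative.

```typescript
parser.parseGadgetName("fetch_data");
// -> { actualName: "fetch_data", invocationId: "gadget_1", dependencies: [] }

parser.parseGadgetName("fetch_data:fetch_1");
// -> { actualName: "fetch_data", invocationId: "fetch_1", dependencies: [] }

parser.parseGadgetName("merge_data:merge_1:fetch_1,fetch_2");
// -> { actualName: "merge_data", invocationId: "merge_1", dependencies: ["fetch_1", "fetch_2"] }
```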
@@ -2807,39 +3137,20 @@ var init_parser = __esm({
2807
3137
  const metadataEndIndex = this.buffer.indexOf("\n", metadataStartIndex);
2808
3138
  if (metadataEndIndex === -1) break;
2809
3139
  const gadgetName = this.buffer.substring(metadataStartIndex, metadataEndIndex).trim();
2810
- const { actualName: actualGadgetName, invocationId } = this.parseGadgetName(gadgetName);
3140
+ const { actualName: actualGadgetName, invocationId, dependencies } = this.parseGadgetName(gadgetName);
2811
3141
  const contentStartIndex = metadataEndIndex + 1;
2812
3142
  let partEndIndex;
2813
3143
  let endMarkerLength = 0;
2814
- if (gadgetName.includes(":")) {
2815
- const oldEndMarker = `${this.endPrefix + actualGadgetName}:${invocationId}`;
2816
- partEndIndex = this.buffer.indexOf(oldEndMarker, contentStartIndex);
2817
- if (partEndIndex === -1) break;
2818
- endMarkerLength = oldEndMarker.length;
3144
+ const nextStartPos = this.buffer.indexOf(this.startPrefix, contentStartIndex);
3145
+ const endPos = this.buffer.indexOf(this.endPrefix, contentStartIndex);
3146
+ if (nextStartPos !== -1 && (endPos === -1 || nextStartPos < endPos)) {
3147
+ partEndIndex = nextStartPos;
3148
+ endMarkerLength = 0;
3149
+ } else if (endPos !== -1) {
3150
+ partEndIndex = endPos;
3151
+ endMarkerLength = this.endPrefix.length;
2819
3152
  } else {
2820
- const nextStartPos = this.buffer.indexOf(this.startPrefix, contentStartIndex);
2821
- let validEndPos = -1;
2822
- let searchPos = contentStartIndex;
2823
- while (true) {
2824
- const endPos = this.buffer.indexOf(this.endPrefix, searchPos);
2825
- if (endPos === -1) break;
2826
- const afterEnd = this.buffer.substring(endPos + this.endPrefix.length);
2827
- if (afterEnd.startsWith("\n") || afterEnd.startsWith("\r") || afterEnd.startsWith(this.startPrefix) || afterEnd.length === 0) {
2828
- validEndPos = endPos;
2829
- break;
2830
- } else {
2831
- searchPos = endPos + this.endPrefix.length;
2832
- }
2833
- }
2834
- if (nextStartPos !== -1 && (validEndPos === -1 || nextStartPos < validEndPos)) {
2835
- partEndIndex = nextStartPos;
2836
- endMarkerLength = 0;
2837
- } else if (validEndPos !== -1) {
2838
- partEndIndex = validEndPos;
2839
- endMarkerLength = this.endPrefix.length;
2840
- } else {
2841
- break;
2842
- }
3153
+ break;
2843
3154
  }
2844
3155
  const parametersRaw = this.buffer.substring(contentStartIndex, partEndIndex).trim();
2845
3156
  const { parameters, parseError } = this.parseParameters(parametersRaw);
@@ -2850,7 +3161,8 @@ var init_parser = __esm({
2850
3161
  invocationId,
2851
3162
  parametersRaw,
2852
3163
  parameters,
2853
- parseError
3164
+ parseError,
3165
+ dependencies
2854
3166
  }
2855
3167
  };
2856
3168
  startIndex = partEndIndex + endMarkerLength;
@@ -2873,7 +3185,7 @@ var init_parser = __esm({
2873
3185
  const metadataEndIndex = this.buffer.indexOf("\n", metadataStartIndex);
2874
3186
  if (metadataEndIndex !== -1) {
2875
3187
  const gadgetName = this.buffer.substring(metadataStartIndex, metadataEndIndex).trim();
2876
- const { actualName: actualGadgetName, invocationId } = this.parseGadgetName(gadgetName);
3188
+ const { actualName: actualGadgetName, invocationId, dependencies } = this.parseGadgetName(gadgetName);
2877
3189
  const contentStartIndex = metadataEndIndex + 1;
2878
3190
  const parametersRaw = this.buffer.substring(contentStartIndex).trim();
2879
3191
  const { parameters, parseError } = this.parseParameters(parametersRaw);
@@ -2884,7 +3196,8 @@ var init_parser = __esm({
2884
3196
  invocationId,
2885
3197
  parametersRaw,
2886
3198
  parameters,
2887
- parseError
3199
+ parseError,
3200
+ dependencies
2888
3201
  }
2889
3202
  };
2890
3203
  return;
@@ -3254,6 +3567,13 @@ var init_stream_processor = __esm({
3254
3567
  accumulatedText = "";
3255
3568
  shouldStopExecution = false;
3256
3569
  observerFailureCount = 0;
3570
+ // Dependency tracking for gadget execution DAG
3571
+ /** Gadgets waiting for their dependencies to complete */
3572
+ pendingGadgets = /* @__PURE__ */ new Map();
3573
+ /** Completed gadget results, keyed by invocation ID */
3574
+ completedResults = /* @__PURE__ */ new Map();
3575
+ /** Invocation IDs of gadgets that have failed (error or skipped due to dependency) */
3576
+ failedInvocations = /* @__PURE__ */ new Set();
3257
3577
  constructor(options) {
3258
3578
  this.iteration = options.iteration;
3259
3579
  this.registry = options.registry;
@@ -3354,6 +3674,16 @@ var init_stream_processor = __esm({
3354
3674
  }
3355
3675
  }
3356
3676
  }
3677
+ const finalPendingEvents = await this.processPendingGadgets();
3678
+ outputs.push(...finalPendingEvents);
3679
+ if (finalPendingEvents.some((e) => e.type === "gadget_result")) {
3680
+ didExecuteGadgets = true;
3681
+ }
3682
+ for (const evt of finalPendingEvents) {
3683
+ if (evt.type === "gadget_result" && evt.result.breaksLoop) {
3684
+ shouldBreakLoop = true;
3685
+ }
3686
+ }
3357
3687
  }
3358
3688
  let finalMessage = this.accumulatedText;
3359
3689
  if (this.hooks.interceptors?.interceptAssistantMessage) {
@@ -3405,7 +3735,11 @@ var init_stream_processor = __esm({
3405
3735
  return [{ type: "text", content }];
3406
3736
  }
3407
3737
  /**
3408
- * Process a gadget call through the full lifecycle.
3738
+ * Process a gadget call through the full lifecycle, handling dependencies.
3739
+ *
3740
+ * Gadgets without dependencies (or with all dependencies satisfied) execute immediately.
3741
+ * Gadgets with unsatisfied dependencies are queued for later execution.
3742
+ * After each execution, pending gadgets are checked to see if they can now run.
3409
3743
  */
3410
3744
  async processGadgetCall(call) {
3411
3745
  if (this.shouldStopExecution) {
@@ -3416,6 +3750,53 @@ var init_stream_processor = __esm({
3416
3750
  }
3417
3751
  const events = [];
3418
3752
  events.push({ type: "gadget_call", call });
3753
+ if (call.dependencies.length > 0) {
3754
+ if (call.dependencies.includes(call.invocationId)) {
3755
+ this.logger.warn("Gadget has self-referential dependency (depends on itself)", {
3756
+ gadgetName: call.gadgetName,
3757
+ invocationId: call.invocationId
3758
+ });
3759
+ this.failedInvocations.add(call.invocationId);
3760
+ const skipEvent = {
3761
+ type: "gadget_skipped",
3762
+ gadgetName: call.gadgetName,
3763
+ invocationId: call.invocationId,
3764
+ parameters: call.parameters ?? {},
3765
+ failedDependency: call.invocationId,
3766
+ failedDependencyError: `Gadget "${call.invocationId}" cannot depend on itself (self-referential dependency)`
3767
+ };
3768
+ events.push(skipEvent);
3769
+ return events;
3770
+ }
3771
+ const failedDep = call.dependencies.find((dep) => this.failedInvocations.has(dep));
3772
+ if (failedDep) {
3773
+ const skipEvents = await this.handleFailedDependency(call, failedDep);
3774
+ events.push(...skipEvents);
3775
+ return events;
3776
+ }
3777
+ const unsatisfied = call.dependencies.filter((dep) => !this.completedResults.has(dep));
3778
+ if (unsatisfied.length > 0) {
3779
+ this.logger.debug("Queueing gadget for later - waiting on dependencies", {
3780
+ gadgetName: call.gadgetName,
3781
+ invocationId: call.invocationId,
3782
+ waitingOn: unsatisfied
3783
+ });
3784
+ this.pendingGadgets.set(call.invocationId, call);
3785
+ return events;
3786
+ }
3787
+ }
3788
+ const executeEvents = await this.executeGadgetWithHooks(call);
3789
+ events.push(...executeEvents);
3790
+ const triggeredEvents = await this.processPendingGadgets();
3791
+ events.push(...triggeredEvents);
3792
+ return events;
3793
+ }
3794
+ /**
3795
+ * Execute a gadget through the full hook lifecycle.
3796
+ * This is the core execution logic, extracted from processGadgetCall.
3797
+ */
3798
+ async executeGadgetWithHooks(call) {
3799
+ const events = [];
3419
3800
  if (call.parseError) {
3420
3801
  this.logger.warn("Gadget has parse error", {
3421
3802
  gadgetName: call.gadgetName,
@@ -3546,6 +3927,10 @@ var init_stream_processor = __esm({
3546
3927
  });
3547
3928
  }
3548
3929
  await this.runObserversInParallel(completeObservers);
3930
+ this.completedResults.set(result.invocationId, result);
3931
+ if (result.error) {
3932
+ this.failedInvocations.add(result.invocationId);
3933
+ }
3549
3934
  events.push({ type: "gadget_result", result });
3550
3935
  if (result.error) {
3551
3936
  const errorType = this.determineErrorType(call, result);
@@ -3561,6 +3946,162 @@ var init_stream_processor = __esm({
3561
3946
  }
3562
3947
  return events;
3563
3948
  }
3949
+ /**
3950
+ * Handle a gadget that cannot execute because a dependency failed.
3951
+ * Calls the onDependencySkipped controller to allow customization.
3952
+ */
3953
+ async handleFailedDependency(call, failedDep) {
3954
+ const events = [];
3955
+ const depResult = this.completedResults.get(failedDep);
3956
+ const depError = depResult?.error ?? "Dependency failed";
3957
+ let action = { action: "skip" };
3958
+ if (this.hooks.controllers?.onDependencySkipped) {
3959
+ const context = {
3960
+ iteration: this.iteration,
3961
+ gadgetName: call.gadgetName,
3962
+ invocationId: call.invocationId,
3963
+ parameters: call.parameters ?? {},
3964
+ failedDependency: failedDep,
3965
+ failedDependencyError: depError,
3966
+ logger: this.logger
3967
+ };
3968
+ action = await this.hooks.controllers.onDependencySkipped(context);
3969
+ }
3970
+ if (action.action === "skip") {
3971
+ this.failedInvocations.add(call.invocationId);
3972
+ const skipEvent = {
3973
+ type: "gadget_skipped",
3974
+ gadgetName: call.gadgetName,
3975
+ invocationId: call.invocationId,
3976
+ parameters: call.parameters ?? {},
3977
+ failedDependency: failedDep,
3978
+ failedDependencyError: depError
3979
+ };
3980
+ events.push(skipEvent);
3981
+ if (this.hooks.observers?.onGadgetSkipped) {
3982
+ const observeContext = {
3983
+ iteration: this.iteration,
3984
+ gadgetName: call.gadgetName,
3985
+ invocationId: call.invocationId,
3986
+ parameters: call.parameters ?? {},
3987
+ failedDependency: failedDep,
3988
+ failedDependencyError: depError,
3989
+ logger: this.logger
3990
+ };
3991
+ await this.safeObserve(() => this.hooks.observers.onGadgetSkipped(observeContext));
3992
+ }
3993
+ this.logger.info("Gadget skipped due to failed dependency", {
3994
+ gadgetName: call.gadgetName,
3995
+ invocationId: call.invocationId,
3996
+ failedDependency: failedDep
3997
+ });
3998
+ } else if (action.action === "execute_anyway") {
3999
+ this.logger.info("Executing gadget despite failed dependency (controller override)", {
4000
+ gadgetName: call.gadgetName,
4001
+ invocationId: call.invocationId,
4002
+ failedDependency: failedDep
4003
+ });
4004
+ const executeEvents = await this.executeGadgetWithHooks(call);
4005
+ events.push(...executeEvents);
4006
+ } else if (action.action === "use_fallback") {
4007
+ const fallbackResult = {
4008
+ gadgetName: call.gadgetName,
4009
+ invocationId: call.invocationId,
4010
+ parameters: call.parameters ?? {},
4011
+ result: action.fallbackResult,
4012
+ executionTimeMs: 0
4013
+ };
4014
+ this.completedResults.set(call.invocationId, fallbackResult);
4015
+ events.push({ type: "gadget_result", result: fallbackResult });
4016
+ this.logger.info("Using fallback result for gadget with failed dependency", {
4017
+ gadgetName: call.gadgetName,
4018
+ invocationId: call.invocationId,
4019
+ failedDependency: failedDep
4020
+ });
4021
+ }
4022
+ return events;
4023
+ }
4024
+ /**
4025
+ * Process pending gadgets whose dependencies are now satisfied.
4026
+ * Executes ready gadgets in parallel and continues until no more can be triggered.
4027
+ */
4028
+ async processPendingGadgets() {
4029
+ const events = [];
4030
+ let progress = true;
4031
+ while (progress && this.pendingGadgets.size > 0) {
4032
+ progress = false;
4033
+ const readyToExecute = [];
4034
+ const readyToSkip = [];
4035
+ for (const [invocationId, call] of this.pendingGadgets) {
4036
+ const failedDep = call.dependencies.find((dep) => this.failedInvocations.has(dep));
4037
+ if (failedDep) {
4038
+ readyToSkip.push({ call, failedDep });
4039
+ continue;
4040
+ }
4041
+ const allSatisfied = call.dependencies.every((dep) => this.completedResults.has(dep));
4042
+ if (allSatisfied) {
4043
+ readyToExecute.push(call);
4044
+ }
4045
+ }
4046
+ for (const { call, failedDep } of readyToSkip) {
4047
+ this.pendingGadgets.delete(call.invocationId);
4048
+ const skipEvents = await this.handleFailedDependency(call, failedDep);
4049
+ events.push(...skipEvents);
4050
+ progress = true;
4051
+ }
4052
+ if (readyToExecute.length > 0) {
4053
+ this.logger.debug("Executing ready gadgets in parallel", {
4054
+ count: readyToExecute.length,
4055
+ invocationIds: readyToExecute.map((c) => c.invocationId)
4056
+ });
4057
+ for (const call of readyToExecute) {
4058
+ this.pendingGadgets.delete(call.invocationId);
4059
+ }
4060
+ const executePromises = readyToExecute.map((call) => this.executeGadgetWithHooks(call));
4061
+ const results = await Promise.all(executePromises);
4062
+ for (const executeEvents of results) {
4063
+ events.push(...executeEvents);
4064
+ }
4065
+ progress = true;
4066
+ }
4067
+ }
4068
+ if (this.pendingGadgets.size > 0) {
4069
+ const pendingIds = new Set(this.pendingGadgets.keys());
4070
+ for (const [invocationId, call] of this.pendingGadgets) {
4071
+ const missingDeps = call.dependencies.filter((dep) => !this.completedResults.has(dep));
4072
+ const circularDeps = missingDeps.filter((dep) => pendingIds.has(dep));
4073
+ const trulyMissingDeps = missingDeps.filter((dep) => !pendingIds.has(dep));
4074
+ let errorMessage;
4075
+ let logLevel = "warn";
4076
+ if (circularDeps.length > 0 && trulyMissingDeps.length > 0) {
4077
+ errorMessage = `Dependencies unresolvable: circular=[${circularDeps.join(", ")}], missing=[${trulyMissingDeps.join(", ")}]`;
4078
+ logLevel = "error";
4079
+ } else if (circularDeps.length > 0) {
4080
+ errorMessage = `Circular dependency detected: "${invocationId}" depends on "${circularDeps[0]}" which also depends on "${invocationId}" (directly or indirectly)`;
4081
+ } else {
4082
+ errorMessage = `Dependency "${missingDeps[0]}" was never executed - check that the invocation ID exists and is spelled correctly`;
4083
+ }
4084
+ this.logger[logLevel]("Gadget has unresolvable dependencies", {
4085
+ gadgetName: call.gadgetName,
4086
+ invocationId,
4087
+ circularDependencies: circularDeps,
4088
+ missingDependencies: trulyMissingDeps
4089
+ });
4090
+ this.failedInvocations.add(invocationId);
4091
+ const skipEvent = {
4092
+ type: "gadget_skipped",
4093
+ gadgetName: call.gadgetName,
4094
+ invocationId,
4095
+ parameters: call.parameters ?? {},
4096
+ failedDependency: missingDeps[0],
4097
+ failedDependencyError: errorMessage
4098
+ };
4099
+ events.push(skipEvent);
4100
+ }
4101
+ this.pendingGadgets.clear();
4102
+ }
4103
+ return events;
4104
+ }
3564
4105
  /**
3565
4106
  * Safely execute an observer, catching and logging any errors.
3566
4107
  * Observers are non-critical, so errors are logged but don't crash the system.
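A sketch of the hooks this dependency machinery consults, based on the context and action shapes used above; how the hooks object is registered with the agent is not shown in this diff and is assumed.

```typescript
const hooks = {
  controllers: {
    // Called when a gadget's dependency has failed. Return one of:
    // { action: "skip" } (default), { action: "execute_anyway" },
    // or { action: "use_fallback", fallbackResult: ... }.
    onDependencySkipped: async (ctx) => {
      if (ctx.gadgetName === "merge_data") {
        return { action: "use_fallback", fallbackResult: "[]" };
      }
      return { action: "skip" };
    },
  },
  observers: {
    // Fired for every gadget skipped because of a failed or unresolvable dependency.
    onGadgetSkipped: async (ctx) => {
      console.warn(`${ctx.invocationId} skipped: ${ctx.failedDependencyError}`);
    },
  },
};
```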
@@ -3998,9 +4539,9 @@ var init_agent = __esm({
3998
4539
  if (msg.role === "user") {
3999
4540
  this.conversation.addUserMessage(msg.content);
4000
4541
  } else if (msg.role === "assistant") {
4001
- this.conversation.addAssistantMessage(msg.content);
4542
+ this.conversation.addAssistantMessage(extractText(msg.content));
4002
4543
  } else if (msg.role === "system") {
4003
- this.conversation.addUserMessage(`[System] ${msg.content}`);
4544
+ this.conversation.addUserMessage(`[System] ${extractText(msg.content)}`);
4004
4545
  }
4005
4546
  }
4006
4547
  }
@@ -4579,6 +5120,7 @@ var init_anthropic = __esm({
4579
5120
  "src/providers/anthropic.ts"() {
4580
5121
  "use strict";
4581
5122
  import_sdk = __toESM(require("@anthropic-ai/sdk"), 1);
5123
+ init_messages();
4582
5124
  init_anthropic_models();
4583
5125
  init_base_provider();
4584
5126
  init_constants2();
@@ -4591,11 +5133,33 @@ var init_anthropic = __esm({
4591
5133
  getModelSpecs() {
4592
5134
  return ANTHROPIC_MODELS;
4593
5135
  }
5136
+ // =========================================================================
5137
+ // Image Generation (Not Supported)
5138
+ // =========================================================================
5139
+ supportsImageGeneration(_modelId) {
5140
+ return false;
5141
+ }
5142
+ async generateImage() {
5143
+ throw new Error(
5144
+ "Anthropic does not support image generation. Use OpenAI (DALL-E, GPT Image) or Google Gemini (Imagen) instead."
5145
+ );
5146
+ }
5147
+ // =========================================================================
5148
+ // Speech Generation (Not Supported)
5149
+ // =========================================================================
5150
+ supportsSpeechGeneration(_modelId) {
5151
+ return false;
5152
+ }
5153
+ async generateSpeech() {
5154
+ throw new Error(
5155
+ "Anthropic does not support speech generation. Use OpenAI (TTS) or Google Gemini (TTS) instead."
5156
+ );
5157
+ }
4594
5158
  buildRequestPayload(options, descriptor, spec, messages) {
4595
5159
  const systemMessages = messages.filter((message) => message.role === "system");
4596
5160
  const system = systemMessages.length > 0 ? systemMessages.map((m, index) => ({
4597
5161
  type: "text",
4598
- text: m.content,
5162
+ text: extractText(m.content),
4599
5163
  // Add cache_control to the LAST system message block
4600
5164
  ...index === systemMessages.length - 1 ? { cache_control: { type: "ephemeral" } } : {}
4601
5165
  })) : void 0;
@@ -4608,14 +5172,10 @@ var init_anthropic = __esm({
4608
5172
  );
4609
5173
  const conversation = nonSystemMessages.map((message, index) => ({
4610
5174
  role: message.role,
4611
- content: [
4612
- {
4613
- type: "text",
4614
- text: message.content,
4615
- // Add cache_control to the LAST user message
4616
- ...message.role === "user" && index === lastUserIndex ? { cache_control: { type: "ephemeral" } } : {}
4617
- }
4618
- ]
5175
+ content: this.convertToAnthropicContent(
5176
+ message.content,
5177
+ message.role === "user" && index === lastUserIndex
5178
+ )
4619
5179
  }));
4620
5180
  const defaultMaxTokens = spec?.maxOutputTokens ?? ANTHROPIC_DEFAULT_MAX_OUTPUT_TOKENS;
4621
5181
  const payload = {
@@ -4631,15 +5191,61 @@ var init_anthropic = __esm({
4631
5191
  };
4632
5192
  return payload;
4633
5193
  }
4634
- async executeStreamRequest(payload, signal) {
4635
- const client = this.client;
4636
- const stream2 = await client.messages.create(payload, signal ? { signal } : void 0);
4637
- return stream2;
5194
+ /**
5195
+ * Convert llmist content to Anthropic's content block format.
5196
+ * Handles text, images (base64 only), and applies cache_control.
5197
+ */
5198
+ convertToAnthropicContent(content, addCacheControl) {
5199
+ const parts = normalizeContent(content);
5200
+ return parts.map((part, index) => {
5201
+ const isLastPart = index === parts.length - 1;
5202
+ const cacheControl = addCacheControl && isLastPart ? { cache_control: { type: "ephemeral" } } : {};
5203
+ if (part.type === "text") {
5204
+ return {
5205
+ type: "text",
5206
+ text: part.text,
5207
+ ...cacheControl
5208
+ };
5209
+ }
5210
+ if (part.type === "image") {
5211
+ return this.convertImagePart(part, cacheControl);
5212
+ }
5213
+ if (part.type === "audio") {
5214
+ throw new Error(
5215
+ "Anthropic does not support audio input. Use Google Gemini for audio processing."
5216
+ );
5217
+ }
5218
+ throw new Error(`Unsupported content type: ${part.type}`);
5219
+ });
4638
5220
  }
4639
- async *wrapStream(iterable) {
4640
- const stream2 = iterable;
4641
- let inputTokens = 0;
4642
- let cachedInputTokens = 0;
5221
+ /**
5222
+ * Convert an image content part to Anthropic's image block format.
5223
+ */
5224
+ convertImagePart(part, cacheControl) {
5225
+ if (part.source.type === "url") {
5226
+ throw new Error(
5227
+ "Anthropic does not support image URLs. Please provide base64-encoded image data instead."
5228
+ );
5229
+ }
5230
+ return {
5231
+ type: "image",
5232
+ source: {
5233
+ type: "base64",
5234
+ media_type: part.source.mediaType,
5235
+ data: part.source.data
5236
+ },
5237
+ ...cacheControl
5238
+ };
5239
+ }
5240
+ async executeStreamRequest(payload, signal) {
5241
+ const client = this.client;
5242
+ const stream2 = await client.messages.create(payload, signal ? { signal } : void 0);
5243
+ return stream2;
5244
+ }
5245
+ async *wrapStream(iterable) {
5246
+ const stream2 = iterable;
5247
+ let inputTokens = 0;
5248
+ let cachedInputTokens = 0;
4643
5249
  let cacheCreationInputTokens = 0;
4644
5250
  for await (const event of stream2) {
4645
5251
  if (event.type === "message_start") {
@@ -4713,17 +5319,12 @@ var init_anthropic = __esm({
4713
5319
  async countTokens(messages, descriptor, _spec) {
4714
5320
  const client = this.client;
4715
5321
  const systemMessages = messages.filter((message) => message.role === "system");
4716
- const system = systemMessages.length > 0 ? systemMessages.map((m) => m.content).join("\n\n") : void 0;
5322
+ const system = systemMessages.length > 0 ? systemMessages.map((m) => extractText(m.content)).join("\n\n") : void 0;
4717
5323
  const conversation = messages.filter(
4718
5324
  (message) => message.role !== "system"
4719
5325
  ).map((message) => ({
4720
5326
  role: message.role,
4721
- content: [
4722
- {
4723
- type: "text",
4724
- text: message.content
4725
- }
4726
- ]
5327
+ content: this.convertToAnthropicContent(message.content, false)
4727
5328
  }));
4728
5329
  try {
4729
5330
  const response = await client.messages.countTokens({
@@ -4737,14 +5338,201 @@ var init_anthropic = __esm({
4737
5338
  `Token counting failed for ${descriptor.name}, using fallback estimation:`,
4738
5339
  error
4739
5340
  );
4740
- const totalChars = messages.reduce((sum, msg) => sum + (msg.content?.length ?? 0), 0);
4741
- return Math.ceil(totalChars / FALLBACK_CHARS_PER_TOKEN);
5341
+ let totalChars = 0;
5342
+ let imageCount = 0;
5343
+ for (const msg of messages) {
5344
+ const parts = normalizeContent(msg.content);
5345
+ for (const part of parts) {
5346
+ if (part.type === "text") {
5347
+ totalChars += part.text.length;
5348
+ } else if (part.type === "image") {
5349
+ imageCount++;
5350
+ }
5351
+ }
5352
+ }
5353
+ return Math.ceil(totalChars / FALLBACK_CHARS_PER_TOKEN) + imageCount * 1e3;
4742
5354
  }
4743
5355
  }
4744
5356
  };
4745
5357
  }
4746
5358
  });
4747
5359
 
5360
+ // src/providers/gemini-image-models.ts
5361
+ function getGeminiImageModelSpec(modelId) {
5362
+ return geminiImageModels.find((m) => m.modelId === modelId);
5363
+ }
5364
+ function isGeminiImageModel(modelId) {
5365
+ return geminiImageModels.some((m) => m.modelId === modelId);
5366
+ }
5367
+ function calculateGeminiImageCost(modelId, size = "1:1", n = 1) {
5368
+ const spec = getGeminiImageModelSpec(modelId);
5369
+ if (!spec) return void 0;
5370
+ if (spec.pricing.perImage !== void 0) {
5371
+ return spec.pricing.perImage * n;
5372
+ }
5373
+ if (spec.pricing.bySize) {
5374
+ const sizePrice = spec.pricing.bySize[size];
5375
+ if (typeof sizePrice === "number") {
5376
+ return sizePrice * n;
5377
+ }
5378
+ }
5379
+ return void 0;
5380
+ }
5381
+ var IMAGEN4_ASPECT_RATIOS, GEMINI_IMAGE_ASPECT_RATIOS, geminiImageModels;
5382
+ var init_gemini_image_models = __esm({
5383
+ "src/providers/gemini-image-models.ts"() {
5384
+ "use strict";
5385
+ IMAGEN4_ASPECT_RATIOS = ["1:1", "3:4", "4:3", "9:16", "16:9"];
5386
+ GEMINI_IMAGE_ASPECT_RATIOS = ["1:1", "3:4", "4:3", "9:16", "16:9"];
5387
+ geminiImageModels = [
5388
+ // Imagen 4 Family (standalone image generation)
5389
+ {
5390
+ provider: "gemini",
5391
+ modelId: "imagen-4.0-fast-generate-001",
5392
+ displayName: "Imagen 4 Fast",
5393
+ pricing: {
5394
+ perImage: 0.02
5395
+ },
5396
+ supportedSizes: [...IMAGEN4_ASPECT_RATIOS],
5397
+ maxImages: 4,
5398
+ defaultSize: "1:1",
5399
+ features: {
5400
+ textRendering: true
5401
+ }
5402
+ },
5403
+ {
5404
+ provider: "gemini",
5405
+ modelId: "imagen-4.0-generate-001",
5406
+ displayName: "Imagen 4",
5407
+ pricing: {
5408
+ perImage: 0.04
5409
+ },
5410
+ supportedSizes: [...IMAGEN4_ASPECT_RATIOS],
5411
+ maxImages: 4,
5412
+ defaultSize: "1:1",
5413
+ features: {
5414
+ textRendering: true
5415
+ }
5416
+ },
5417
+ {
5418
+ provider: "gemini",
5419
+ modelId: "imagen-4.0-ultra-generate-001",
5420
+ displayName: "Imagen 4 Ultra",
5421
+ pricing: {
5422
+ perImage: 0.06
5423
+ },
5424
+ supportedSizes: [...IMAGEN4_ASPECT_RATIOS],
5425
+ maxImages: 4,
5426
+ defaultSize: "1:1",
5427
+ features: {
5428
+ textRendering: true
5429
+ }
5430
+ },
5431
+ // Preview versions
5432
+ {
5433
+ provider: "gemini",
5434
+ modelId: "imagen-4.0-generate-preview-06-06",
5435
+ displayName: "Imagen 4 (Preview)",
5436
+ pricing: {
5437
+ perImage: 0.04
5438
+ },
5439
+ supportedSizes: [...IMAGEN4_ASPECT_RATIOS],
5440
+ maxImages: 4,
5441
+ defaultSize: "1:1",
5442
+ features: {
5443
+ textRendering: true
5444
+ }
5445
+ },
5446
+ {
5447
+ provider: "gemini",
5448
+ modelId: "imagen-4.0-ultra-generate-preview-06-06",
5449
+ displayName: "Imagen 4 Ultra (Preview)",
5450
+ pricing: {
5451
+ perImage: 0.06
5452
+ },
5453
+ supportedSizes: [...IMAGEN4_ASPECT_RATIOS],
5454
+ maxImages: 4,
5455
+ defaultSize: "1:1",
5456
+ features: {
5457
+ textRendering: true
5458
+ }
5459
+ },
5460
+ // Gemini Native Image Generation (multimodal models)
5461
+ {
5462
+ provider: "gemini",
5463
+ modelId: "gemini-2.5-flash-image",
5464
+ displayName: "Gemini 2.5 Flash Image",
5465
+ pricing: {
5466
+ perImage: 0.039
5467
+ },
5468
+ supportedSizes: [...GEMINI_IMAGE_ASPECT_RATIOS],
5469
+ maxImages: 1,
5470
+ defaultSize: "1:1",
5471
+ features: {
5472
+ conversational: true,
5473
+ textRendering: true
5474
+ }
5475
+ },
5476
+ {
5477
+ provider: "gemini",
5478
+ modelId: "gemini-2.5-flash-image-preview",
5479
+ displayName: "Gemini 2.5 Flash Image (Preview)",
5480
+ pricing: {
5481
+ perImage: 0.039
5482
+ },
5483
+ supportedSizes: [...GEMINI_IMAGE_ASPECT_RATIOS],
5484
+ maxImages: 1,
5485
+ defaultSize: "1:1",
5486
+ features: {
5487
+ conversational: true,
5488
+ textRendering: true
5489
+ }
5490
+ },
5491
+ {
5492
+ provider: "gemini",
5493
+ modelId: "gemini-3-pro-image-preview",
5494
+ displayName: "Gemini 3 Pro Image (Preview)",
5495
+ pricing: {
5496
+ // Token-based: ~$0.134 per 1K/2K image, $0.24 per 4K
5497
+ // Using 2K as default
5498
+ bySize: {
5499
+ "1K": 0.134,
5500
+ "2K": 0.134,
5501
+ "4K": 0.24
5502
+ }
5503
+ },
5504
+ supportedSizes: ["1K", "2K", "4K"],
5505
+ maxImages: 1,
5506
+ defaultSize: "2K",
5507
+ features: {
5508
+ conversational: true,
5509
+ textRendering: true
5510
+ }
5511
+ },
5512
+ // Alias: nano-banana-pro-preview is gemini-3-pro-image-preview
5513
+ {
5514
+ provider: "gemini",
5515
+ modelId: "nano-banana-pro-preview",
5516
+ displayName: "Nano Banana Pro (Gemini 3 Pro Image)",
5517
+ pricing: {
5518
+ bySize: {
5519
+ "1K": 0.134,
5520
+ "2K": 0.134,
5521
+ "4K": 0.24
5522
+ }
5523
+ },
5524
+ supportedSizes: ["1K", "2K", "4K"],
5525
+ maxImages: 1,
5526
+ defaultSize: "2K",
5527
+ features: {
5528
+ conversational: true,
5529
+ textRendering: true
5530
+ }
5531
+ }
5532
+ ];
5533
+ }
5534
+ });
5535
+
4748
5536
  // src/providers/gemini-models.ts
4749
5537
  var GEMINI_MODELS;
4750
5538
  var init_gemini_models = __esm({
@@ -4918,7 +5706,171 @@ var init_gemini_models = __esm({
4918
5706
  }
4919
5707
  });
4920
5708
 
5709
+ // src/providers/gemini-speech-models.ts
5710
+ function getGeminiSpeechModelSpec(modelId) {
5711
+ return geminiSpeechModels.find((m) => m.modelId === modelId);
5712
+ }
5713
+ function isGeminiSpeechModel(modelId) {
5714
+ return geminiSpeechModels.some((m) => m.modelId === modelId);
5715
+ }
5716
+ function calculateGeminiSpeechCost(modelId, characterCount, estimatedMinutes) {
5717
+ const spec = getGeminiSpeechModelSpec(modelId);
5718
+ if (!spec) return void 0;
5719
+ if (spec.pricing.perMinute !== void 0) {
5720
+ if (estimatedMinutes !== void 0) {
5721
+ return estimatedMinutes * spec.pricing.perMinute;
5722
+ }
5723
+ const approxMinutes = characterCount / 750;
5724
+ return approxMinutes * spec.pricing.perMinute;
5725
+ }
5726
+ return void 0;
5727
+ }
5728
+ var GEMINI_TTS_VOICES, GEMINI_TTS_FORMATS, geminiSpeechModels;
5729
+ var init_gemini_speech_models = __esm({
5730
+ "src/providers/gemini-speech-models.ts"() {
5731
+ "use strict";
5732
+ GEMINI_TTS_VOICES = [
5733
+ "Zephyr",
5734
+ // Bright
5735
+ "Puck",
5736
+ // Upbeat
5737
+ "Charon",
5738
+ // Informative
5739
+ "Kore",
5740
+ // Firm
5741
+ "Fenrir",
5742
+ // Excitable
5743
+ "Leda",
5744
+ // Youthful
5745
+ "Orus",
5746
+ // Firm
5747
+ "Aoede",
5748
+ // Breezy
5749
+ "Callirrhoe",
5750
+ // Easy-going
5751
+ "Autonoe",
5752
+ // Bright
5753
+ "Enceladus",
5754
+ // Breathy
5755
+ "Iapetus",
5756
+ // Clear
5757
+ "Umbriel",
5758
+ // Easy-going
5759
+ "Algieba",
5760
+ // Smooth
5761
+ "Despina",
5762
+ // Smooth
5763
+ "Erinome",
5764
+ // Clear
5765
+ "Algenib",
5766
+ // Gravelly
5767
+ "Rasalgethi",
5768
+ // Informative
5769
+ "Laomedeia",
5770
+ // Upbeat
5771
+ "Achernar",
5772
+ // Soft
5773
+ "Alnilam",
5774
+ // Firm
5775
+ "Schedar",
5776
+ // Even
5777
+ "Gacrux",
5778
+ // Mature
5779
+ "Pulcherrima",
5780
+ // Forward
5781
+ "Achird",
5782
+ // Friendly
5783
+ "Zubenelgenubi",
5784
+ // Casual
5785
+ "Vindemiatrix",
5786
+ // Gentle
5787
+ "Sadachbia",
5788
+ // Lively
5789
+ "Sadaltager",
5790
+ // Knowledgeable
5791
+ "Sulafat"
5792
+ // Warm
5793
+ ];
5794
+ GEMINI_TTS_FORMATS = ["pcm", "wav"];
5795
+ geminiSpeechModels = [
5796
+ {
5797
+ provider: "gemini",
5798
+ modelId: "gemini-2.5-flash-preview-tts",
5799
+ displayName: "Gemini 2.5 Flash TTS (Preview)",
5800
+ pricing: {
5801
+ // $0.50 per 1M input tokens = $0.0000005 per token
5802
+ perInputToken: 5e-7,
5803
+ // $10.00 per 1M audio output tokens = $0.00001 per token
5804
+ perAudioOutputToken: 1e-5,
5805
+ // Rough estimate: ~$0.01 per minute of audio
5806
+ perMinute: 0.01
5807
+ },
5808
+ voices: [...GEMINI_TTS_VOICES],
5809
+ formats: GEMINI_TTS_FORMATS,
5810
+ maxInputLength: 8e3,
5811
+ // bytes (text + prompt combined)
5812
+ defaultVoice: "Zephyr",
5813
+ defaultFormat: "wav",
5814
+ features: {
5815
+ multiSpeaker: true,
5816
+ languages: 24,
5817
+ voiceInstructions: true
5818
+ }
5819
+ },
5820
+ {
5821
+ provider: "gemini",
5822
+ modelId: "gemini-2.5-pro-preview-tts",
5823
+ displayName: "Gemini 2.5 Pro TTS (Preview)",
5824
+ pricing: {
5825
+ // $1.00 per 1M input tokens = $0.000001 per token
5826
+ perInputToken: 1e-6,
5827
+ // $20.00 per 1M audio output tokens = $0.00002 per token
5828
+ perAudioOutputToken: 2e-5,
5829
+ // Rough estimate: ~$0.02 per minute of audio
5830
+ perMinute: 0.02
5831
+ },
5832
+ voices: [...GEMINI_TTS_VOICES],
5833
+ formats: GEMINI_TTS_FORMATS,
5834
+ maxInputLength: 8e3,
5835
+ // bytes
5836
+ defaultVoice: "Zephyr",
5837
+ defaultFormat: "wav",
5838
+ features: {
5839
+ multiSpeaker: true,
5840
+ languages: 24,
5841
+ voiceInstructions: true
5842
+ }
5843
+ }
5844
+ ];
5845
+ }
5846
+ });
5847
+
4921
5848
  // src/providers/gemini.ts
5849
+ function wrapPcmInWav(pcmData, sampleRate, bitsPerSample, numChannels) {
5850
+ const byteRate = sampleRate * numChannels * bitsPerSample / 8;
5851
+ const blockAlign = numChannels * bitsPerSample / 8;
5852
+ const dataSize = pcmData.length;
5853
+ const headerSize = 44;
5854
+ const fileSize = headerSize + dataSize - 8;
5855
+ const buffer = new ArrayBuffer(headerSize + dataSize);
5856
+ const view = new DataView(buffer);
5857
+ const uint8 = new Uint8Array(buffer);
5858
+ view.setUint32(0, 1380533830, false);
5859
+ view.setUint32(4, fileSize, true);
5860
+ view.setUint32(8, 1463899717, false);
5861
+ view.setUint32(12, 1718449184, false);
5862
+ view.setUint32(16, 16, true);
5863
+ view.setUint16(20, 1, true);
5864
+ view.setUint16(22, numChannels, true);
5865
+ view.setUint32(24, sampleRate, true);
5866
+ view.setUint32(28, byteRate, true);
5867
+ view.setUint16(32, blockAlign, true);
5868
+ view.setUint16(34, bitsPerSample, true);
5869
+ view.setUint32(36, 1684108385, false);
5870
+ view.setUint32(40, dataSize, true);
5871
+ uint8.set(pcmData, headerSize);
5872
+ return buffer;
5873
+ }
4922
5874
  function createGeminiProviderFromEnv() {
4923
5875
  return createProviderFromEnv("GEMINI_API_KEY", import_genai.GoogleGenAI, GeminiGenerativeProvider);
4924
5876
  }
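A worked example of the TTS cost estimate above: when no duration is supplied, the helper assumes roughly 750 characters per minute of audio.

```typescript
calculateGeminiSpeechCost("gemini-2.5-flash-preview-tts", 1500);
// -> (1500 / 750) minutes * $0.01 per minute = $0.02
```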
@@ -4927,9 +5879,12 @@ var init_gemini = __esm({
4927
5879
  "src/providers/gemini.ts"() {
4928
5880
  "use strict";
4929
5881
  import_genai = require("@google/genai");
5882
+ init_messages();
4930
5883
  init_base_provider();
4931
5884
  init_constants2();
5885
+ init_gemini_image_models();
4932
5886
  init_gemini_models();
5887
+ init_gemini_speech_models();
4933
5888
  init_utils();
4934
5889
  GEMINI_ROLE_MAP = {
4935
5890
  system: "user",
@@ -4944,6 +5899,139 @@ var init_gemini = __esm({
4944
5899
  getModelSpecs() {
4945
5900
  return GEMINI_MODELS;
4946
5901
  }
5902
+ // =========================================================================
5903
+ // Image Generation
5904
+ // =========================================================================
5905
+ getImageModelSpecs() {
5906
+ return geminiImageModels;
5907
+ }
5908
+ supportsImageGeneration(modelId) {
5909
+ return isGeminiImageModel(modelId);
5910
+ }
5911
+ async generateImage(options) {
5912
+ const client = this.client;
5913
+ const spec = getGeminiImageModelSpec(options.model);
5914
+ const isImagenModel = options.model.startsWith("imagen");
5915
+ const aspectRatio = options.size ?? spec?.defaultSize ?? "1:1";
5916
+ const n = options.n ?? 1;
5917
+ if (isImagenModel) {
5918
+ const response2 = await client.models.generateImages({
5919
+ model: options.model,
5920
+ prompt: options.prompt,
5921
+ config: {
5922
+ numberOfImages: n,
5923
+ aspectRatio,
5924
+ outputMimeType: options.responseFormat === "b64_json" ? "image/png" : "image/jpeg"
5925
+ }
5926
+ });
5927
+ const images2 = response2.generatedImages ?? [];
5928
+ const cost2 = calculateGeminiImageCost(options.model, aspectRatio, images2.length);
5929
+ return {
5930
+ // Gemini's imageBytes is already base64 encoded, so use it directly
5931
+ images: images2.map((img) => ({
5932
+ b64Json: img.image?.imageBytes ?? void 0
5933
+ })),
5934
+ model: options.model,
5935
+ usage: {
5936
+ imagesGenerated: images2.length,
5937
+ size: aspectRatio,
5938
+ quality: "standard"
5939
+ },
5940
+ cost: cost2
5941
+ };
5942
+ }
5943
+ const response = await client.models.generateContent({
5944
+ model: options.model,
5945
+ contents: [{ role: "user", parts: [{ text: options.prompt }] }],
5946
+ config: {
5947
+ responseModalities: [import_genai.Modality.IMAGE, import_genai.Modality.TEXT]
5948
+ }
5949
+ });
5950
+ const images = [];
5951
+ const candidate = response.candidates?.[0];
5952
+ if (candidate?.content?.parts) {
5953
+ for (const part of candidate.content.parts) {
5954
+ if ("inlineData" in part && part.inlineData) {
5955
+ images.push({
5956
+ b64Json: part.inlineData.data
5957
+ });
5958
+ }
5959
+ }
5960
+ }
5961
+ const cost = calculateGeminiImageCost(options.model, aspectRatio, images.length);
5962
+ return {
5963
+ images,
5964
+ model: options.model,
5965
+ usage: {
5966
+ imagesGenerated: images.length,
5967
+ size: aspectRatio,
5968
+ quality: "standard"
5969
+ },
5970
+ cost
5971
+ };
5972
+ }
5973
+ // =========================================================================
5974
+ // Speech Generation
5975
+ // =========================================================================
5976
+ getSpeechModelSpecs() {
5977
+ return geminiSpeechModels;
5978
+ }
5979
+ supportsSpeechGeneration(modelId) {
5980
+ return isGeminiSpeechModel(modelId);
5981
+ }
5982
+ async generateSpeech(options) {
5983
+ const client = this.client;
5984
+ const spec = getGeminiSpeechModelSpec(options.model);
5985
+ const voice = options.voice ?? spec?.defaultVoice ?? "Zephyr";
5986
+ const response = await client.models.generateContent({
5987
+ model: options.model,
5988
+ contents: [
5989
+ {
5990
+ role: "user",
5991
+ parts: [{ text: options.input }]
5992
+ }
5993
+ ],
5994
+ config: {
5995
+ responseModalities: [import_genai.Modality.AUDIO],
5996
+ speechConfig: {
5997
+ voiceConfig: {
5998
+ prebuiltVoiceConfig: {
5999
+ voiceName: voice
6000
+ }
6001
+ }
6002
+ }
6003
+ }
6004
+ });
6005
+ let pcmData;
6006
+ const candidate = response.candidates?.[0];
6007
+ if (candidate?.content?.parts) {
6008
+ for (const part of candidate.content.parts) {
6009
+ if ("inlineData" in part && part.inlineData?.data) {
6010
+ const base64 = part.inlineData.data;
6011
+ const binary = atob(base64);
6012
+ pcmData = new Uint8Array(binary.length);
6013
+ for (let i = 0; i < binary.length; i++) {
6014
+ pcmData[i] = binary.charCodeAt(i);
6015
+ }
6016
+ break;
6017
+ }
6018
+ }
6019
+ }
6020
+ if (!pcmData) {
6021
+ throw new Error("No audio data in Gemini TTS response");
6022
+ }
6023
+ const audioData = wrapPcmInWav(pcmData, 24e3, 16, 1);
6024
+ const cost = calculateGeminiSpeechCost(options.model, options.input.length);
6025
+ return {
6026
+ audio: audioData,
6027
+ model: options.model,
6028
+ usage: {
6029
+ characterCount: options.input.length
6030
+ },
6031
+ cost,
6032
+ format: spec?.defaultFormat ?? "wav"
6033
+ };
6034
+ }
4947
6035
  buildRequestPayload(options, descriptor, _spec, messages) {
4948
6036
  const contents = this.convertMessagesToContents(messages);
4949
6037
  const generationConfig = this.buildGenerationConfig(options);
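A hedged sketch of calling the new generation surface (image.generate / speech.generate, as exposed by the cost-reporting wrapper earlier in this file and backed by the provider methods above); how the client itself is constructed is an assumption.

```typescript
import { writeFile } from "node:fs/promises";

const image = await client.image.generate({
  model: "imagen-4.0-generate-001",
  prompt: "A watercolor lighthouse at dusk",
  size: "16:9",
  n: 2,
});
// Per the pricing table above: 2 images * $0.04 -> image.cost of $0.08

const speech = await client.speech.generate({
  model: "gemini-2.5-flash-preview-tts",
  input: "Welcome to llmist 2.5!",
  voice: "Kore",
});
await writeFile("welcome.wav", Buffer.from(speech.audio)); // 24 kHz 16-bit mono PCM in a WAV wrapper
```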
@@ -4961,7 +6049,7 @@ var init_gemini = __esm({
4961
6049
  };
4962
6050
  return {
4963
6051
  model: descriptor.name,
4964
- contents: this.convertContentsForNewSDK(contents),
6052
+ contents,
4965
6053
  config
4966
6054
  };
4967
6055
  }
@@ -4996,18 +6084,25 @@ var init_gemini = __esm({
4996
6084
  if (message.role === "system") {
4997
6085
  expandedMessages.push({
4998
6086
  role: "user",
4999
- content: message.content
6087
+ content: extractText(message.content)
5000
6088
  });
5001
6089
  expandedMessages.push({
5002
6090
  role: "assistant",
5003
6091
  content: "Understood."
5004
6092
  });
5005
6093
  } else {
5006
- expandedMessages.push(message);
6094
+ expandedMessages.push({
6095
+ role: message.role,
6096
+ content: message.content
6097
+ });
5007
6098
  }
5008
6099
  }
5009
6100
  return this.mergeConsecutiveMessages(expandedMessages);
5010
6101
  }
6102
+ /**
6103
+ * Merge consecutive messages with the same role (required by Gemini).
6104
+ * Handles multimodal content by converting to Gemini's part format.
6105
+ */
5011
6106
  mergeConsecutiveMessages(messages) {
5012
6107
  if (messages.length === 0) {
5013
6108
  return [];
@@ -5016,15 +6111,16 @@ var init_gemini = __esm({
5016
6111
  let currentGroup = null;
5017
6112
  for (const message of messages) {
5018
6113
  const geminiRole = GEMINI_ROLE_MAP[message.role];
6114
+ const geminiParts = this.convertToGeminiParts(message.content);
5019
6115
  if (currentGroup && currentGroup.role === geminiRole) {
5020
- currentGroup.parts.push({ text: message.content });
6116
+ currentGroup.parts.push(...geminiParts);
5021
6117
  } else {
5022
6118
  if (currentGroup) {
5023
6119
  result.push(currentGroup);
5024
6120
  }
5025
6121
  currentGroup = {
5026
6122
  role: geminiRole,
5027
- parts: [{ text: message.content }]
6123
+ parts: geminiParts
5028
6124
  };
5029
6125
  }
5030
6126
  }
@@ -5033,11 +6129,39 @@ var init_gemini = __esm({
5033
6129
  }
5034
6130
  return result;
5035
6131
  }
5036
- convertContentsForNewSDK(contents) {
5037
- return contents.map((content) => ({
5038
- role: content.role,
5039
- parts: content.parts.map((part) => ({ text: part.text }))
5040
- }));
6132
+ /**
6133
+ * Convert llmist content to Gemini's part format.
6134
+ * Handles text, images, and audio (Gemini supports all three).
6135
+ */
6136
+ convertToGeminiParts(content) {
6137
+ const parts = normalizeContent(content);
6138
+ return parts.map((part) => {
6139
+ if (part.type === "text") {
6140
+ return { text: part.text };
6141
+ }
6142
+ if (part.type === "image") {
6143
+ if (part.source.type === "url") {
6144
+ throw new Error(
6145
+ "Gemini does not support image URLs directly. Please provide base64-encoded image data."
6146
+ );
6147
+ }
6148
+ return {
6149
+ inlineData: {
6150
+ mimeType: part.source.mediaType,
6151
+ data: part.source.data
6152
+ }
6153
+ };
6154
+ }
6155
+ if (part.type === "audio") {
6156
+ return {
6157
+ inlineData: {
6158
+ mimeType: part.source.mediaType,
6159
+ data: part.source.data
6160
+ }
6161
+ };
6162
+ }
6163
+ throw new Error(`Unsupported content type: ${part.type}`);
6164
+ });
5041
6165
  }
5042
6166
  buildGenerationConfig(options) {
5043
6167
  const config = {};
@@ -5058,9 +6182,9 @@ var init_gemini = __esm({
5058
6182
  async *wrapStream(iterable) {
5059
6183
  const stream2 = iterable;
5060
6184
  for await (const chunk of stream2) {
5061
- const text = this.extractText(chunk);
5062
- if (text) {
5063
- yield { text, rawEvent: chunk };
6185
+ const text3 = this.extractText(chunk);
6186
+ if (text3) {
6187
+ yield { text: text3, rawEvent: chunk };
5064
6188
  }
5065
6189
  const finishReason = this.extractFinishReason(chunk);
5066
6190
  const usage = this.extractUsage(chunk);
@@ -5121,7 +6245,7 @@ var init_gemini = __esm({
5121
6245
  try {
5122
6246
  const response = await client.models.countTokens({
5123
6247
  model: descriptor.name,
5124
- contents: this.convertContentsForNewSDK(contents)
6248
+ contents
5125
6249
  // Note: systemInstruction not used - it's not supported by countTokens()
5126
6250
  // and would cause a 2100% token counting error
5127
6251
  });
@@ -5131,14 +6255,140 @@ var init_gemini = __esm({
5131
6255
  `Token counting failed for ${descriptor.name}, using fallback estimation:`,
5132
6256
  error
5133
6257
  );
5134
- const totalChars = messages.reduce((sum, msg) => sum + (msg.content?.length ?? 0), 0);
5135
- return Math.ceil(totalChars / FALLBACK_CHARS_PER_TOKEN);
6258
+ let totalChars = 0;
6259
+ let mediaCount = 0;
6260
+ for (const msg of messages) {
6261
+ const parts = normalizeContent(msg.content);
6262
+ for (const part of parts) {
6263
+ if (part.type === "text") {
6264
+ totalChars += part.text.length;
6265
+ } else if (part.type === "image" || part.type === "audio") {
6266
+ mediaCount++;
6267
+ }
6268
+ }
6269
+ }
6270
+ return Math.ceil(totalChars / FALLBACK_CHARS_PER_TOKEN) + mediaCount * 258;
5136
6271
  }
5137
6272
  }
5138
6273
  };
5139
6274
  }
5140
6275
  });
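
The `convertToGeminiParts` helper added above is the pivot for multimodal input on Gemini: text parts pass through unchanged, base64 image and audio parts become `inlineData` parts, and URL-sourced images are rejected. A minimal sketch of that mapping, using only the part shapes visible in this diff (the literal data strings are placeholders):

```typescript
// Illustrative only: an llmist base64 image part and the Gemini part that
// convertToGeminiParts produces for it.
const imagePart = {
  type: "image" as const,
  source: { type: "base64" as const, mediaType: "image/png", data: "iVBORw0KGgo..." },
};

const geminiPart = {
  inlineData: { mimeType: imagePart.source.mediaType, data: imagePart.source.data },
};

// An image part with source.type === "url" would throw instead, since this
// code path only accepts inline base64 data.
```
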
5141
6276
 
6277
+ // src/providers/openai-image-models.ts
6278
+ function getOpenAIImageModelSpec(modelId) {
6279
+ return openaiImageModels.find((m) => m.modelId === modelId);
6280
+ }
6281
+ function isOpenAIImageModel(modelId) {
6282
+ return openaiImageModels.some((m) => m.modelId === modelId);
6283
+ }
6284
+ function calculateOpenAIImageCost(modelId, size, quality = "standard", n = 1) {
6285
+ const spec = getOpenAIImageModelSpec(modelId);
6286
+ if (!spec) return void 0;
6287
+ const sizePrice = spec.pricing.bySize?.[size];
6288
+ if (sizePrice === void 0) return void 0;
6289
+ let pricePerImage;
6290
+ if (typeof sizePrice === "number") {
6291
+ pricePerImage = sizePrice;
6292
+ } else {
6293
+ pricePerImage = sizePrice[quality];
6294
+ if (pricePerImage === void 0) return void 0;
6295
+ }
6296
+ return pricePerImage * n;
6297
+ }
6298
+ var GPT_IMAGE_SIZES, GPT_IMAGE_QUALITIES, DALLE3_SIZES, DALLE3_QUALITIES, DALLE2_SIZES, openaiImageModels;
6299
+ var init_openai_image_models = __esm({
6300
+ "src/providers/openai-image-models.ts"() {
6301
+ "use strict";
6302
+ GPT_IMAGE_SIZES = ["1024x1024", "1024x1536", "1536x1024"];
6303
+ GPT_IMAGE_QUALITIES = ["low", "medium", "high"];
6304
+ DALLE3_SIZES = ["1024x1024", "1024x1792", "1792x1024"];
6305
+ DALLE3_QUALITIES = ["standard", "hd"];
6306
+ DALLE2_SIZES = ["256x256", "512x512", "1024x1024"];
6307
+ openaiImageModels = [
6308
+ // GPT Image 1 Family (flagship)
6309
+ {
6310
+ provider: "openai",
6311
+ modelId: "gpt-image-1",
6312
+ displayName: "GPT Image 1",
6313
+ pricing: {
6314
+ bySize: {
6315
+ "1024x1024": { low: 0.011, medium: 0.04, high: 0.17 },
6316
+ "1024x1536": { low: 0.016, medium: 0.06, high: 0.25 },
6317
+ "1536x1024": { low: 0.016, medium: 0.06, high: 0.25 }
6318
+ }
6319
+ },
6320
+ supportedSizes: [...GPT_IMAGE_SIZES],
6321
+ supportedQualities: [...GPT_IMAGE_QUALITIES],
6322
+ maxImages: 1,
6323
+ defaultSize: "1024x1024",
6324
+ defaultQuality: "medium",
6325
+ features: {
6326
+ textRendering: true,
6327
+ transparency: true
6328
+ }
6329
+ },
6330
+ {
6331
+ provider: "openai",
6332
+ modelId: "gpt-image-1-mini",
6333
+ displayName: "GPT Image 1 Mini",
6334
+ pricing: {
6335
+ bySize: {
6336
+ "1024x1024": { low: 5e-3, medium: 0.02, high: 0.052 },
6337
+ "1024x1536": { low: 75e-4, medium: 0.03, high: 0.078 },
6338
+ "1536x1024": { low: 75e-4, medium: 0.03, high: 0.078 }
6339
+ }
6340
+ },
6341
+ supportedSizes: [...GPT_IMAGE_SIZES],
6342
+ supportedQualities: [...GPT_IMAGE_QUALITIES],
6343
+ maxImages: 1,
6344
+ defaultSize: "1024x1024",
6345
+ defaultQuality: "medium",
6346
+ features: {
6347
+ textRendering: true,
6348
+ transparency: true
6349
+ }
6350
+ },
6351
+ // DALL-E Family
6352
+ {
6353
+ provider: "openai",
6354
+ modelId: "dall-e-3",
6355
+ displayName: "DALL-E 3",
6356
+ pricing: {
6357
+ bySize: {
6358
+ "1024x1024": { standard: 0.04, hd: 0.08 },
6359
+ "1024x1792": { standard: 0.08, hd: 0.12 },
6360
+ "1792x1024": { standard: 0.08, hd: 0.12 }
6361
+ }
6362
+ },
6363
+ supportedSizes: [...DALLE3_SIZES],
6364
+ supportedQualities: [...DALLE3_QUALITIES],
6365
+ maxImages: 1,
6366
+ // DALL-E 3 only supports n=1
6367
+ defaultSize: "1024x1024",
6368
+ defaultQuality: "standard",
6369
+ features: {
6370
+ textRendering: true
6371
+ }
6372
+ },
6373
+ {
6374
+ provider: "openai",
6375
+ modelId: "dall-e-2",
6376
+ displayName: "DALL-E 2 (Legacy)",
6377
+ pricing: {
6378
+ bySize: {
6379
+ "256x256": 0.016,
6380
+ "512x512": 0.018,
6381
+ "1024x1024": 0.02
6382
+ }
6383
+ },
6384
+ supportedSizes: [...DALLE2_SIZES],
6385
+ maxImages: 10,
6386
+ defaultSize: "1024x1024"
6387
+ }
6388
+ ];
6389
+ }
6390
+ });
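
For orientation, `calculateOpenAIImageCost` above simply multiplies a per-image price looked up in this table by the image count. A worked sketch of that arithmetic with values taken from the table (illustrative only, not a pricing statement):

```typescript
// calculateOpenAIImageCost("dall-e-3", "1024x1792", "hd", 2)
//   -> pricePerImage 0.12, cost 0.12 * 2 = 0.24
const dallE3Cost = 0.12 * 2;

// calculateOpenAIImageCost("gpt-image-1", "1024x1024", "medium", 1) -> 0.04
const gptImageCost = 0.04;

// DALL-E 2 prices are flat per size, so the quality argument is ignored:
// calculateOpenAIImageCost("dall-e-2", "512x512", "standard", 3) -> 0.018 * 3 = 0.054
const dallE2Cost = 0.018 * 3;
```
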
6391
+
5142
6392
  // src/providers/openai-models.ts
5143
6393
  var OPENAI_MODELS;
5144
6394
  var init_openai_models = __esm({
@@ -5503,6 +6753,144 @@ var init_openai_models = __esm({
5503
6753
  }
5504
6754
  });
5505
6755
 
6756
+ // src/providers/openai-speech-models.ts
6757
+ function getOpenAISpeechModelSpec(modelId) {
6758
+ return openaiSpeechModels.find((m) => m.modelId === modelId);
6759
+ }
6760
+ function isOpenAISpeechModel(modelId) {
6761
+ return openaiSpeechModels.some((m) => m.modelId === modelId);
6762
+ }
6763
+ function calculateOpenAISpeechCost(modelId, characterCount, estimatedMinutes) {
6764
+ const spec = getOpenAISpeechModelSpec(modelId);
6765
+ if (!spec) return void 0;
6766
+ if (spec.pricing.perCharacter !== void 0) {
6767
+ return characterCount * spec.pricing.perCharacter;
6768
+ }
6769
+ if (spec.pricing.perMinute !== void 0 && estimatedMinutes !== void 0) {
6770
+ return estimatedMinutes * spec.pricing.perMinute;
6771
+ }
6772
+ if (spec.pricing.perMinute !== void 0) {
6773
+ const approxMinutes = characterCount / 750;
6774
+ return approxMinutes * spec.pricing.perMinute;
6775
+ }
6776
+ return void 0;
6777
+ }
6778
+ var OPENAI_TTS_VOICES, OPENAI_TTS_EXTENDED_VOICES, OPENAI_TTS_FORMATS, openaiSpeechModels;
6779
+ var init_openai_speech_models = __esm({
6780
+ "src/providers/openai-speech-models.ts"() {
6781
+ "use strict";
6782
+ OPENAI_TTS_VOICES = [
6783
+ "alloy",
6784
+ "echo",
6785
+ "fable",
6786
+ "onyx",
6787
+ "nova",
6788
+ "shimmer"
6789
+ ];
6790
+ OPENAI_TTS_EXTENDED_VOICES = [
6791
+ ...OPENAI_TTS_VOICES,
6792
+ "ash",
6793
+ "ballad",
6794
+ "coral",
6795
+ "sage",
6796
+ "verse"
6797
+ ];
6798
+ OPENAI_TTS_FORMATS = ["mp3", "opus", "aac", "flac", "wav", "pcm"];
6799
+ openaiSpeechModels = [
6800
+ // Standard TTS models (character-based pricing)
6801
+ {
6802
+ provider: "openai",
6803
+ modelId: "tts-1",
6804
+ displayName: "TTS-1",
6805
+ pricing: {
6806
+ // $15 per 1M characters = $0.000015 per character
6807
+ perCharacter: 15e-6
6808
+ },
6809
+ voices: [...OPENAI_TTS_VOICES],
6810
+ formats: OPENAI_TTS_FORMATS,
6811
+ maxInputLength: 4096,
6812
+ defaultVoice: "alloy",
6813
+ defaultFormat: "mp3",
6814
+ features: {
6815
+ voiceInstructions: false
6816
+ }
6817
+ },
6818
+ {
6819
+ provider: "openai",
6820
+ modelId: "tts-1-1106",
6821
+ displayName: "TTS-1 (Nov 2023)",
6822
+ pricing: {
6823
+ perCharacter: 15e-6
6824
+ },
6825
+ voices: [...OPENAI_TTS_VOICES],
6826
+ formats: OPENAI_TTS_FORMATS,
6827
+ maxInputLength: 4096,
6828
+ defaultVoice: "alloy",
6829
+ defaultFormat: "mp3",
6830
+ features: {
6831
+ voiceInstructions: false
6832
+ }
6833
+ },
6834
+ {
6835
+ provider: "openai",
6836
+ modelId: "tts-1-hd",
6837
+ displayName: "TTS-1 HD",
6838
+ pricing: {
6839
+ // $30 per 1M characters = $0.00003 per character
6840
+ perCharacter: 3e-5
6841
+ },
6842
+ voices: [...OPENAI_TTS_VOICES],
6843
+ formats: OPENAI_TTS_FORMATS,
6844
+ maxInputLength: 4096,
6845
+ defaultVoice: "alloy",
6846
+ defaultFormat: "mp3",
6847
+ features: {
6848
+ voiceInstructions: false
6849
+ }
6850
+ },
6851
+ {
6852
+ provider: "openai",
6853
+ modelId: "tts-1-hd-1106",
6854
+ displayName: "TTS-1 HD (Nov 2023)",
6855
+ pricing: {
6856
+ perCharacter: 3e-5
6857
+ },
6858
+ voices: [...OPENAI_TTS_VOICES],
6859
+ formats: OPENAI_TTS_FORMATS,
6860
+ maxInputLength: 4096,
6861
+ defaultVoice: "alloy",
6862
+ defaultFormat: "mp3",
6863
+ features: {
6864
+ voiceInstructions: false
6865
+ }
6866
+ },
6867
+ // Token-based TTS model with voice instructions support
6868
+ {
6869
+ provider: "openai",
6870
+ modelId: "gpt-4o-mini-tts",
6871
+ displayName: "GPT-4o Mini TTS",
6872
+ pricing: {
6873
+ // $0.60 per 1M input tokens = $0.0000006 per token
6874
+ perInputToken: 6e-7,
6875
+ // $12 per 1M audio output tokens = $0.000012 per token
6876
+ perAudioOutputToken: 12e-6,
6877
+ // ~$0.015 per minute of audio
6878
+ perMinute: 0.015
6879
+ },
6880
+ voices: [...OPENAI_TTS_EXTENDED_VOICES],
6881
+ formats: OPENAI_TTS_FORMATS,
6882
+ maxInputLength: 2e3,
6883
+ // tokens, not characters
6884
+ defaultVoice: "alloy",
6885
+ defaultFormat: "mp3",
6886
+ features: {
6887
+ voiceInstructions: true
6888
+ }
6889
+ }
6890
+ ];
6891
+ }
6892
+ });
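
`calculateOpenAISpeechCost` above prefers per-character pricing, then an explicit per-minute duration, then a rough `characterCount / 750` minutes approximation. A small arithmetic sketch using the specs in this table (illustrative only):

```typescript
// tts-1 (per-character pricing), 1,000 characters:
//   1000 * 0.000015 = 0.015
const tts1Cost = 1000 * 15e-6;

// gpt-4o-mini-tts with no duration supplied: minutes are approximated as
// characterCount / 750 before applying the per-minute rate:
//   (1500 / 750) * 0.015 = 0.03
const miniTtsCost = (1500 / 750) * 0.015;
```
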
6893
+
5506
6894
  // src/providers/openai.ts
5507
6895
  function sanitizeExtra(extra, allowTemperature) {
5508
6896
  if (!extra) {
@@ -5522,9 +6910,12 @@ var init_openai = __esm({
5522
6910
  "use strict";
5523
6911
  import_openai = __toESM(require("openai"), 1);
5524
6912
  import_tiktoken = require("tiktoken");
6913
+ init_messages();
5525
6914
  init_base_provider();
5526
6915
  init_constants2();
6916
+ init_openai_image_models();
5527
6917
  init_openai_models();
6918
+ init_openai_speech_models();
5528
6919
  init_utils();
5529
6920
  ROLE_MAP = {
5530
6921
  system: "system",
@@ -5539,6 +6930,87 @@ var init_openai = __esm({
5539
6930
  getModelSpecs() {
5540
6931
  return OPENAI_MODELS;
5541
6932
  }
6933
+ // =========================================================================
6934
+ // Image Generation
6935
+ // =========================================================================
6936
+ getImageModelSpecs() {
6937
+ return openaiImageModels;
6938
+ }
6939
+ supportsImageGeneration(modelId) {
6940
+ return isOpenAIImageModel(modelId);
6941
+ }
6942
+ async generateImage(options) {
6943
+ const client = this.client;
6944
+ const spec = getOpenAIImageModelSpec(options.model);
6945
+ const size = options.size ?? spec?.defaultSize ?? "1024x1024";
6946
+ const quality = options.quality ?? spec?.defaultQuality ?? "standard";
6947
+ const n = options.n ?? 1;
6948
+ const isDallE2 = options.model === "dall-e-2";
6949
+ const isGptImage = options.model.startsWith("gpt-image");
6950
+ const requestParams = {
6951
+ model: options.model,
6952
+ prompt: options.prompt,
6953
+ size,
6954
+ n
6955
+ };
6956
+ if (!isDallE2 && !isGptImage) {
6957
+ requestParams.quality = quality;
6958
+ }
6959
+ if (isGptImage) {
6960
+ } else if (!isDallE2) {
6961
+ requestParams.response_format = options.responseFormat ?? "url";
6962
+ }
6963
+ const response = await client.images.generate(requestParams);
6964
+ const cost = calculateOpenAIImageCost(options.model, size, quality, n);
6965
+ const images = response.data ?? [];
6966
+ return {
6967
+ images: images.map((img) => ({
6968
+ url: img.url,
6969
+ b64Json: img.b64_json,
6970
+ revisedPrompt: img.revised_prompt
6971
+ })),
6972
+ model: options.model,
6973
+ usage: {
6974
+ imagesGenerated: images.length,
6975
+ size,
6976
+ quality
6977
+ },
6978
+ cost
6979
+ };
6980
+ }
6981
+ // =========================================================================
6982
+ // Speech Generation
6983
+ // =========================================================================
6984
+ getSpeechModelSpecs() {
6985
+ return openaiSpeechModels;
6986
+ }
6987
+ supportsSpeechGeneration(modelId) {
6988
+ return isOpenAISpeechModel(modelId);
6989
+ }
6990
+ async generateSpeech(options) {
6991
+ const client = this.client;
6992
+ const spec = getOpenAISpeechModelSpec(options.model);
6993
+ const format = options.responseFormat ?? spec?.defaultFormat ?? "mp3";
6994
+ const voice = options.voice ?? spec?.defaultVoice ?? "alloy";
6995
+ const response = await client.audio.speech.create({
6996
+ model: options.model,
6997
+ input: options.input,
6998
+ voice,
6999
+ response_format: format,
7000
+ speed: options.speed ?? 1
7001
+ });
7002
+ const audioBuffer = await response.arrayBuffer();
7003
+ const cost = calculateOpenAISpeechCost(options.model, options.input.length);
7004
+ return {
7005
+ audio: audioBuffer,
7006
+ model: options.model,
7007
+ usage: {
7008
+ characterCount: options.input.length
7009
+ },
7010
+ cost,
7011
+ format
7012
+ };
7013
+ }
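
The two provider methods above return plain result objects. The field names below are copied from the code in this hunk and written out as a sketch for orientation; the interface names themselves are not part of the package:

```typescript
// Shape of the generateImage() result (fields taken from the code above):
interface ImageResultSketch {
  images: Array<{ url?: string; b64Json?: string; revisedPrompt?: string }>;
  model: string;
  usage: { imagesGenerated: number; size: string; quality: string };
  cost?: number; // undefined when the model/size combination has no pricing entry
}

// Shape of the generateSpeech() result:
interface SpeechResultSketch {
  audio: ArrayBuffer;
  model: string;
  usage: { characterCount: number };
  cost?: number;
  format: string; // "mp3" | "opus" | "aac" | "flac" | "wav" | "pcm"
}
```
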
5542
7014
  buildRequestPayload(options, descriptor, spec, messages) {
5543
7015
  const { maxTokens, temperature, topP, stopSequences, extra } = options;
5544
7016
  const supportsTemperature = spec?.metadata?.supportsTemperature !== false;
@@ -5546,11 +7018,7 @@ var init_openai = __esm({
5546
7018
  const sanitizedExtra = sanitizeExtra(extra, shouldIncludeTemperature);
5547
7019
  return {
5548
7020
  model: descriptor.name,
5549
- messages: messages.map((message) => ({
5550
- role: ROLE_MAP[message.role],
5551
- content: message.content,
5552
- name: message.name
5553
- })),
7021
+ messages: messages.map((message) => this.convertToOpenAIMessage(message)),
5554
7022
  // Only set max_completion_tokens if explicitly provided
5555
7023
  // Otherwise let the API use "as much as fits" in the context window
5556
7024
  ...maxTokens !== void 0 ? { max_completion_tokens: maxTokens } : {},
@@ -5562,6 +7030,77 @@ var init_openai = __esm({
5562
7030
  ...shouldIncludeTemperature ? { temperature } : {}
5563
7031
  };
5564
7032
  }
7033
+ /**
7034
+ * Convert an LLMMessage to OpenAI's ChatCompletionMessageParam.
7035
+ * Handles role-specific content type requirements:
7036
+ * - system/assistant: string content only
7037
+ * - user: string or multimodal array content
7038
+ */
7039
+ convertToOpenAIMessage(message) {
7040
+ const role = ROLE_MAP[message.role];
7041
+ if (role === "user") {
7042
+ const content = this.convertToOpenAIContent(message.content);
7043
+ return {
7044
+ role: "user",
7045
+ content,
7046
+ ...message.name ? { name: message.name } : {}
7047
+ };
7048
+ }
7049
+ const textContent = typeof message.content === "string" ? message.content : extractText(message.content);
7050
+ if (role === "system") {
7051
+ return {
7052
+ role: "system",
7053
+ content: textContent,
7054
+ ...message.name ? { name: message.name } : {}
7055
+ };
7056
+ }
7057
+ return {
7058
+ role: "assistant",
7059
+ content: textContent,
7060
+ ...message.name ? { name: message.name } : {}
7061
+ };
7062
+ }
7063
+ /**
7064
+ * Convert llmist content to OpenAI's content format.
7065
+ * Optimizes by returning string for text-only content, array for multimodal.
7066
+ */
7067
+ convertToOpenAIContent(content) {
7068
+ if (typeof content === "string") {
7069
+ return content;
7070
+ }
7071
+ return content.map((part) => {
7072
+ if (part.type === "text") {
7073
+ return { type: "text", text: part.text };
7074
+ }
7075
+ if (part.type === "image") {
7076
+ return this.convertImagePart(part);
7077
+ }
7078
+ if (part.type === "audio") {
7079
+ throw new Error(
7080
+ "OpenAI chat completions do not support audio input. Use Whisper for transcription or Gemini for audio understanding."
7081
+ );
7082
+ }
7083
+ throw new Error(`Unsupported content type: ${part.type}`);
7084
+ });
7085
+ }
7086
+ /**
7087
+ * Convert an image content part to OpenAI's image_url format.
7088
+ * Supports both URLs and base64 data URLs.
7089
+ */
7090
+ convertImagePart(part) {
7091
+ if (part.source.type === "url") {
7092
+ return {
7093
+ type: "image_url",
7094
+ image_url: { url: part.source.url }
7095
+ };
7096
+ }
7097
+ return {
7098
+ type: "image_url",
7099
+ image_url: {
7100
+ url: `data:${part.source.mediaType};base64,${part.source.data}`
7101
+ }
7102
+ };
7103
+ }
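
`convertToOpenAIContent` above keeps plain strings as-is and turns content arrays into OpenAI's part format, with images emitted as `image_url` entries (URLs passed through, base64 wrapped in a data URL) and audio rejected. A sketch of that mapping (the sample strings are placeholders):

```typescript
// Illustrative input: one text part plus one base64 image part.
const llmistContent = [
  { type: "text" as const, text: "What is in this picture?" },
  {
    type: "image" as const,
    source: { type: "base64" as const, mediaType: "image/jpeg", data: "/9j/4AAQ..." },
  },
];

// What convertToOpenAIContent produces for it:
const openaiContent = [
  { type: "text", text: "What is in this picture?" },
  { type: "image_url", image_url: { url: "data:image/jpeg;base64,/9j/4AAQ..." } },
];
```
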
5565
7104
  async executeStreamRequest(payload, signal) {
5566
7105
  const client = this.client;
5567
7106
  const stream2 = await client.chat.completions.create(payload, signal ? { signal } : void 0);
@@ -5570,9 +7109,9 @@ var init_openai = __esm({
5570
7109
  async *wrapStream(iterable) {
5571
7110
  const stream2 = iterable;
5572
7111
  for await (const chunk of stream2) {
5573
- const text = chunk.choices.map((choice) => choice.delta?.content ?? "").join("");
5574
- if (text) {
5575
- yield { text, rawEvent: chunk };
7112
+ const text3 = chunk.choices.map((choice) => choice.delta?.content ?? "").join("");
7113
+ if (text3) {
7114
+ yield { text: text3, rawEvent: chunk };
5576
7115
  }
5577
7116
  const finishReason = chunk.choices.find((choice) => choice.finish_reason)?.finish_reason;
5578
7117
  const usage = chunk.usage ? {
@@ -5620,17 +7159,26 @@ var init_openai = __esm({
5620
7159
  }
5621
7160
  try {
5622
7161
  let tokenCount = 0;
7162
+ let imageCount = 0;
5623
7163
  for (const message of messages) {
5624
7164
  tokenCount += OPENAI_MESSAGE_OVERHEAD_TOKENS;
5625
7165
  const roleText = ROLE_MAP[message.role];
5626
7166
  tokenCount += encoding.encode(roleText).length;
5627
- tokenCount += encoding.encode(message.content ?? "").length;
7167
+ const textContent = extractText(message.content);
7168
+ tokenCount += encoding.encode(textContent).length;
7169
+ const parts = normalizeContent(message.content);
7170
+ for (const part of parts) {
7171
+ if (part.type === "image") {
7172
+ imageCount++;
7173
+ }
7174
+ }
5628
7175
  if (message.name) {
5629
7176
  tokenCount += encoding.encode(message.name).length;
5630
7177
  tokenCount += OPENAI_NAME_FIELD_OVERHEAD_TOKENS;
5631
7178
  }
5632
7179
  }
5633
7180
  tokenCount += OPENAI_REPLY_PRIMING_TOKENS;
7181
+ tokenCount += imageCount * 765;
5634
7182
  return tokenCount;
5635
7183
  } finally {
5636
7184
  encoding.free();
@@ -5640,8 +7188,19 @@ var init_openai = __esm({
5640
7188
  `Token counting failed for ${descriptor.name}, using fallback estimation:`,
5641
7189
  error
5642
7190
  );
5643
- const totalChars = messages.reduce((sum, msg) => sum + (msg.content?.length ?? 0), 0);
5644
- return Math.ceil(totalChars / FALLBACK_CHARS_PER_TOKEN);
7191
+ let totalChars = 0;
7192
+ let imageCount = 0;
7193
+ for (const msg of messages) {
7194
+ const parts = normalizeContent(msg.content);
7195
+ for (const part of parts) {
7196
+ if (part.type === "text") {
7197
+ totalChars += part.text.length;
7198
+ } else if (part.type === "image") {
7199
+ imageCount++;
7200
+ }
7201
+ }
7202
+ }
7203
+ return Math.ceil(totalChars / FALLBACK_CHARS_PER_TOKEN) + imageCount * 765;
5645
7204
  }
5646
7205
  }
5647
7206
  };
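
Both the tiktoken path and the character-based fallback above now add a flat 765 tokens per image. A quick arithmetic sketch of the fallback; `FALLBACK_CHARS_PER_TOKEN` is defined elsewhere in this bundle, so the divisor of 4 below is an assumption for illustration only:

```typescript
const totalChars = 2000; // characters across all text parts
const imageCount = 2;    // image parts found in the messages
const estimate = Math.ceil(totalChars / 4) + imageCount * 765; // 500 + 1530 = 2030
```
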
@@ -5879,30 +7438,109 @@ var init_model_registry = __esm({
5879
7438
  }
5880
7439
  });
5881
7440
 
5882
- // src/core/options.ts
5883
- var ModelIdentifierParser;
5884
- var init_options = __esm({
5885
- "src/core/options.ts"() {
7441
+ // src/core/namespaces/image.ts
7442
+ var ImageNamespace;
7443
+ var init_image = __esm({
7444
+ "src/core/namespaces/image.ts"() {
5886
7445
  "use strict";
5887
- ModelIdentifierParser = class {
5888
- constructor(defaultProvider = "openai") {
7446
+ ImageNamespace = class {
7447
+ constructor(adapters, defaultProvider) {
7448
+ this.adapters = adapters;
5889
7449
  this.defaultProvider = defaultProvider;
5890
7450
  }
5891
- parse(identifier) {
5892
- const trimmed = identifier.trim();
5893
- if (!trimmed) {
5894
- throw new Error("Model identifier cannot be empty");
7451
+ /**
7452
+ * Generate images from a text prompt.
7453
+ *
7454
+ * @param options - Image generation options
7455
+ * @returns Promise resolving to the generation result with images and cost
7456
+ * @throws Error if the provider doesn't support image generation
7457
+ */
7458
+ async generate(options) {
7459
+ const modelId = options.model;
7460
+ const adapter = this.findImageAdapter(modelId);
7461
+ if (!adapter || !adapter.generateImage) {
7462
+ throw new Error(
7463
+ `No provider supports image generation for model "${modelId}". Available image models: ${this.listModels().map((m) => m.modelId).join(", ")}`
7464
+ );
5895
7465
  }
5896
- const [maybeProvider, ...rest] = trimmed.split(":");
5897
- if (rest.length === 0) {
5898
- return { provider: this.defaultProvider, name: maybeProvider };
7466
+ return adapter.generateImage(options);
7467
+ }
7468
+ /**
7469
+ * List all available image generation models.
7470
+ */
7471
+ listModels() {
7472
+ const models = [];
7473
+ for (const adapter of this.adapters) {
7474
+ if (adapter.getImageModelSpecs) {
7475
+ models.push(...adapter.getImageModelSpecs());
7476
+ }
5899
7477
  }
5900
- const provider = maybeProvider;
5901
- const name = rest.join(":");
5902
- if (!name) {
5903
- throw new Error("Model name cannot be empty");
7478
+ return models;
7479
+ }
7480
+ /**
7481
+ * Check if a model is supported for image generation.
7482
+ */
7483
+ supportsModel(modelId) {
7484
+ return this.findImageAdapter(modelId) !== void 0;
7485
+ }
7486
+ findImageAdapter(modelId) {
7487
+ return this.adapters.find(
7488
+ (adapter) => adapter.supportsImageGeneration?.(modelId) ?? false
7489
+ );
7490
+ }
7491
+ };
7492
+ }
7493
+ });
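
A usage sketch for the image namespace defined above. Assumptions not shown in this hunk: that `LLMist` is importable from the package root (the agent examples elsewhere in this diff import from "llmist") and that a zero-argument constructor picks up provider API keys from the environment; the prompt string is a placeholder.

```typescript
import { LLMist } from "llmist"; // assumed export

async function main() {
  const llmist = new LLMist(); // assumes provider keys in the environment
  const result = await llmist.image.generate({
    model: "gpt-image-1",                  // ids, sizes, and qualities come from the specs above
    prompt: "A watercolor fox in a forest",
    size: "1024x1024",
    quality: "medium",
  });
  console.log(result.usage.imagesGenerated, result.cost);
}

main();
```
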
7494
+
7495
+ // src/core/namespaces/speech.ts
7496
+ var SpeechNamespace;
7497
+ var init_speech = __esm({
7498
+ "src/core/namespaces/speech.ts"() {
7499
+ "use strict";
7500
+ SpeechNamespace = class {
7501
+ constructor(adapters, defaultProvider) {
7502
+ this.adapters = adapters;
7503
+ this.defaultProvider = defaultProvider;
7504
+ }
7505
+ /**
7506
+ * Generate speech audio from text.
7507
+ *
7508
+ * @param options - Speech generation options
7509
+ * @returns Promise resolving to the generation result with audio and cost
7510
+ * @throws Error if the provider doesn't support speech generation
7511
+ */
7512
+ async generate(options) {
7513
+ const modelId = options.model;
7514
+ const adapter = this.findSpeechAdapter(modelId);
7515
+ if (!adapter || !adapter.generateSpeech) {
7516
+ throw new Error(
7517
+ `No provider supports speech generation for model "${modelId}". Available speech models: ${this.listModels().map((m) => m.modelId).join(", ")}`
7518
+ );
5904
7519
  }
5905
- return { provider, name };
7520
+ return adapter.generateSpeech(options);
7521
+ }
7522
+ /**
7523
+ * List all available speech generation models.
7524
+ */
7525
+ listModels() {
7526
+ const models = [];
7527
+ for (const adapter of this.adapters) {
7528
+ if (adapter.getSpeechModelSpecs) {
7529
+ models.push(...adapter.getSpeechModelSpecs());
7530
+ }
7531
+ }
7532
+ return models;
7533
+ }
7534
+ /**
7535
+ * Check if a model is supported for speech generation.
7536
+ */
7537
+ supportsModel(modelId) {
7538
+ return this.findSpeechAdapter(modelId) !== void 0;
7539
+ }
7540
+ findSpeechAdapter(modelId) {
7541
+ return this.adapters.find(
7542
+ (adapter) => adapter.supportsSpeechGeneration?.(modelId) ?? false
7543
+ );
5906
7544
  }
5907
7545
  };
5908
7546
  }
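
A matching usage sketch for the speech namespace above, under the same assumptions (`LLMist` exported from the package root, API keys in the environment); the output filename is a placeholder.

```typescript
import { writeFile } from "fs/promises";
import { LLMist } from "llmist"; // assumed export

async function main() {
  const llmist = new LLMist();
  const speech = await llmist.speech.generate({
    model: "tts-1",              // voices, formats, and pricing come from the specs above
    input: "Hello from llmist.",
    voice: "nova",
    responseFormat: "mp3",
  });
  await writeFile("hello.mp3", Buffer.from(speech.audio));
  console.log(speech.usage.characterCount, speech.cost);
}

main();
```
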
@@ -5951,6 +7589,201 @@ var init_quick_methods = __esm({
5951
7589
  }
5952
7590
  });
5953
7591
 
7592
+ // src/core/namespaces/text.ts
7593
+ var TextNamespace;
7594
+ var init_text = __esm({
7595
+ "src/core/namespaces/text.ts"() {
7596
+ "use strict";
7597
+ init_quick_methods();
7598
+ TextNamespace = class {
7599
+ constructor(client) {
7600
+ this.client = client;
7601
+ }
7602
+ /**
7603
+ * Generate a complete text response.
7604
+ *
7605
+ * @param prompt - User prompt
7606
+ * @param options - Optional configuration
7607
+ * @returns Complete text response
7608
+ */
7609
+ async complete(prompt, options) {
7610
+ return complete(this.client, prompt, options);
7611
+ }
7612
+ /**
7613
+ * Stream text chunks.
7614
+ *
7615
+ * @param prompt - User prompt
7616
+ * @param options - Optional configuration
7617
+ * @returns Async generator yielding text chunks
7618
+ */
7619
+ stream(prompt, options) {
7620
+ return stream(this.client, prompt, options);
7621
+ }
7622
+ };
7623
+ }
7624
+ });
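
A usage sketch for the text namespace above, which just forwards to the quick `complete`/`stream` helpers. The import and zero-argument constructor are assumptions as before, and the shape of the streamed chunks is not shown in this hunk, so it is only logged.

```typescript
import { LLMist } from "llmist"; // assumed export

async function main() {
  const llmist = new LLMist();

  const answer = await llmist.text.complete("Summarize streaming tool execution in one sentence.");
  console.log(answer);

  for await (const chunk of llmist.text.stream("Write a haiku about fog.")) {
    console.log(chunk); // chunk shape depends on the quick `stream` helper defined elsewhere
  }
}

main();
```
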
7625
+
7626
+ // src/core/namespaces/vision.ts
7627
+ var VisionNamespace;
7628
+ var init_vision = __esm({
7629
+ "src/core/namespaces/vision.ts"() {
7630
+ "use strict";
7631
+ init_input_content();
7632
+ init_messages();
7633
+ VisionNamespace = class {
7634
+ constructor(client) {
7635
+ this.client = client;
7636
+ }
7637
+ /**
7638
+ * Build a message builder with the image content attached.
7639
+ * Handles URLs, data URLs, base64 strings, and binary buffers.
7640
+ */
7641
+ buildImageMessage(options) {
7642
+ const builder = new LLMMessageBuilder();
7643
+ if (options.systemPrompt) {
7644
+ builder.addSystem(options.systemPrompt);
7645
+ }
7646
+ if (typeof options.image === "string") {
7647
+ if (options.image.startsWith("http://") || options.image.startsWith("https://")) {
7648
+ builder.addUserWithImageUrl(options.prompt, options.image);
7649
+ } else if (isDataUrl(options.image)) {
7650
+ const parsed = parseDataUrl(options.image);
7651
+ if (!parsed) {
7652
+ throw new Error("Invalid data URL format");
7653
+ }
7654
+ builder.addUserWithImage(
7655
+ options.prompt,
7656
+ parsed.data,
7657
+ parsed.mimeType
7658
+ );
7659
+ } else {
7660
+ const buffer = Buffer.from(options.image, "base64");
7661
+ builder.addUserWithImage(options.prompt, buffer, options.mimeType);
7662
+ }
7663
+ } else {
7664
+ builder.addUserWithImage(options.prompt, options.image, options.mimeType);
7665
+ }
7666
+ return builder;
7667
+ }
7668
+ /**
7669
+ * Stream the response and collect text and usage information.
7670
+ */
7671
+ async streamAndCollect(options, builder) {
7672
+ let response = "";
7673
+ let finalUsage;
7674
+ for await (const chunk of this.client.stream({
7675
+ model: options.model,
7676
+ messages: builder.build(),
7677
+ maxTokens: options.maxTokens,
7678
+ temperature: options.temperature
7679
+ })) {
7680
+ response += chunk.text;
7681
+ if (chunk.usage) {
7682
+ finalUsage = {
7683
+ inputTokens: chunk.usage.inputTokens,
7684
+ outputTokens: chunk.usage.outputTokens,
7685
+ totalTokens: chunk.usage.totalTokens
7686
+ };
7687
+ }
7688
+ }
7689
+ return { text: response.trim(), usage: finalUsage };
7690
+ }
7691
+ /**
7692
+ * Analyze an image with a vision-capable model.
7693
+ * Returns the analysis as a string.
7694
+ *
7695
+ * @param options - Vision analysis options
7696
+ * @returns Promise resolving to the analysis text
7697
+ * @throws Error if the image format is unsupported or model doesn't support vision
7698
+ *
7699
+ * @example
7700
+ * ```typescript
7701
+ * // From file
7702
+ * const result = await llmist.vision.analyze({
7703
+ * model: "gpt-4o",
7704
+ * image: await fs.readFile("photo.jpg"),
7705
+ * prompt: "What's in this image?",
7706
+ * });
7707
+ *
7708
+ * // From URL (OpenAI only)
7709
+ * const result = await llmist.vision.analyze({
7710
+ * model: "gpt-4o",
7711
+ * image: "https://example.com/image.jpg",
7712
+ * prompt: "Describe this image",
7713
+ * });
7714
+ * ```
7715
+ */
7716
+ async analyze(options) {
7717
+ const builder = this.buildImageMessage(options);
7718
+ const { text: text3 } = await this.streamAndCollect(options, builder);
7719
+ return text3;
7720
+ }
7721
+ /**
7722
+ * Analyze an image and return detailed result with usage info.
7723
+ *
7724
+ * @param options - Vision analysis options
7725
+ * @returns Promise resolving to the analysis result with usage info
7726
+ */
7727
+ async analyzeWithUsage(options) {
7728
+ const builder = this.buildImageMessage(options);
7729
+ const { text: text3, usage } = await this.streamAndCollect(options, builder);
7730
+ return {
7731
+ text: text3,
7732
+ model: options.model,
7733
+ usage
7734
+ };
7735
+ }
7736
+ /**
7737
+ * Check if a model supports vision/image input.
7738
+ *
7739
+ * @param modelId - Model ID to check
7740
+ * @returns True if the model supports vision
7741
+ */
7742
+ supportsModel(modelId) {
7743
+ const spec = this.client.modelRegistry.getModelSpec(modelId);
7744
+ return spec?.features?.vision === true;
7745
+ }
7746
+ /**
7747
+ * List all models that support vision.
7748
+ *
7749
+ * @returns Array of model IDs that support vision
7750
+ */
7751
+ listModels() {
7752
+ return this.client.modelRegistry.listModels().filter((spec) => spec.features?.vision === true).map((spec) => spec.modelId);
7753
+ }
7754
+ };
7755
+ }
7756
+ });
7757
+
7758
+ // src/core/options.ts
7759
+ var ModelIdentifierParser;
7760
+ var init_options = __esm({
7761
+ "src/core/options.ts"() {
7762
+ "use strict";
7763
+ ModelIdentifierParser = class {
7764
+ constructor(defaultProvider = "openai") {
7765
+ this.defaultProvider = defaultProvider;
7766
+ }
7767
+ parse(identifier) {
7768
+ const trimmed = identifier.trim();
7769
+ if (!trimmed) {
7770
+ throw new Error("Model identifier cannot be empty");
7771
+ }
7772
+ const [maybeProvider, ...rest] = trimmed.split(":");
7773
+ if (rest.length === 0) {
7774
+ return { provider: this.defaultProvider, name: maybeProvider };
7775
+ }
7776
+ const provider = maybeProvider;
7777
+ const name = rest.join(":");
7778
+ if (!name) {
7779
+ throw new Error("Model name cannot be empty");
7780
+ }
7781
+ return { provider, name };
7782
+ }
7783
+ };
7784
+ }
7785
+ });
7786
+
5954
7787
  // src/core/client.ts
5955
7788
  var client_exports = {};
5956
7789
  __export(client_exports, {
@@ -5963,12 +7796,22 @@ var init_client = __esm({
5963
7796
  init_builder();
5964
7797
  init_discovery();
5965
7798
  init_model_registry();
7799
+ init_image();
7800
+ init_speech();
7801
+ init_text();
7802
+ init_vision();
5966
7803
  init_options();
5967
7804
  init_quick_methods();
5968
7805
  LLMist = class _LLMist {
5969
7806
  parser;
7807
+ defaultProvider;
5970
7808
  modelRegistry;
5971
7809
  adapters;
7810
+ // Namespaces for different generation types
7811
+ text;
7812
+ image;
7813
+ speech;
7814
+ vision;
5972
7815
  constructor(...args) {
5973
7816
  let adapters = [];
5974
7817
  let defaultProvider;
@@ -6007,6 +7850,7 @@ var init_client = __esm({
6007
7850
  const priorityB = b.priority ?? 0;
6008
7851
  return priorityB - priorityA;
6009
7852
  });
7853
+ this.defaultProvider = resolvedDefaultProvider;
6010
7854
  this.parser = new ModelIdentifierParser(resolvedDefaultProvider);
6011
7855
  this.modelRegistry = new ModelRegistry();
6012
7856
  for (const adapter of this.adapters) {
@@ -6015,6 +7859,10 @@ var init_client = __esm({
6015
7859
  if (customModels.length > 0) {
6016
7860
  this.modelRegistry.registerModels(customModels);
6017
7861
  }
7862
+ this.text = new TextNamespace(this);
7863
+ this.image = new ImageNamespace(this.adapters, this.defaultProvider);
7864
+ this.speech = new SpeechNamespace(this.adapters, this.defaultProvider);
7865
+ this.vision = new VisionNamespace(this);
6018
7866
  }
6019
7867
  stream(options) {
6020
7868
  const descriptor = this.parser.parse(options.model);
@@ -6199,6 +8047,7 @@ var init_builder = __esm({
6199
8047
  "src/agent/builder.ts"() {
6200
8048
  "use strict";
6201
8049
  init_constants();
8050
+ init_input_content();
6202
8051
  init_model_shortcuts();
6203
8052
  init_registry();
6204
8053
  init_agent();
@@ -6846,13 +8695,17 @@ ${endPrefix}`
6846
8695
  * }
6847
8696
  * ```
6848
8697
  */
6849
- ask(userPrompt) {
8698
+ /**
8699
+ * Build AgentOptions with the given user prompt.
8700
+ * Centralizes options construction for ask(), askWithImage(), and askWithContent().
8701
+ */
8702
+ buildAgentOptions(userPrompt) {
6850
8703
  if (!this.client) {
6851
8704
  const { LLMist: LLMistClass } = (init_client(), __toCommonJS(client_exports));
6852
8705
  this.client = new LLMistClass();
6853
8706
  }
6854
8707
  const registry = GadgetRegistry.from(this.gadgets);
6855
- const options = {
8708
+ return {
6856
8709
  client: this.client,
6857
8710
  model: this.model ?? "openai:gpt-5-nano",
6858
8711
  systemPrompt: this.systemPrompt,
@@ -6878,6 +8731,83 @@ ${endPrefix}`
6878
8731
  compactionConfig: this.compactionConfig,
6879
8732
  signal: this.signal
6880
8733
  };
8734
+ }
8735
+ ask(userPrompt) {
8736
+ const options = this.buildAgentOptions(userPrompt);
8737
+ return new Agent(AGENT_INTERNAL_KEY, options);
8738
+ }
8739
+ /**
8740
+ * Build and create the agent with a multimodal user prompt (text + image).
8741
+ * Returns the Agent instance ready to run.
8742
+ *
8743
+ * @param textPrompt - Text prompt describing what to do with the image
8744
+ * @param imageData - Image data (Buffer, Uint8Array, or base64 string)
8745
+ * @param mimeType - Optional MIME type (auto-detected if not provided)
8746
+ * @returns Configured Agent instance
8747
+ *
8748
+ * @example
8749
+ * ```typescript
8750
+ * const agent = LLMist.createAgent()
8751
+ * .withModel("gpt-4o")
8752
+ * .withSystem("You analyze images")
8753
+ * .askWithImage(
8754
+ * "What's in this image?",
8755
+ * await fs.readFile("photo.jpg")
8756
+ * );
8757
+ *
8758
+ * for await (const event of agent.run()) {
8759
+ * // handle events
8760
+ * }
8761
+ * ```
8762
+ */
8763
+ askWithImage(textPrompt, imageData, mimeType) {
8764
+ const imageBuffer = typeof imageData === "string" ? Buffer.from(imageData, "base64") : imageData;
8765
+ const detectedMime = mimeType ?? detectImageMimeType(imageBuffer);
8766
+ if (!detectedMime) {
8767
+ throw new Error(
8768
+ "Could not detect image MIME type. Please provide the mimeType parameter explicitly."
8769
+ );
8770
+ }
8771
+ const userContent = [
8772
+ text(textPrompt),
8773
+ {
8774
+ type: "image",
8775
+ source: {
8776
+ type: "base64",
8777
+ mediaType: detectedMime,
8778
+ data: toBase64(imageBuffer)
8779
+ }
8780
+ }
8781
+ ];
8782
+ const options = this.buildAgentOptions(userContent);
8783
+ return new Agent(AGENT_INTERNAL_KEY, options);
8784
+ }
8785
+ /**
8786
+ * Build and return an Agent configured with multimodal content.
8787
+ * More flexible than askWithImage - accepts any combination of content parts.
8788
+ *
8789
+ * @param content - Array of content parts (text, images, audio)
8790
+ * @returns A configured Agent ready for execution
8791
+ *
8792
+ * @example
8793
+ * ```typescript
8794
+ * import { text, imageFromBuffer, audioFromBuffer } from "llmist";
8795
+ *
8796
+ * const agent = LLMist.createAgent()
8797
+ * .withModel("gemini:gemini-2.5-flash")
8798
+ * .askWithContent([
8799
+ * text("Describe this image and transcribe the audio:"),
8800
+ * imageFromBuffer(imageData),
8801
+ * audioFromBuffer(audioData),
8802
+ * ]);
8803
+ *
8804
+ * for await (const event of agent.run()) {
8805
+ * // handle events
8806
+ * }
8807
+ * ```
8808
+ */
8809
+ askWithContent(content) {
8810
+ const options = this.buildAgentOptions(content);
6881
8811
  return new Agent(AGENT_INTERNAL_KEY, options);
6882
8812
  }
6883
8813
  /**
@@ -6995,7 +8925,10 @@ var COMMANDS = {
6995
8925
  complete: "complete",
6996
8926
  agent: "agent",
6997
8927
  models: "models",
6998
- gadget: "gadget"
8928
+ gadget: "gadget",
8929
+ image: "image",
8930
+ speech: "speech",
8931
+ vision: "vision"
6999
8932
  };
7000
8933
  var LOG_LEVELS = ["silly", "trace", "debug", "info", "warn", "error", "fatal"];
7001
8934
  var DEFAULT_MODEL = "openai:gpt-5-nano";
@@ -7016,7 +8949,20 @@ var OPTION_FLAGS = {
7016
8949
  docker: "--docker",
7017
8950
  dockerRo: "--docker-ro",
7018
8951
  noDocker: "--no-docker",
7019
- dockerDev: "--docker-dev"
8952
+ dockerDev: "--docker-dev",
8953
+ // Multimodal input options
8954
+ inputImage: "--image <path>",
8955
+ inputAudio: "--audio <path>",
8956
+ // Image generation options
8957
+ imageSize: "--size <size>",
8958
+ imageQuality: "--quality <quality>",
8959
+ imageCount: "-n, --count <number>",
8960
+ imageOutput: "-o, --output <path>",
8961
+ // Speech generation options
8962
+ voice: "--voice <name>",
8963
+ speechFormat: "--format <format>",
8964
+ speechSpeed: "--speed <value>",
8965
+ speechOutput: "-o, --output <path>"
7020
8966
  };
7021
8967
  var OPTION_DESCRIPTIONS = {
7022
8968
  model: "Model identifier, e.g. openai:gpt-5-nano or anthropic:claude-sonnet-4-5.",
@@ -7032,10 +8978,23 @@ var OPTION_DESCRIPTIONS = {
7032
8978
  noBuiltins: "Disable built-in gadgets (AskUser, TellUser).",
7033
8979
  noBuiltinInteraction: "Disable interactive gadgets (AskUser) while keeping TellUser.",
7034
8980
  quiet: "Suppress all output except content (text and TellUser messages).",
8981
+ // Multimodal input descriptions
8982
+ inputImage: "Image file to include with the prompt (vision models).",
8983
+ inputAudio: "Audio file to include with the prompt (Gemini only).",
7035
8984
  docker: "Run agent in a Docker sandbox container for security isolation.",
7036
8985
  dockerRo: "Run in Docker with current directory mounted read-only.",
7037
8986
  noDocker: "Disable Docker sandboxing (override config).",
7038
- dockerDev: "Run in Docker dev mode (mount local source instead of npm install)."
8987
+ dockerDev: "Run in Docker dev mode (mount local source instead of npm install).",
8988
+ // Image generation descriptions
8989
+ imageSize: "Image size/aspect ratio, e.g. '1024x1024', '1:1', '16:9'.",
8990
+ imageQuality: "Image quality: 'standard', 'hd', 'low', 'medium', 'high'.",
8991
+ imageCount: "Number of images to generate (model dependent, usually 1-4).",
8992
+ imageOutput: "Output path for the generated image. Defaults to stdout if not specified.",
8993
+ // Speech generation descriptions
8994
+ voice: "Voice name for speech generation, e.g. 'nova', 'alloy', 'Zephyr'.",
8995
+ speechFormat: "Audio format: 'mp3', 'opus', 'aac', 'flac', 'wav', 'pcm'.",
8996
+ speechSpeed: "Speech speed multiplier (0.25 to 4.0, default 1.0).",
8997
+ speechOutput: "Output path for audio file. Defaults to stdout if not specified."
7039
8998
  };
7040
8999
  var SUMMARY_PREFIX = "[llmist]";
7041
9000
 
@@ -7045,7 +9004,7 @@ var import_commander2 = require("commander");
7045
9004
  // package.json
7046
9005
  var package_default = {
7047
9006
  name: "llmist",
7048
- version: "2.2.0",
9007
+ version: "2.5.0",
7049
9008
  description: "TypeScript LLM client with streaming tool execution. Tools fire mid-stream. Built-in function calling works with any model\u2014no structured outputs or native tool support required.",
7050
9009
  type: "module",
7051
9010
  main: "dist/index.cjs",
@@ -7167,7 +9126,7 @@ var package_default = {
7167
9126
  };
7168
9127
 
7169
9128
  // src/cli/agent-command.ts
7170
- var import_promises3 = require("readline/promises");
9129
+ var import_promises4 = require("readline/promises");
7171
9130
  var import_chalk5 = __toESM(require("chalk"), 1);
7172
9131
  init_builder();
7173
9132
 
@@ -7185,6 +9144,7 @@ function isAbortError(error) {
7185
9144
  }
7186
9145
 
7187
9146
  // src/cli/agent-command.ts
9147
+ init_input_content();
7188
9148
  init_registry();
7189
9149
  init_constants2();
7190
9150
 
@@ -7509,15 +9469,84 @@ var finish = createGadget({
7509
9469
  });
7510
9470
  var builtinGadgets = [askUser, tellUser, finish];
7511
9471
 
9472
+ // src/cli/file-utils.ts
9473
+ var import_promises2 = require("fs/promises");
9474
+ var import_node_path3 = require("path");
9475
+ init_input_content();
9476
+ var DEFAULT_MAX_FILE_SIZE = 50 * 1024 * 1024;
9477
+ function formatFileSize(bytes) {
9478
+ if (bytes < 1024) return `${bytes} bytes`;
9479
+ if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)} KB`;
9480
+ if (bytes < 1024 * 1024 * 1024) return `${(bytes / (1024 * 1024)).toFixed(1)} MB`;
9481
+ return `${(bytes / (1024 * 1024 * 1024)).toFixed(1)} GB`;
9482
+ }
9483
+ async function checkFileSize(absolutePath, filePath, maxSize) {
9484
+ const stats = await (0, import_promises2.stat)(absolutePath);
9485
+ if (stats.size > maxSize) {
9486
+ throw new Error(
9487
+ `File "${filePath}" is too large (${formatFileSize(stats.size)}). Maximum allowed size is ${formatFileSize(maxSize)}. Consider compressing the file or using a smaller version.`
9488
+ );
9489
+ }
9490
+ }
9491
+ async function readImageFile(filePath, options = {}) {
9492
+ const absolutePath = (0, import_node_path3.resolve)(filePath);
9493
+ const maxFileSize = options.maxFileSize ?? DEFAULT_MAX_FILE_SIZE;
9494
+ let buffer;
9495
+ try {
9496
+ await checkFileSize(absolutePath, filePath, maxFileSize);
9497
+ buffer = await (0, import_promises2.readFile)(absolutePath);
9498
+ } catch (error) {
9499
+ const message = error instanceof Error ? error.message : String(error);
9500
+ throw new Error(`Failed to read image file "${filePath}": ${message}`);
9501
+ }
9502
+ const mimeType = detectImageMimeType(buffer);
9503
+ if (!mimeType) {
9504
+ throw new Error(
9505
+ `File "${filePath}" is not a supported image format. Supported formats: JPEG, PNG, GIF, WebP`
9506
+ );
9507
+ }
9508
+ return imageFromBuffer(buffer, mimeType);
9509
+ }
9510
+ async function readAudioFile(filePath, options = {}) {
9511
+ const absolutePath = (0, import_node_path3.resolve)(filePath);
9512
+ const maxFileSize = options.maxFileSize ?? DEFAULT_MAX_FILE_SIZE;
9513
+ let buffer;
9514
+ try {
9515
+ await checkFileSize(absolutePath, filePath, maxFileSize);
9516
+ buffer = await (0, import_promises2.readFile)(absolutePath);
9517
+ } catch (error) {
9518
+ const message = error instanceof Error ? error.message : String(error);
9519
+ throw new Error(`Failed to read audio file "${filePath}": ${message}`);
9520
+ }
9521
+ const mimeType = detectAudioMimeType(buffer);
9522
+ if (!mimeType) {
9523
+ throw new Error(
9524
+ `File "${filePath}" is not a supported audio format. Supported formats: MP3, WAV, OGG, WebM`
9525
+ );
9526
+ }
9527
+ return audioFromBuffer(buffer, mimeType);
9528
+ }
9529
+ async function readFileBuffer(filePath, options = {}) {
9530
+ const absolutePath = (0, import_node_path3.resolve)(filePath);
9531
+ const maxFileSize = options.maxFileSize ?? DEFAULT_MAX_FILE_SIZE;
9532
+ try {
9533
+ await checkFileSize(absolutePath, filePath, maxFileSize);
9534
+ return await (0, import_promises2.readFile)(absolutePath);
9535
+ } catch (error) {
9536
+ const message = error instanceof Error ? error.message : String(error);
9537
+ throw new Error(`Failed to read file "${filePath}": ${message}`);
9538
+ }
9539
+ }
9540
+
7512
9541
  // src/cli/gadgets.ts
7513
9542
  var import_node_fs7 = __toESM(require("fs"), 1);
7514
- var import_node_path6 = __toESM(require("path"), 1);
9543
+ var import_node_path7 = __toESM(require("path"), 1);
7515
9544
  var import_node_url = require("url");
7516
9545
  init_gadget();
7517
9546
 
7518
9547
  // src/cli/builtins/filesystem/list-directory.ts
7519
9548
  var import_node_fs4 = __toESM(require("fs"), 1);
7520
- var import_node_path4 = __toESM(require("path"), 1);
9549
+ var import_node_path5 = __toESM(require("path"), 1);
7521
9550
  var import_zod4 = require("zod");
7522
9551
 
7523
9552
  // src/index.ts
@@ -7541,6 +9570,7 @@ init_prompt_config();
7541
9570
 
7542
9571
  // src/index.ts
7543
9572
  init_client();
9573
+ init_input_content();
7544
9574
  init_messages();
7545
9575
  init_model_registry();
7546
9576
  init_model_shortcuts();
@@ -7571,6 +9601,10 @@ init_logger();
7571
9601
  // src/testing/mock-stream.ts
7572
9602
  init_constants();
7573
9603
 
9604
+ // src/testing/mock-builder.ts
9605
+ init_input_content();
9606
+ init_messages();
9607
+
7574
9608
  // src/testing/mock-client.ts
7575
9609
  init_client();
7576
9610
 
@@ -7582,7 +9616,7 @@ var import_node_stream = require("stream");
7582
9616
 
7583
9617
  // src/cli/builtins/filesystem/utils.ts
7584
9618
  var import_node_fs3 = __toESM(require("fs"), 1);
7585
- var import_node_path3 = __toESM(require("path"), 1);
9619
+ var import_node_path4 = __toESM(require("path"), 1);
7586
9620
  var PathSandboxException = class extends Error {
7587
9621
  constructor(inputPath, reason) {
7588
9622
  super(`Path access denied: ${inputPath}. ${reason}`);
@@ -7591,7 +9625,7 @@ var PathSandboxException = class extends Error {
7591
9625
  };
7592
9626
  function validatePathIsWithinCwd(inputPath) {
7593
9627
  const cwd = process.cwd();
7594
- const resolvedPath = import_node_path3.default.resolve(cwd, inputPath);
9628
+ const resolvedPath = import_node_path4.default.resolve(cwd, inputPath);
7595
9629
  let finalPath;
7596
9630
  try {
7597
9631
  finalPath = import_node_fs3.default.realpathSync(resolvedPath);
@@ -7603,7 +9637,7 @@ function validatePathIsWithinCwd(inputPath) {
7603
9637
  throw error;
7604
9638
  }
7605
9639
  }
7606
- const cwdWithSep = cwd + import_node_path3.default.sep;
9640
+ const cwdWithSep = cwd + import_node_path4.default.sep;
7607
9641
  if (!finalPath.startsWith(cwdWithSep) && finalPath !== cwd) {
7608
9642
  throw new PathSandboxException(inputPath, "Path is outside the current working directory");
7609
9643
  }
@@ -7616,8 +9650,8 @@ function listFiles(dirPath, basePath = dirPath, maxDepth = 1, currentDepth = 1)
7616
9650
  try {
7617
9651
  const items = import_node_fs4.default.readdirSync(dirPath);
7618
9652
  for (const item of items) {
7619
- const fullPath = import_node_path4.default.join(dirPath, item);
7620
- const relativePath = import_node_path4.default.relative(basePath, fullPath);
9653
+ const fullPath = import_node_path5.default.join(dirPath, item);
9654
+ const relativePath = import_node_path5.default.relative(basePath, fullPath);
7621
9655
  try {
7622
9656
  const stats = import_node_fs4.default.lstatSync(fullPath);
7623
9657
  let type;
@@ -7732,7 +9766,7 @@ ${formattedList}`;
7732
9766
  // src/cli/builtins/filesystem/read-file.ts
7733
9767
  var import_node_fs5 = __toESM(require("fs"), 1);
7734
9768
  var import_zod5 = require("zod");
7735
- var readFile = createGadget({
9769
+ var readFile2 = createGadget({
7736
9770
  name: "ReadFile",
7737
9771
  description: "Read the entire content of a file and return it as text. The file path must be within the current working directory or its subdirectories.",
7738
9772
  schema: import_zod5.z.object({
@@ -7761,7 +9795,7 @@ ${content}`;
7761
9795
 
7762
9796
  // src/cli/builtins/filesystem/write-file.ts
7763
9797
  var import_node_fs6 = __toESM(require("fs"), 1);
7764
- var import_node_path5 = __toESM(require("path"), 1);
9798
+ var import_node_path6 = __toESM(require("path"), 1);
7765
9799
  var import_zod6 = require("zod");
7766
9800
  var writeFile = createGadget({
7767
9801
  name: "WriteFile",
@@ -7796,7 +9830,7 @@ console.log(\`Server running on http://localhost:\${port}\`);`
7796
9830
  ],
7797
9831
  execute: ({ filePath, content }) => {
7798
9832
  const validatedPath = validatePathIsWithinCwd(filePath);
7799
- const parentDir = import_node_path5.default.dirname(validatedPath);
9833
+ const parentDir = import_node_path6.default.dirname(validatedPath);
7800
9834
  let createdDir = false;
7801
9835
  if (!import_node_fs6.default.existsSync(parentDir)) {
7802
9836
  validatePathIsWithinCwd(parentDir);
@@ -7805,7 +9839,7 @@ console.log(\`Server running on http://localhost:\${port}\`);`
7805
9839
  }
7806
9840
  import_node_fs6.default.writeFileSync(validatedPath, content, "utf-8");
7807
9841
  const bytesWritten = Buffer.byteLength(content, "utf-8");
7808
- const dirNote = createdDir ? ` (created directory: ${import_node_path5.default.dirname(filePath)})` : "";
9842
+ const dirNote = createdDir ? ` (created directory: ${import_node_path6.default.dirname(filePath)})` : "";
7809
9843
  return `path=${filePath}
7810
9844
 
7811
9845
  Wrote ${bytesWritten} bytes${dirNote}`;
@@ -8003,7 +10037,7 @@ error: ${message}`;
8003
10037
  // src/cli/builtins/index.ts
8004
10038
  var builtinGadgetRegistry = {
8005
10039
  ListDirectory: listDirectory,
8006
- ReadFile: readFile,
10040
+ ReadFile: readFile2,
8007
10041
  WriteFile: writeFile,
8008
10042
  EditFile: editFile,
8009
10043
  RunCommand: runCommand
@@ -8040,10 +10074,10 @@ function expandHomePath(input) {
8040
10074
  if (!home) {
8041
10075
  return input;
8042
10076
  }
8043
- return import_node_path6.default.join(home, input.slice(1));
10077
+ return import_node_path7.default.join(home, input.slice(1));
8044
10078
  }
8045
10079
  function isFileLikeSpecifier(specifier) {
8046
- return PATH_PREFIXES.some((prefix) => specifier.startsWith(prefix)) || specifier.includes(import_node_path6.default.sep);
10080
+ return PATH_PREFIXES.some((prefix) => specifier.startsWith(prefix)) || specifier.includes(import_node_path7.default.sep);
8047
10081
  }
8048
10082
  function tryResolveBuiltin(specifier) {
8049
10083
  if (specifier.startsWith(BUILTIN_PREFIX)) {
@@ -8066,7 +10100,7 @@ function resolveGadgetSpecifier(specifier, cwd) {
8066
10100
  return specifier;
8067
10101
  }
8068
10102
  const expanded = expandHomePath(specifier);
8069
- const resolvedPath = import_node_path6.default.resolve(cwd, expanded);
10103
+ const resolvedPath = import_node_path7.default.resolve(cwd, expanded);
8070
10104
  if (!import_node_fs7.default.existsSync(resolvedPath)) {
8071
10105
  throw new Error(`Gadget module not found at ${resolvedPath}`);
8072
10106
  }
@@ -8138,13 +10172,14 @@ async function loadGadgets(specifiers, cwd, importer = (specifier) => import(spe
8138
10172
  }
8139
10173
 
8140
10174
  // src/cli/llm-logging.ts
8141
- var import_promises2 = require("fs/promises");
10175
+ var import_promises3 = require("fs/promises");
8142
10176
  var import_node_os = require("os");
8143
- var import_node_path7 = require("path");
8144
- var DEFAULT_LLM_LOG_DIR = (0, import_node_path7.join)((0, import_node_os.homedir)(), ".llmist", "logs");
10177
+ var import_node_path8 = require("path");
10178
+ init_messages();
10179
+ var DEFAULT_LLM_LOG_DIR = (0, import_node_path8.join)((0, import_node_os.homedir)(), ".llmist", "logs");
8145
10180
  function resolveLogDir(option, subdir) {
8146
10181
  if (option === true) {
8147
- return (0, import_node_path7.join)(DEFAULT_LLM_LOG_DIR, subdir);
10182
+ return (0, import_node_path8.join)(DEFAULT_LLM_LOG_DIR, subdir);
8148
10183
  }
8149
10184
  if (typeof option === "string") {
8150
10185
  return option;
@@ -8155,14 +10190,14 @@ function formatLlmRequest(messages) {
8155
10190
  const lines = [];
8156
10191
  for (const msg of messages) {
8157
10192
  lines.push(`=== ${msg.role.toUpperCase()} ===`);
8158
- lines.push(msg.content ?? "");
10193
+ lines.push(msg.content ? extractText(msg.content) : "");
8159
10194
  lines.push("");
8160
10195
  }
8161
10196
  return lines.join("\n");
8162
10197
  }
8163
10198
  async function writeLogFile(dir, filename, content) {
8164
- await (0, import_promises2.mkdir)(dir, { recursive: true });
8165
- await (0, import_promises2.writeFile)((0, import_node_path7.join)(dir, filename), content, "utf-8");
10199
+ await (0, import_promises3.mkdir)(dir, { recursive: true });
10200
+ await (0, import_promises3.writeFile)((0, import_node_path8.join)(dir, filename), content, "utf-8");
8166
10201
  }
8167
10202
  function formatSessionTimestamp(date = /* @__PURE__ */ new Date()) {
8168
10203
  const pad = (n) => n.toString().padStart(2, "0");
@@ -8176,9 +10211,9 @@ function formatSessionTimestamp(date = /* @__PURE__ */ new Date()) {
8176
10211
  }
8177
10212
  async function createSessionDir(baseDir) {
8178
10213
  const timestamp = formatSessionTimestamp();
8179
- const sessionDir = (0, import_node_path7.join)(baseDir, timestamp);
10214
+ const sessionDir = (0, import_node_path8.join)(baseDir, timestamp);
8180
10215
  try {
8181
- await (0, import_promises2.mkdir)(sessionDir, { recursive: true });
10216
+ await (0, import_promises3.mkdir)(sessionDir, { recursive: true });
8182
10217
  return sessionDir;
8183
10218
  } catch (error) {
8184
10219
  console.warn(`[llmist] Failed to create log session directory: ${sessionDir}`, error);
@@ -8229,9 +10264,9 @@ function ensureMarkedConfigured() {
8229
10264
  markedConfigured = true;
8230
10265
  }
8231
10266
  }
8232
- function renderMarkdown(text) {
10267
+ function renderMarkdown(text3) {
8233
10268
  ensureMarkedConfigured();
8234
- let rendered = import_marked.marked.parse(text);
10269
+ let rendered = import_marked.marked.parse(text3);
8235
10270
  rendered = rendered.replace(/\*\*(.+?)\*\*/g, (_, content) => import_chalk3.default.bold(content)).replace(/(?<!\*)\*(\S[^*]*)\*(?!\*)/g, (_, content) => import_chalk3.default.italic(content));
8236
10271
  return rendered.trimEnd();
8237
10272
  }
@@ -8245,8 +10280,8 @@ function createRainbowSeparator() {
8245
10280
  }
8246
10281
  return result;
8247
10282
  }
8248
- function renderMarkdownWithSeparators(text) {
8249
- const rendered = renderMarkdown(text);
10283
+ function renderMarkdownWithSeparators(text3) {
10284
+ const rendered = renderMarkdown(text3);
8250
10285
  const separator = createRainbowSeparator();
8251
10286
  return `
8252
10287
  ${separator}
@@ -8414,12 +10449,12 @@ var StreamPrinter = class {
8414
10449
  *
8415
10450
  * @param text - Text to write
8416
10451
  */
8417
- write(text) {
8418
- if (!text) {
10452
+ write(text3) {
10453
+ if (!text3) {
8419
10454
  return;
8420
10455
  }
8421
- this.target.write(text);
8422
- this.endedWithNewline = text.endsWith("\n");
10456
+ this.target.write(text3);
10457
+ this.endedWithNewline = text3.endsWith("\n");
8423
10458
  }
8424
10459
  /**
8425
10460
  * Ensures output ends with a newline by writing one if needed.
@@ -8898,7 +10933,7 @@ function addCompleteOptions(cmd, defaults) {
8898
10933
  OPTION_DESCRIPTIONS.maxTokens,
8899
10934
  createNumericParser({ label: "Max tokens", integer: true, min: 1 }),
8900
10935
  defaults?.["max-tokens"]
8901
- ).option(OPTION_FLAGS.quiet, OPTION_DESCRIPTIONS.quiet, defaults?.quiet).option(OPTION_FLAGS.logLlmRequests, OPTION_DESCRIPTIONS.logLlmRequests, defaults?.["log-llm-requests"]);
10936
+ ).option(OPTION_FLAGS.quiet, OPTION_DESCRIPTIONS.quiet, defaults?.quiet).option(OPTION_FLAGS.logLlmRequests, OPTION_DESCRIPTIONS.logLlmRequests, defaults?.["log-llm-requests"]).option(OPTION_FLAGS.inputImage, OPTION_DESCRIPTIONS.inputImage).option(OPTION_FLAGS.inputAudio, OPTION_DESCRIPTIONS.inputAudio);
8902
10937
  }
8903
10938
  function addAgentOptions(cmd, defaults) {
8904
10939
  const gadgetAccumulator = (value, previous = []) => [
@@ -8922,7 +10957,7 @@ function addAgentOptions(cmd, defaults) {
8922
10957
  OPTION_FLAGS.noBuiltinInteraction,
8923
10958
  OPTION_DESCRIPTIONS.noBuiltinInteraction,
8924
10959
  defaults?.["builtin-interaction"] !== false
8925
- ).option(OPTION_FLAGS.quiet, OPTION_DESCRIPTIONS.quiet, defaults?.quiet).option(OPTION_FLAGS.logLlmRequests, OPTION_DESCRIPTIONS.logLlmRequests, defaults?.["log-llm-requests"]).option(OPTION_FLAGS.docker, OPTION_DESCRIPTIONS.docker).option(OPTION_FLAGS.dockerRo, OPTION_DESCRIPTIONS.dockerRo).option(OPTION_FLAGS.noDocker, OPTION_DESCRIPTIONS.noDocker).option(OPTION_FLAGS.dockerDev, OPTION_DESCRIPTIONS.dockerDev);
10960
+ ).option(OPTION_FLAGS.quiet, OPTION_DESCRIPTIONS.quiet, defaults?.quiet).option(OPTION_FLAGS.logLlmRequests, OPTION_DESCRIPTIONS.logLlmRequests, defaults?.["log-llm-requests"]).option(OPTION_FLAGS.inputImage, OPTION_DESCRIPTIONS.inputImage).option(OPTION_FLAGS.inputAudio, OPTION_DESCRIPTIONS.inputAudio).option(OPTION_FLAGS.docker, OPTION_DESCRIPTIONS.docker).option(OPTION_FLAGS.dockerRo, OPTION_DESCRIPTIONS.dockerRo).option(OPTION_FLAGS.noDocker, OPTION_DESCRIPTIONS.noDocker).option(OPTION_FLAGS.dockerDev, OPTION_DESCRIPTIONS.dockerDev);
8926
10961
  }
8927
10962
  function configToCompleteOptions(config) {
8928
10963
  const result = {};
@@ -8989,7 +11024,7 @@ var DEV_SOURCE_MOUNT_TARGET = "/llmist-src";
8989
11024
  // src/cli/config.ts
8990
11025
  var import_node_fs8 = require("fs");
8991
11026
  var import_node_os2 = require("os");
8992
- var import_node_path8 = require("path");
11027
+ var import_node_path9 = require("path");
8993
11028
  var import_js_toml = require("js-toml");
8994
11029
 
8995
11030
  // src/cli/templates.ts
@@ -9127,6 +11162,22 @@ var AGENT_CONFIG_KEYS = /* @__PURE__ */ new Set([
9127
11162
  "docker-cwd-permission"
9128
11163
  // Override CWD mount permission for this profile
9129
11164
  ]);
11165
+ var IMAGE_CONFIG_KEYS = /* @__PURE__ */ new Set([
11166
+ "model",
11167
+ "size",
11168
+ "quality",
11169
+ "count",
11170
+ "output",
11171
+ "quiet"
11172
+ ]);
11173
+ var SPEECH_CONFIG_KEYS = /* @__PURE__ */ new Set([
11174
+ "model",
11175
+ "voice",
11176
+ "format",
11177
+ "speed",
11178
+ "output",
11179
+ "quiet"
11180
+ ]);
9130
11181
  var CUSTOM_CONFIG_KEYS = /* @__PURE__ */ new Set([
9131
11182
  ...COMPLETE_CONFIG_KEYS,
9132
11183
  ...AGENT_CONFIG_KEYS,
@@ -9134,7 +11185,7 @@ var CUSTOM_CONFIG_KEYS = /* @__PURE__ */ new Set([
9134
11185
  "description"
9135
11186
  ]);
9136
11187
  function getConfigPath() {
9137
- return (0, import_node_path8.join)((0, import_node_os2.homedir)(), ".llmist", "cli.toml");
11188
+ return (0, import_node_path9.join)((0, import_node_os2.homedir)(), ".llmist", "cli.toml");
9138
11189
  }
9139
11190
  var ConfigError = class extends Error {
9140
11191
  constructor(message, path5) {
@@ -9387,6 +11438,75 @@ function validateAgentConfig(raw, section) {
9387
11438
  }
9388
11439
  return result;
9389
11440
  }
11441
+ function validateImageConfig(raw, section) {
11442
+ if (typeof raw !== "object" || raw === null) {
11443
+ throw new ConfigError(`[${section}] must be a table`);
11444
+ }
11445
+ const rawObj = raw;
11446
+ for (const key of Object.keys(rawObj)) {
11447
+ if (!IMAGE_CONFIG_KEYS.has(key)) {
11448
+ throw new ConfigError(`[${section}].${key} is not a valid option`);
11449
+ }
11450
+ }
11451
+ const result = {};
11452
+ if ("model" in rawObj) {
11453
+ result.model = validateString(rawObj.model, "model", section);
11454
+ }
11455
+ if ("size" in rawObj) {
11456
+ result.size = validateString(rawObj.size, "size", section);
11457
+ }
11458
+ if ("quality" in rawObj) {
11459
+ result.quality = validateString(rawObj.quality, "quality", section);
11460
+ }
11461
+ if ("count" in rawObj) {
11462
+ result.count = validateNumber(rawObj.count, "count", section, {
11463
+ integer: true,
11464
+ min: 1,
11465
+ max: 10
11466
+ });
11467
+ }
11468
+ if ("output" in rawObj) {
11469
+ result.output = validateString(rawObj.output, "output", section);
11470
+ }
11471
+ if ("quiet" in rawObj) {
11472
+ result.quiet = validateBoolean(rawObj.quiet, "quiet", section);
11473
+ }
11474
+ return result;
11475
+ }
11476
+ function validateSpeechConfig(raw, section) {
11477
+ if (typeof raw !== "object" || raw === null) {
11478
+ throw new ConfigError(`[${section}] must be a table`);
11479
+ }
11480
+ const rawObj = raw;
11481
+ for (const key of Object.keys(rawObj)) {
11482
+ if (!SPEECH_CONFIG_KEYS.has(key)) {
11483
+ throw new ConfigError(`[${section}].${key} is not a valid option`);
11484
+ }
11485
+ }
11486
+ const result = {};
11487
+ if ("model" in rawObj) {
11488
+ result.model = validateString(rawObj.model, "model", section);
11489
+ }
11490
+ if ("voice" in rawObj) {
11491
+ result.voice = validateString(rawObj.voice, "voice", section);
11492
+ }
11493
+ if ("format" in rawObj) {
11494
+ result.format = validateString(rawObj.format, "format", section);
11495
+ }
11496
+ if ("speed" in rawObj) {
11497
+ result.speed = validateNumber(rawObj.speed, "speed", section, {
11498
+ min: 0.25,
11499
+ max: 4
11500
+ });
11501
+ }
11502
+ if ("output" in rawObj) {
11503
+ result.output = validateString(rawObj.output, "output", section);
11504
+ }
11505
+ if ("quiet" in rawObj) {
11506
+ result.quiet = validateBoolean(rawObj.quiet, "quiet", section);
11507
+ }
11508
+ return result;
11509
+ }
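The two validators above define what the new [image] and [speech] tables in ~/.llmist/cli.toml may contain. A minimal sketch of objects that would pass them is below; the key names and numeric ranges (count: integer 1-10, speed: 0.25-4) come from this hunk, while the concrete values (size, quality, format, output paths) are illustrative assumptions, not values documented in this diff.

    // Sketch only: plain objects shaped like the parsed [image] and [speech]
    // tables that validateImageConfig/validateSpeechConfig would accept.
    const imageSection = {
      model: "dall-e-3",   // default image model used elsewhere in this diff
      size: "1024x1024",   // assumed value; validated only as a string here
      quality: "hd",       // assumed value; validated only as a string here
      count: 2,            // must be an integer between 1 and 10
      output: "out.png",   // assumed path
      quiet: false,
    };
    const speechSection = {
      model: "tts-1",      // default speech model used elsewhere in this diff
      voice: "nova",       // default voice used elsewhere in this diff
      format: "mp3",       // assumed value; validated only as a string here
      speed: 1.5,          // must be between 0.25 and 4
      output: "out.mp3",   // assumed path
      quiet: false,
    };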
9390
11510
  function validateStringOrBoolean(value, field, section) {
9391
11511
  if (typeof value === "string" || typeof value === "boolean") {
9392
11512
  return value;
@@ -9509,6 +11629,10 @@ function validateConfig(raw, configPath) {
9509
11629
  result.complete = validateCompleteConfig(value, key);
9510
11630
  } else if (key === "agent") {
9511
11631
  result.agent = validateAgentConfig(value, key);
11632
+ } else if (key === "image") {
11633
+ result.image = validateImageConfig(value, key);
11634
+ } else if (key === "speech") {
11635
+ result.speech = validateSpeechConfig(value, key);
9512
11636
  } else if (key === "prompts") {
9513
11637
  result.prompts = validatePromptsConfig(value, key);
9514
11638
  } else if (key === "docker") {
@@ -9553,7 +11677,7 @@ function loadConfig() {
9553
11677
  return resolveTemplatesInConfig(inherited, configPath);
9554
11678
  }
9555
11679
  function getCustomCommandNames(config) {
9556
- const reserved = /* @__PURE__ */ new Set(["global", "complete", "agent", "prompts", "docker"]);
11680
+ const reserved = /* @__PURE__ */ new Set(["global", "complete", "agent", "image", "speech", "prompts", "docker"]);
9557
11681
  return Object.keys(config).filter((key) => !reserved.has(key));
9558
11682
  }
9559
11683
  function resolveTemplatesInConfig(config, configPath) {
@@ -9908,8 +12032,8 @@ function computeDockerfileHash(dockerfile) {
9908
12032
  // src/cli/docker/image-manager.ts
9909
12033
  var import_node_fs9 = require("fs");
9910
12034
  var import_node_os3 = require("os");
9911
- var import_node_path9 = require("path");
9912
- var CACHE_DIR = (0, import_node_path9.join)((0, import_node_os3.homedir)(), ".llmist", "docker-cache");
12035
+ var import_node_path10 = require("path");
12036
+ var CACHE_DIR = (0, import_node_path10.join)((0, import_node_os3.homedir)(), ".llmist", "docker-cache");
9913
12037
  var HASH_FILE = "image-hash.json";
9914
12038
  function ensureCacheDir() {
9915
12039
  if (!(0, import_node_fs9.existsSync)(CACHE_DIR)) {
@@ -9917,7 +12041,7 @@ function ensureCacheDir() {
9917
12041
  }
9918
12042
  }
9919
12043
  function getCachedHash(imageName) {
9920
- const hashPath = (0, import_node_path9.join)(CACHE_DIR, HASH_FILE);
12044
+ const hashPath = (0, import_node_path10.join)(CACHE_DIR, HASH_FILE);
9921
12045
  if (!(0, import_node_fs9.existsSync)(hashPath)) {
9922
12046
  return void 0;
9923
12047
  }
@@ -9931,7 +12055,7 @@ function getCachedHash(imageName) {
9931
12055
  }
9932
12056
  function setCachedHash(imageName, hash) {
9933
12057
  ensureCacheDir();
9934
- const hashPath = (0, import_node_path9.join)(CACHE_DIR, HASH_FILE);
12058
+ const hashPath = (0, import_node_path10.join)(CACHE_DIR, HASH_FILE);
9935
12059
  let cache = {};
9936
12060
  if ((0, import_node_fs9.existsSync)(hashPath)) {
9937
12061
  try {
@@ -9957,7 +12081,7 @@ var DockerBuildError = class extends Error {
9957
12081
  };
9958
12082
  async function buildImage(imageName, dockerfile) {
9959
12083
  ensureCacheDir();
9960
- const dockerfilePath = (0, import_node_path9.join)(CACHE_DIR, "Dockerfile");
12084
+ const dockerfilePath = (0, import_node_path10.join)(CACHE_DIR, "Dockerfile");
9961
12085
  (0, import_node_fs9.writeFileSync)(dockerfilePath, dockerfile);
9962
12086
  const proc = Bun.spawn(
9963
12087
  ["docker", "build", "-t", imageName, "-f", dockerfilePath, CACHE_DIR],
@@ -9992,7 +12116,7 @@ async function ensureImage(imageName = DEFAULT_IMAGE_NAME, dockerfile) {
9992
12116
 
9993
12117
  // src/cli/docker/docker-wrapper.ts
9994
12118
  var import_node_fs10 = require("fs");
9995
- var import_node_path10 = require("path");
12119
+ var import_node_path11 = require("path");
9996
12120
  var import_node_os4 = require("os");
9997
12121
  var DockerUnavailableError = class extends Error {
9998
12122
  constructor() {
@@ -10038,9 +12162,9 @@ function autoDetectDevSource() {
10038
12162
  if (!scriptPath || !scriptPath.endsWith("src/cli.ts")) {
10039
12163
  return void 0;
10040
12164
  }
10041
- const srcDir = (0, import_node_path10.dirname)(scriptPath);
10042
- const projectDir = (0, import_node_path10.dirname)(srcDir);
10043
- const packageJsonPath = (0, import_node_path10.join)(projectDir, "package.json");
12165
+ const srcDir = (0, import_node_path11.dirname)(scriptPath);
12166
+ const projectDir = (0, import_node_path11.dirname)(srcDir);
12167
+ const packageJsonPath = (0, import_node_path11.join)(projectDir, "package.json");
10044
12168
  if (!(0, import_node_fs10.existsSync)(packageJsonPath)) {
10045
12169
  return void 0;
10046
12170
  }
@@ -10189,7 +12313,7 @@ function createHumanInputHandler(env, progress, keyboard) {
10189
12313
  keyboard.cleanupEsc();
10190
12314
  keyboard.cleanupEsc = null;
10191
12315
  }
10192
- const rl = (0, import_promises3.createInterface)({ input: env.stdin, output: env.stdout });
12316
+ const rl = (0, import_promises4.createInterface)({ input: env.stdin, output: env.stdout });
10193
12317
  try {
10194
12318
  const questionLine = question.trim() ? `
10195
12319
  ${renderMarkdownWithSeparators(question.trim())}` : "";
@@ -10547,8 +12671,8 @@ Denied: ${result.reason ?? "by user"}`
10547
12671
  builder.withTextOnlyHandler("acknowledge");
10548
12672
  builder.withTextWithGadgetsHandler({
10549
12673
  gadgetName: "TellUser",
10550
- parameterMapping: (text) => ({ message: text, done: false, type: "info" }),
10551
- resultMapping: (text) => `\u2139\uFE0F ${text}`
12674
+ parameterMapping: (text3) => ({ message: text3, done: false, type: "info" }),
12675
+ resultMapping: (text3) => `\u2139\uFE0F ${text3}`
10552
12676
  });
10553
12677
  builder.withTrailingMessage(
10554
12678
  (ctx) => [
@@ -10557,7 +12681,19 @@ Denied: ${result.reason ?? "by user"}`
10557
12681
  "Maximize efficiency by batching independent operations in a single response."
10558
12682
  ].join(" ")
10559
12683
  );
10560
- const agent = builder.ask(prompt);
12684
+ let agent;
12685
+ if (options.image || options.audio) {
12686
+ const parts = [text(prompt)];
12687
+ if (options.image) {
12688
+ parts.push(await readImageFile(options.image));
12689
+ }
12690
+ if (options.audio) {
12691
+ parts.push(await readAudioFile(options.audio));
12692
+ }
12693
+ agent = builder.askWithContent(parts);
12694
+ } else {
12695
+ agent = builder.ask(prompt);
12696
+ }
10561
12697
  let textBuffer = "";
10562
12698
  const flushTextBuffer = () => {
10563
12699
  if (textBuffer) {
@@ -10632,6 +12768,7 @@ function registerAgentCommand(program, env, config) {
10632
12768
  }
10633
12769
 
10634
12770
  // src/cli/complete-command.ts
12771
+ init_input_content();
10635
12772
  init_messages();
10636
12773
  init_model_shortcuts();
10637
12774
  init_constants2();
@@ -10643,7 +12780,18 @@ async function executeComplete(promptArg, options, env) {
10643
12780
  if (options.system) {
10644
12781
  builder.addSystem(options.system);
10645
12782
  }
10646
- builder.addUser(prompt);
12783
+ if (options.image || options.audio) {
12784
+ const parts = [text(prompt)];
12785
+ if (options.image) {
12786
+ parts.push(await readImageFile(options.image));
12787
+ }
12788
+ if (options.audio) {
12789
+ parts.push(await readAudioFile(options.audio));
12790
+ }
12791
+ builder.addUserMultimodal(parts);
12792
+ } else {
12793
+ builder.addUser(prompt);
12794
+ }
10647
12795
  const messages = builder.build();
10648
12796
  const llmLogsBaseDir = resolveLogDir(options.logLlmRequests, "requests");
10649
12797
  let llmSessionDir;
@@ -10718,7 +12866,7 @@ init_schema_to_json();
10718
12866
  init_schema_validator();
10719
12867
 
10720
12868
  // src/cli/gadget-prompts.ts
10721
- var import_promises4 = require("readline/promises");
12869
+ var import_promises5 = require("readline/promises");
10722
12870
  var import_chalk6 = __toESM(require("chalk"), 1);
10723
12871
  init_schema_to_json();
10724
12872
  async function promptForParameters(schema, ctx) {
@@ -10729,7 +12877,7 @@ async function promptForParameters(schema, ctx) {
10729
12877
  if (!jsonSchema.properties || Object.keys(jsonSchema.properties).length === 0) {
10730
12878
  return {};
10731
12879
  }
10732
- const rl = (0, import_promises4.createInterface)({ input: ctx.stdin, output: ctx.stdout });
12880
+ const rl = (0, import_promises5.createInterface)({ input: ctx.stdin, output: ctx.stdout });
10733
12881
  const params = {};
10734
12882
  try {
10735
12883
  for (const [key, prop] of Object.entries(jsonSchema.properties)) {
@@ -11148,19 +13296,118 @@ function registerGadgetCommand(program, env) {
11148
13296
  );
11149
13297
  }
11150
13298
 
13299
+ // src/cli/image-command.ts
13300
+ var import_node_fs11 = require("fs");
13301
+ var DEFAULT_IMAGE_MODEL = "dall-e-3";
13302
+ async function executeImage(promptArg, options, env) {
13303
+ const prompt = await resolvePrompt(promptArg, env);
13304
+ const client = env.createClient();
13305
+ const model = options.model;
13306
+ const n = options.count ? Number.parseInt(options.count, 10) : 1;
13307
+ const stderrTTY = env.stderr.isTTY === true;
13308
+ if (!options.quiet && stderrTTY) {
13309
+ env.stderr.write(`${SUMMARY_PREFIX} Generating image with ${model}...
13310
+ `);
13311
+ }
13312
+ const result = await client.image.generate({
13313
+ model,
13314
+ prompt,
13315
+ size: options.size,
13316
+ quality: options.quality,
13317
+ n,
13318
+ responseFormat: options.output ? "b64_json" : "url"
13319
+ });
13320
+ if (options.output) {
13321
+ const imageData = result.images[0];
13322
+ if (imageData.b64Json) {
13323
+ const buffer = Buffer.from(imageData.b64Json, "base64");
13324
+ (0, import_node_fs11.writeFileSync)(options.output, buffer);
13325
+ if (!options.quiet) {
13326
+ env.stderr.write(`${SUMMARY_PREFIX} Image saved to ${options.output}
13327
+ `);
13328
+ }
13329
+ } else if (imageData.url) {
13330
+ env.stdout.write(`${imageData.url}
13331
+ `);
13332
+ }
13333
+ } else {
13334
+ for (const image of result.images) {
13335
+ if (image.url) {
13336
+ env.stdout.write(`${image.url}
13337
+ `);
13338
+ } else if (image.b64Json) {
13339
+ env.stdout.write(image.b64Json);
13340
+ }
13341
+ }
13342
+ }
13343
+ if (!options.quiet && stderrTTY) {
13344
+ const parts = [
13345
+ `${result.images.length} image(s)`,
13346
+ `size: ${result.usage.size}`,
13347
+ `quality: ${result.usage.quality}`
13348
+ ];
13349
+ if (result.cost !== void 0) {
13350
+ parts.push(`cost: ${formatCost(result.cost)}`);
13351
+ }
13352
+ env.stderr.write(`${SUMMARY_PREFIX} ${parts.join(" | ")}
13353
+ `);
13354
+ }
13355
+ }
13356
+ function registerImageCommand(program, env, config) {
13357
+ program.command(COMMANDS.image).description("Generate images from a text prompt.").argument("[prompt]", "Image generation prompt. If omitted, stdin is used when available.").option(
13358
+ OPTION_FLAGS.model,
13359
+ OPTION_DESCRIPTIONS.model,
13360
+ config?.model ?? DEFAULT_IMAGE_MODEL
13361
+ ).option(OPTION_FLAGS.imageSize, OPTION_DESCRIPTIONS.imageSize, config?.size).option(OPTION_FLAGS.imageQuality, OPTION_DESCRIPTIONS.imageQuality, config?.quality).option(OPTION_FLAGS.imageCount, OPTION_DESCRIPTIONS.imageCount, config?.count?.toString()).option(OPTION_FLAGS.imageOutput, OPTION_DESCRIPTIONS.imageOutput, config?.output).option(OPTION_FLAGS.quiet, OPTION_DESCRIPTIONS.quiet, config?.quiet ?? false).action(
13362
+ (prompt, options) => executeAction(() => executeImage(prompt, options, env), env)
13363
+ );
13364
+ }
13365
+
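Taken together, executeImage above boils down to one client.image.generate call followed by either a file write (base64 response) or printing the returned URL. A minimal standalone sketch of that flow follows; the request and response field names are the ones used in this hunk, while the client construction, the prompt, the size string, and the output filename are assumptions for illustration.

    // Sketch only: assumes `client` is an already-constructed llmist client
    // exposing the same image.generate API that the CLI code above calls.
    import { writeFileSync } from "node:fs";

    async function imageExample(client: { image: { generate: (req: object) => Promise<any> } }) {
      const result = await client.image.generate({
        model: "dall-e-3",              // default of the new `image` command
        prompt: "a lighthouse at dusk", // illustrative prompt
        size: "1024x1024",              // assumed size string; supported sizes are model-specific
        n: 1,
        responseFormat: "b64_json",     // "b64_json" when saving to a file, "url" otherwise
      });
      const first = result.images[0];
      if (first?.b64Json) {
        writeFileSync("lighthouse.png", Buffer.from(first.b64Json, "base64"));
      } else if (first?.url) {
        console.log(first.url);
      }
    }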
11151
13366
  // src/cli/models-command.ts
11152
13367
  var import_chalk8 = __toESM(require("chalk"), 1);
11153
13368
  init_model_shortcuts();
11154
13369
  async function handleModelsCommand(options, env) {
11155
13370
  const client = env.createClient();
11156
- const models = client.modelRegistry.listModels(options.provider);
13371
+ const showText = options.all || options.text || !options.image && !options.speech;
13372
+ const showImage = options.all || options.image;
13373
+ const showSpeech = options.all || options.speech;
13374
+ const textModels = showText ? client.modelRegistry.listModels(options.provider) : [];
13375
+ const imageModels = showImage ? client.image.listModels().filter((m) => !options.provider || m.provider === options.provider) : [];
13376
+ const speechModels = showSpeech ? client.speech.listModels().filter((m) => !options.provider || m.provider === options.provider) : [];
11157
13377
  if (options.format === "json") {
11158
- renderJSON(models, env.stdout);
13378
+ renderJSON(textModels, imageModels, speechModels, env.stdout);
11159
13379
  } else {
11160
- renderTable(models, options.verbose || false, env.stdout);
13380
+ renderAllTables(textModels, imageModels, speechModels, options.verbose || false, env.stdout);
13381
+ }
13382
+ }
13383
+ function renderAllTables(textModels, imageModels, speechModels, verbose, stream2) {
13384
+ const hasAnyModels = textModels.length > 0 || imageModels.length > 0 || speechModels.length > 0;
13385
+ if (!hasAnyModels) {
13386
+ stream2.write(import_chalk8.default.yellow("\nNo models found matching the specified criteria.\n\n"));
13387
+ return;
13388
+ }
13389
+ stream2.write(import_chalk8.default.bold.cyan("\nAvailable Models\n"));
13390
+ stream2.write(import_chalk8.default.cyan("=".repeat(80)) + "\n\n");
13391
+ if (textModels.length > 0) {
13392
+ renderTextTable(textModels, verbose, stream2);
13393
+ }
13394
+ if (imageModels.length > 0) {
13395
+ renderImageTable(imageModels, verbose, stream2);
13396
+ }
13397
+ if (speechModels.length > 0) {
13398
+ renderSpeechTable(speechModels, verbose, stream2);
13399
+ }
13400
+ if (textModels.length > 0) {
13401
+ stream2.write(import_chalk8.default.bold.magenta("Model Shortcuts\n"));
13402
+ stream2.write(import_chalk8.default.dim("\u2500".repeat(80)) + "\n");
13403
+ const shortcuts = Object.entries(MODEL_ALIASES).sort((a, b) => a[0].localeCompare(b[0]));
13404
+ for (const [shortcut, fullName] of shortcuts) {
13405
+ stream2.write(import_chalk8.default.cyan(` ${shortcut.padEnd(15)}`) + import_chalk8.default.dim(" \u2192 ") + import_chalk8.default.white(fullName) + "\n");
13406
+ }
13407
+ stream2.write("\n");
11161
13408
  }
11162
13409
  }
11163
- function renderTable(models, verbose, stream2) {
13410
+ function renderTextTable(models, verbose, stream2) {
11164
13411
  const grouped = /* @__PURE__ */ new Map();
11165
13412
  for (const model of models) {
11166
13413
  const provider = model.provider;
@@ -11169,13 +13416,13 @@ function renderTable(models, verbose, stream2) {
11169
13416
  }
11170
13417
  grouped.get(provider).push(model);
11171
13418
  }
11172
- stream2.write(import_chalk8.default.bold.cyan("\nAvailable Models\n"));
11173
- stream2.write(import_chalk8.default.cyan("=".repeat(80)) + "\n\n");
13419
+ stream2.write(import_chalk8.default.bold.blue("\u{1F4DD} Text/LLM Models\n"));
13420
+ stream2.write(import_chalk8.default.dim("\u2500".repeat(80)) + "\n\n");
11174
13421
  const providers = Array.from(grouped.keys()).sort();
11175
13422
  for (const provider of providers) {
11176
13423
  const providerModels = grouped.get(provider);
11177
13424
  const providerName = provider.charAt(0).toUpperCase() + provider.slice(1);
11178
- stream2.write(import_chalk8.default.bold.yellow(`${providerName} Models
13425
+ stream2.write(import_chalk8.default.bold.yellow(`${providerName}
11179
13426
  `));
11180
13427
  if (verbose) {
11181
13428
  renderVerboseTable(providerModels, stream2);
@@ -11184,13 +13431,6 @@ function renderTable(models, verbose, stream2) {
11184
13431
  }
11185
13432
  stream2.write("\n");
11186
13433
  }
11187
- stream2.write(import_chalk8.default.bold.magenta("Model Shortcuts\n"));
11188
- stream2.write(import_chalk8.default.dim("\u2500".repeat(80)) + "\n");
11189
- const shortcuts = Object.entries(MODEL_ALIASES).sort((a, b) => a[0].localeCompare(b[0]));
11190
- for (const [shortcut, fullName] of shortcuts) {
11191
- stream2.write(import_chalk8.default.cyan(` ${shortcut.padEnd(15)}`) + import_chalk8.default.dim(" \u2192 ") + import_chalk8.default.white(fullName) + "\n");
11192
- }
11193
- stream2.write("\n");
11194
13434
  }
11195
13435
  function renderCompactTable(models, stream2) {
11196
13436
  const idWidth = 25;
@@ -11267,9 +13507,171 @@ function renderVerboseTable(models, stream2) {
11267
13507
  }
11268
13508
  stream2.write("\n");
11269
13509
  }
11270
- function renderJSON(models, stream2) {
11271
- const output = {
11272
- models: models.map((model) => ({
13510
+ function renderImageTable(models, verbose, stream2) {
13511
+ stream2.write(import_chalk8.default.bold.green("\u{1F3A8} Image Generation Models\n"));
13512
+ stream2.write(import_chalk8.default.dim("\u2500".repeat(80)) + "\n\n");
13513
+ const grouped = /* @__PURE__ */ new Map();
13514
+ for (const model of models) {
13515
+ if (!grouped.has(model.provider)) {
13516
+ grouped.set(model.provider, []);
13517
+ }
13518
+ grouped.get(model.provider).push(model);
13519
+ }
13520
+ for (const [provider, providerModels] of Array.from(grouped.entries()).sort()) {
13521
+ const providerName = provider.charAt(0).toUpperCase() + provider.slice(1);
13522
+ stream2.write(import_chalk8.default.bold.yellow(`${providerName}
13523
+ `));
13524
+ if (verbose) {
13525
+ for (const model of providerModels) {
13526
+ stream2.write(import_chalk8.default.bold.green(`
13527
+ ${model.modelId}
13528
+ `));
13529
+ stream2.write(import_chalk8.default.dim(" " + "\u2500".repeat(60)) + "\n");
13530
+ stream2.write(` ${import_chalk8.default.dim("Name:")} ${import_chalk8.default.white(model.displayName)}
13531
+ `);
13532
+ stream2.write(` ${import_chalk8.default.dim("Sizes:")} ${import_chalk8.default.yellow(model.supportedSizes.join(", "))}
13533
+ `);
13534
+ if (model.supportedQualities) {
13535
+ stream2.write(` ${import_chalk8.default.dim("Qualities:")} ${import_chalk8.default.yellow(model.supportedQualities.join(", "))}
13536
+ `);
13537
+ }
13538
+ stream2.write(` ${import_chalk8.default.dim("Max Images:")} ${import_chalk8.default.yellow(model.maxImages.toString())}
13539
+ `);
13540
+ stream2.write(` ${import_chalk8.default.dim("Pricing:")} ${import_chalk8.default.cyan(formatImagePrice(model))}
13541
+ `);
13542
+ if (model.features) {
13543
+ const features = [];
13544
+ if (model.features.textRendering) features.push("text-rendering");
13545
+ if (model.features.transparency) features.push("transparency");
13546
+ if (model.features.conversational) features.push("conversational");
13547
+ if (features.length > 0) {
13548
+ stream2.write(` ${import_chalk8.default.dim("Features:")} ${import_chalk8.default.blue(features.join(", "))}
13549
+ `);
13550
+ }
13551
+ }
13552
+ }
13553
+ } else {
13554
+ const idWidth = 32;
13555
+ const nameWidth = 25;
13556
+ const sizesWidth = 20;
13557
+ const priceWidth = 15;
13558
+ stream2.write(import_chalk8.default.dim("\u2500".repeat(idWidth + nameWidth + sizesWidth + priceWidth + 6)) + "\n");
13559
+ stream2.write(
13560
+ import_chalk8.default.bold(
13561
+ "Model ID".padEnd(idWidth) + " " + "Display Name".padEnd(nameWidth) + " " + "Sizes".padEnd(sizesWidth) + " " + "Price".padEnd(priceWidth)
13562
+ ) + "\n"
13563
+ );
13564
+ stream2.write(import_chalk8.default.dim("\u2500".repeat(idWidth + nameWidth + sizesWidth + priceWidth + 6)) + "\n");
13565
+ for (const model of providerModels) {
13566
+ const sizes = model.supportedSizes.length > 2 ? model.supportedSizes.slice(0, 2).join(", ") + "..." : model.supportedSizes.join(", ");
13567
+ stream2.write(
13568
+ import_chalk8.default.green(model.modelId.padEnd(idWidth)) + " " + import_chalk8.default.white(model.displayName.substring(0, nameWidth - 1).padEnd(nameWidth)) + " " + import_chalk8.default.yellow(sizes.padEnd(sizesWidth)) + " " + import_chalk8.default.cyan(formatImagePrice(model).padEnd(priceWidth)) + "\n"
13569
+ );
13570
+ }
13571
+ stream2.write(import_chalk8.default.dim("\u2500".repeat(idWidth + nameWidth + sizesWidth + priceWidth + 6)) + "\n");
13572
+ }
13573
+ stream2.write("\n");
13574
+ }
13575
+ }
13576
+ function renderSpeechTable(models, verbose, stream2) {
13577
+ stream2.write(import_chalk8.default.bold.magenta("\u{1F3A4} Speech (TTS) Models\n"));
13578
+ stream2.write(import_chalk8.default.dim("\u2500".repeat(80)) + "\n\n");
13579
+ const grouped = /* @__PURE__ */ new Map();
13580
+ for (const model of models) {
13581
+ if (!grouped.has(model.provider)) {
13582
+ grouped.set(model.provider, []);
13583
+ }
13584
+ grouped.get(model.provider).push(model);
13585
+ }
13586
+ for (const [provider, providerModels] of Array.from(grouped.entries()).sort()) {
13587
+ const providerName = provider.charAt(0).toUpperCase() + provider.slice(1);
13588
+ stream2.write(import_chalk8.default.bold.yellow(`${providerName}
13589
+ `));
13590
+ if (verbose) {
13591
+ for (const model of providerModels) {
13592
+ stream2.write(import_chalk8.default.bold.green(`
13593
+ ${model.modelId}
13594
+ `));
13595
+ stream2.write(import_chalk8.default.dim(" " + "\u2500".repeat(60)) + "\n");
13596
+ stream2.write(` ${import_chalk8.default.dim("Name:")} ${import_chalk8.default.white(model.displayName)}
13597
+ `);
13598
+ stream2.write(` ${import_chalk8.default.dim("Voices:")} ${import_chalk8.default.yellow(model.voices.length.toString())} voices
13599
+ `);
13600
+ if (model.voices.length <= 6) {
13601
+ stream2.write(` ${import_chalk8.default.dim(model.voices.join(", "))}
13602
+ `);
13603
+ } else {
13604
+ stream2.write(` ${import_chalk8.default.dim(model.voices.slice(0, 6).join(", ") + "...")}
13605
+ `);
13606
+ }
13607
+ stream2.write(` ${import_chalk8.default.dim("Formats:")} ${import_chalk8.default.yellow(model.formats.join(", "))}
13608
+ `);
13609
+ stream2.write(` ${import_chalk8.default.dim("Max Input:")} ${import_chalk8.default.yellow(model.maxInputLength.toString())} chars
13610
+ `);
13611
+ stream2.write(` ${import_chalk8.default.dim("Pricing:")} ${import_chalk8.default.cyan(formatSpeechPrice(model))}
13612
+ `);
13613
+ if (model.features) {
13614
+ const features = [];
13615
+ if (model.features.multiSpeaker) features.push("multi-speaker");
13616
+ if (model.features.voiceInstructions) features.push("voice-instructions");
13617
+ if (model.features.languages) features.push(`${model.features.languages} languages`);
13618
+ if (features.length > 0) {
13619
+ stream2.write(` ${import_chalk8.default.dim("Features:")} ${import_chalk8.default.blue(features.join(", "))}
13620
+ `);
13621
+ }
13622
+ }
13623
+ }
13624
+ } else {
13625
+ const idWidth = 30;
13626
+ const nameWidth = 28;
13627
+ const voicesWidth = 12;
13628
+ const priceWidth = 18;
13629
+ stream2.write(import_chalk8.default.dim("\u2500".repeat(idWidth + nameWidth + voicesWidth + priceWidth + 6)) + "\n");
13630
+ stream2.write(
13631
+ import_chalk8.default.bold(
13632
+ "Model ID".padEnd(idWidth) + " " + "Display Name".padEnd(nameWidth) + " " + "Voices".padEnd(voicesWidth) + " " + "Price".padEnd(priceWidth)
13633
+ ) + "\n"
13634
+ );
13635
+ stream2.write(import_chalk8.default.dim("\u2500".repeat(idWidth + nameWidth + voicesWidth + priceWidth + 6)) + "\n");
13636
+ for (const model of providerModels) {
13637
+ stream2.write(
13638
+ import_chalk8.default.green(model.modelId.padEnd(idWidth)) + " " + import_chalk8.default.white(model.displayName.substring(0, nameWidth - 1).padEnd(nameWidth)) + " " + import_chalk8.default.yellow(`${model.voices.length} voices`.padEnd(voicesWidth)) + " " + import_chalk8.default.cyan(formatSpeechPrice(model).padEnd(priceWidth)) + "\n"
13639
+ );
13640
+ }
13641
+ stream2.write(import_chalk8.default.dim("\u2500".repeat(idWidth + nameWidth + voicesWidth + priceWidth + 6)) + "\n");
13642
+ }
13643
+ stream2.write("\n");
13644
+ }
13645
+ }
13646
+ function formatImagePrice(model) {
13647
+ if (model.pricing.perImage !== void 0) {
13648
+ return `$${model.pricing.perImage.toFixed(2)}/img`;
13649
+ }
13650
+ if (model.pricing.bySize) {
13651
+ const prices = Object.values(model.pricing.bySize);
13652
+ const minPrice = Math.min(...prices.flatMap((p) => typeof p === "number" ? [p] : Object.values(p)));
13653
+ const maxPrice = Math.max(...prices.flatMap((p) => typeof p === "number" ? [p] : Object.values(p)));
13654
+ if (minPrice === maxPrice) {
13655
+ return `$${minPrice.toFixed(2)}/img`;
13656
+ }
13657
+ return `$${minPrice.toFixed(2)}-${maxPrice.toFixed(2)}`;
13658
+ }
13659
+ return "varies";
13660
+ }
13661
+ function formatSpeechPrice(model) {
13662
+ if (model.pricing.perCharacter !== void 0) {
13663
+ const perMillion = model.pricing.perCharacter * 1e6;
13664
+ return `$${perMillion.toFixed(0)}/1M chars`;
13665
+ }
13666
+ if (model.pricing.perMinute !== void 0) {
13667
+ return `~$${model.pricing.perMinute.toFixed(2)}/min`;
13668
+ }
13669
+ return "varies";
13670
+ }
13671
+ function renderJSON(textModels, imageModels, speechModels, stream2) {
13672
+ const output = {};
13673
+ if (textModels.length > 0) {
13674
+ output.textModels = textModels.map((model) => ({
11273
13675
  provider: model.provider,
11274
13676
  modelId: model.modelId,
11275
13677
  displayName: model.displayName,
@@ -11285,9 +13687,33 @@ function renderJSON(models, stream2) {
11285
13687
  knowledgeCutoff: model.knowledgeCutoff,
11286
13688
  features: model.features,
11287
13689
  metadata: model.metadata
11288
- })),
11289
- shortcuts: MODEL_ALIASES
11290
- };
13690
+ }));
13691
+ output.shortcuts = MODEL_ALIASES;
13692
+ }
13693
+ if (imageModels.length > 0) {
13694
+ output.imageModels = imageModels.map((model) => ({
13695
+ provider: model.provider,
13696
+ modelId: model.modelId,
13697
+ displayName: model.displayName,
13698
+ supportedSizes: model.supportedSizes,
13699
+ supportedQualities: model.supportedQualities,
13700
+ maxImages: model.maxImages,
13701
+ pricing: model.pricing,
13702
+ features: model.features
13703
+ }));
13704
+ }
13705
+ if (speechModels.length > 0) {
13706
+ output.speechModels = speechModels.map((model) => ({
13707
+ provider: model.provider,
13708
+ modelId: model.modelId,
13709
+ displayName: model.displayName,
13710
+ voices: model.voices,
13711
+ formats: model.formats,
13712
+ maxInputLength: model.maxInputLength,
13713
+ pricing: model.pricing,
13714
+ features: model.features
13715
+ }));
13716
+ }
11291
13717
  stream2.write(JSON.stringify(output, null, 2) + "\n");
11292
13718
  }
11293
13719
  function formatTokens2(count) {
@@ -11300,7 +13726,7 @@ function formatTokens2(count) {
11300
13726
  }
11301
13727
  }
11302
13728
  function registerModelsCommand(program, env) {
11303
- program.command(COMMANDS.models).description("List all available LLM models with pricing and capabilities.").option("--provider <name>", "Filter by provider (openai, anthropic, gemini)").option("--format <format>", "Output format: table or json", "table").option("--verbose", "Show detailed model information", false).action(
13729
+ program.command(COMMANDS.models).description("List available models with pricing and capabilities.").option("--provider <name>", "Filter by provider (openai, anthropic, gemini)").option("--format <format>", "Output format: table or json", "table").option("--verbose", "Show detailed model information", false).option("--text", "Show text/LLM models (default if no type specified)").option("--image", "Show image generation models").option("--speech", "Show speech/TTS models").option("--all", "Show all model types (text, image, speech)").action(
11304
13730
  (options) => executeAction(
11305
13731
  () => handleModelsCommand(options, env),
11306
13732
  env
@@ -11308,6 +13734,96 @@ function registerModelsCommand(program, env) {
11308
13734
  );
11309
13735
  }
11310
13736
 
13737
+ // src/cli/speech-command.ts
13738
+ var import_node_fs12 = require("fs");
13739
+ var DEFAULT_SPEECH_MODEL = "tts-1";
13740
+ var DEFAULT_VOICE = "nova";
13741
+ async function executeSpeech(textArg, options, env) {
13742
+ const text3 = await resolvePrompt(textArg, env);
13743
+ const client = env.createClient();
13744
+ const model = options.model;
13745
+ const voice = options.voice ?? DEFAULT_VOICE;
13746
+ const speed = options.speed ? Number.parseFloat(options.speed) : void 0;
13747
+ const stderrTTY = env.stderr.isTTY === true;
13748
+ if (!options.quiet && stderrTTY) {
13749
+ env.stderr.write(`${SUMMARY_PREFIX} Generating speech with ${model} (voice: ${voice})...
13750
+ `);
13751
+ }
13752
+ const result = await client.speech.generate({
13753
+ model,
13754
+ input: text3,
13755
+ voice,
13756
+ responseFormat: options.format,
13757
+ speed
13758
+ });
13759
+ const audioBuffer = Buffer.from(result.audio);
13760
+ if (options.output) {
13761
+ (0, import_node_fs12.writeFileSync)(options.output, audioBuffer);
13762
+ if (!options.quiet) {
13763
+ env.stderr.write(`${SUMMARY_PREFIX} Audio saved to ${options.output}
13764
+ `);
13765
+ }
13766
+ } else {
13767
+ env.stdout.write(audioBuffer);
13768
+ }
13769
+ if (!options.quiet && stderrTTY) {
13770
+ const parts = [
13771
+ `${result.usage.characterCount} characters`,
13772
+ `format: ${result.format}`
13773
+ ];
13774
+ if (result.cost !== void 0) {
13775
+ parts.push(`cost: ${formatCost(result.cost)}`);
13776
+ }
13777
+ env.stderr.write(`${SUMMARY_PREFIX} ${parts.join(" | ")}
13778
+ `);
13779
+ }
13780
+ }
13781
+ function registerSpeechCommand(program, env, config) {
13782
+ program.command(COMMANDS.speech).description("Generate speech audio from text.").argument("[text]", "Text to convert to speech. If omitted, stdin is used when available.").option(
13783
+ OPTION_FLAGS.model,
13784
+ OPTION_DESCRIPTIONS.model,
13785
+ config?.model ?? DEFAULT_SPEECH_MODEL
13786
+ ).option(OPTION_FLAGS.voice, OPTION_DESCRIPTIONS.voice, config?.voice ?? DEFAULT_VOICE).option(OPTION_FLAGS.speechFormat, OPTION_DESCRIPTIONS.speechFormat, config?.format).option(OPTION_FLAGS.speechSpeed, OPTION_DESCRIPTIONS.speechSpeed, config?.speed?.toString()).option(OPTION_FLAGS.speechOutput, OPTION_DESCRIPTIONS.speechOutput, config?.output).option(OPTION_FLAGS.quiet, OPTION_DESCRIPTIONS.quiet, config?.quiet ?? false).action(
13787
+ (text3, options) => executeAction(() => executeSpeech(text3, options, env), env)
13788
+ );
13789
+ }
13790
+
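executeSpeech above follows the same pattern for text-to-speech: one client.speech.generate call, after which the returned audio bytes are either written to the output path or streamed to stdout. A minimal sketch under the same assumptions (pre-built client, illustrative values; the accepted format strings are not listed in this diff):

    // Sketch only: mirrors the speech.generate call made by the new `speech` command.
    import { writeFileSync } from "node:fs";

    async function speechExample(client: { speech: { generate: (req: object) => Promise<any> } }) {
      const result = await client.speech.generate({
        model: "tts-1",        // default speech model in this diff
        input: "Hello from llmist.",
        voice: "nova",         // default voice in this diff
        responseFormat: "mp3", // assumed format value
        speed: 1.25,           // the config validator accepts 0.25-4
      });
      writeFileSync("hello.mp3", Buffer.from(result.audio)); // result.audio holds the audio bytes
      console.log(result.usage.characterCount, result.format, result.cost);
    }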
13791
+ // src/cli/vision-command.ts
13792
+ init_model_shortcuts();
13793
+ async function executeVision(imagePath, options, env) {
13794
+ const client = env.createClient();
13795
+ const model = resolveModel(options.model);
13796
+ const imageBuffer = await readFileBuffer(imagePath);
13797
+ const prompt = options.prompt ?? "Describe this image in detail.";
13798
+ const stderrTTY = env.stderr.isTTY === true;
13799
+ if (!options.quiet && stderrTTY) {
13800
+ env.stderr.write(`${SUMMARY_PREFIX} Analyzing image with ${model}...
13801
+ `);
13802
+ }
13803
+ const result = await client.vision.analyze({
13804
+ model,
13805
+ image: imageBuffer,
13806
+ prompt,
13807
+ maxTokens: options.maxTokens
13808
+ });
13809
+ env.stdout.write(result);
13810
+ env.stdout.write("\n");
13811
+ }
13812
+ function registerVisionCommand(program, env) {
13813
+ program.command(COMMANDS.vision ?? "vision").description("Analyze an image using vision-capable models").argument("<image>", "Path to image file to analyze").option(
13814
+ OPTION_FLAGS.model,
13815
+ OPTION_DESCRIPTIONS.model,
13816
+ "gpt-4o"
13817
+ // Default to a vision-capable model
13818
+ ).option("-p, --prompt <prompt>", "Analysis prompt describing what to extract or describe").option(
13819
+ OPTION_FLAGS.maxTokens,
13820
+ OPTION_DESCRIPTIONS.maxTokens,
13821
+ createNumericParser({ label: "Max tokens", integer: true, min: 1 })
13822
+ ).option(OPTION_FLAGS.quiet, OPTION_DESCRIPTIONS.quiet).action(
13823
+ (imagePath, options) => executeAction(() => executeVision(imagePath, options, env), env)
13824
+ );
13825
+ }
13826
+
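The vision command is the thinnest wrapper of the three: executeVision reads the image file, calls client.vision.analyze, and prints the returned string. A sketch with the same caveats (client construction and exact option types are not shown in this diff; the file name and token limit are illustrative):

    // Sketch only: mirrors the vision.analyze call made by the new `vision` command.
    import { readFileSync } from "node:fs";

    async function visionExample(client: { vision: { analyze: (req: object) => Promise<string> } }) {
      const description = await client.vision.analyze({
        model: "gpt-4o",                          // default of the new `vision` command
        image: readFileSync("photo.jpg"),         // image bytes as a Buffer
        prompt: "Describe this image in detail.", // default prompt in this diff
        maxTokens: 512,                           // illustrative; must be an integer >= 1
      });
      console.log(description);
    }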
11311
13827
  // src/cli/environment.ts
11312
13828
  var import_node_readline = __toESM(require("readline"), 1);
11313
13829
  var import_chalk9 = __toESM(require("chalk"), 1);
@@ -11353,7 +13869,7 @@ function createLoggerFactory(config) {
11353
13869
  }
11354
13870
  function createPromptFunction(stdin, stdout) {
11355
13871
  return (question) => {
11356
- return new Promise((resolve2) => {
13872
+ return new Promise((resolve3) => {
11357
13873
  const rl = import_node_readline.default.createInterface({
11358
13874
  input: stdin,
11359
13875
  output: stdout
@@ -11368,7 +13884,7 @@ function createPromptFunction(stdin, stdout) {
11368
13884
  `);
11369
13885
  rl.question(import_chalk9.default.green.bold("You: "), (answer) => {
11370
13886
  rl.close();
11371
- resolve2(answer);
13887
+ resolve3(answer);
11372
13888
  });
11373
13889
  });
11374
13890
  };
@@ -11459,6 +13975,9 @@ function createProgram(env, config) {
11459
13975
  });
11460
13976
  registerCompleteCommand(program, env, config?.complete);
11461
13977
  registerAgentCommand(program, env, config?.agent);
13978
+ registerImageCommand(program, env, config?.image);
13979
+ registerSpeechCommand(program, env, config?.speech);
13980
+ registerVisionCommand(program, env);
11462
13981
  registerModelsCommand(program, env);
11463
13982
  registerGadgetCommand(program, env);
11464
13983
  if (config) {