llmist 2.4.0 → 2.5.0

This diff shows the content of publicly available package versions released to one of the supported registries; it is provided for informational purposes only and reflects the changes between versions as they appear in those registries.
package/dist/index.cjs CHANGED
@@ -45,6 +45,158 @@ var init_constants = __esm({
45
45
  }
46
46
  });
47
47
 
48
+ // src/core/input-content.ts
49
+ function isTextPart(part) {
50
+ return part.type === "text";
51
+ }
52
+ function isImagePart(part) {
53
+ return part.type === "image";
54
+ }
55
+ function isAudioPart(part) {
56
+ return part.type === "audio";
57
+ }
58
+ function text(content) {
59
+ return { type: "text", text: content };
60
+ }
61
+ function imageFromBase64(data, mediaType) {
62
+ return {
63
+ type: "image",
64
+ source: { type: "base64", mediaType, data }
65
+ };
66
+ }
67
+ function imageFromUrl(url) {
68
+ return {
69
+ type: "image",
70
+ source: { type: "url", url }
71
+ };
72
+ }
73
+ function detectImageMimeType(data) {
74
+ const bytes = data instanceof Buffer ? data : Buffer.from(data);
75
+ for (const { bytes: magic, mimeType } of IMAGE_MAGIC_BYTES) {
76
+ if (bytes.length >= magic.length) {
77
+ let matches = true;
78
+ for (let i = 0; i < magic.length; i++) {
79
+ if (bytes[i] !== magic[i]) {
80
+ matches = false;
81
+ break;
82
+ }
83
+ }
84
+ if (matches) {
85
+ if (mimeType === "image/webp") {
86
+ if (bytes.length >= 12) {
87
+ const webpMarker = bytes[8] === 87 && bytes[9] === 69 && bytes[10] === 66 && bytes[11] === 80;
88
+ if (!webpMarker) continue;
89
+ }
90
+ }
91
+ return mimeType;
92
+ }
93
+ }
94
+ }
95
+ return null;
96
+ }
97
+ function detectAudioMimeType(data) {
98
+ const bytes = data instanceof Buffer ? data : Buffer.from(data);
99
+ for (const { bytes: magic, mimeType } of AUDIO_MAGIC_BYTES) {
100
+ if (bytes.length >= magic.length) {
101
+ let matches = true;
102
+ for (let i = 0; i < magic.length; i++) {
103
+ if (bytes[i] !== magic[i]) {
104
+ matches = false;
105
+ break;
106
+ }
107
+ }
108
+ if (matches) {
109
+ if (mimeType === "audio/wav") {
110
+ if (bytes.length >= 12) {
111
+ const waveMarker = bytes[8] === 87 && bytes[9] === 65 && bytes[10] === 86 && bytes[11] === 69;
112
+ if (!waveMarker) continue;
113
+ }
114
+ }
115
+ return mimeType;
116
+ }
117
+ }
118
+ }
119
+ return null;
120
+ }
121
+ function toBase64(data) {
122
+ if (typeof data === "string") {
123
+ return data;
124
+ }
125
+ return Buffer.from(data).toString("base64");
126
+ }
127
+ function imageFromBuffer(buffer, mediaType) {
128
+ const detectedType = mediaType ?? detectImageMimeType(buffer);
129
+ if (!detectedType) {
130
+ throw new Error(
131
+ "Could not detect image MIME type. Please provide the mediaType parameter explicitly."
132
+ );
133
+ }
134
+ return {
135
+ type: "image",
136
+ source: {
137
+ type: "base64",
138
+ mediaType: detectedType,
139
+ data: toBase64(buffer)
140
+ }
141
+ };
142
+ }
143
+ function audioFromBase64(data, mediaType) {
144
+ return {
145
+ type: "audio",
146
+ source: { type: "base64", mediaType, data }
147
+ };
148
+ }
149
+ function audioFromBuffer(buffer, mediaType) {
150
+ const detectedType = mediaType ?? detectAudioMimeType(buffer);
151
+ if (!detectedType) {
152
+ throw new Error(
153
+ "Could not detect audio MIME type. Please provide the mediaType parameter explicitly."
154
+ );
155
+ }
156
+ return {
157
+ type: "audio",
158
+ source: {
159
+ type: "base64",
160
+ mediaType: detectedType,
161
+ data: toBase64(buffer)
162
+ }
163
+ };
164
+ }
165
+ function isDataUrl(input) {
166
+ return input.startsWith("data:");
167
+ }
168
+ function parseDataUrl(url) {
169
+ const match = url.match(/^data:([^;]+);base64,(.+)$/);
170
+ if (!match) return null;
171
+ return { mimeType: match[1], data: match[2] };
172
+ }
173
+ var IMAGE_MAGIC_BYTES, AUDIO_MAGIC_BYTES;
174
+ var init_input_content = __esm({
175
+ "src/core/input-content.ts"() {
176
+ "use strict";
177
+ IMAGE_MAGIC_BYTES = [
178
+ { bytes: [255, 216, 255], mimeType: "image/jpeg" },
179
+ { bytes: [137, 80, 78, 71], mimeType: "image/png" },
180
+ { bytes: [71, 73, 70, 56], mimeType: "image/gif" },
181
+ // WebP starts with RIFF....WEBP
182
+ { bytes: [82, 73, 70, 70], mimeType: "image/webp" }
183
+ ];
184
+ AUDIO_MAGIC_BYTES = [
185
+ // MP3 frame sync
186
+ { bytes: [255, 251], mimeType: "audio/mp3" },
187
+ { bytes: [255, 250], mimeType: "audio/mp3" },
188
+ // ID3 tag (MP3)
189
+ { bytes: [73, 68, 51], mimeType: "audio/mp3" },
190
+ // OGG
191
+ { bytes: [79, 103, 103, 83], mimeType: "audio/ogg" },
192
+ // WAV (RIFF)
193
+ { bytes: [82, 73, 70, 70], mimeType: "audio/wav" },
194
+ // WebM
195
+ { bytes: [26, 69, 223, 163], mimeType: "audio/webm" }
196
+ ];
197
+ }
198
+ });
199
+
48
200
  // src/core/model-shortcuts.ts
49
201
  function isKnownModelPattern(model) {
50
202
  const normalized = model.toLowerCase();
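A minimal usage sketch for the content helpers added above (the imports assume the public exports listed later in this diff; the file path is a placeholder):

```typescript
import { readFile } from "node:fs/promises";
import { text, imageFromBuffer, imageFromUrl, detectImageMimeType } from "llmist";

const photo = await readFile("photo.jpg");           // placeholder path
console.log(detectImageMimeType(photo));             // e.g. "image/jpeg" when JPEG magic bytes match

const parts = [
  text("Compare these two images:"),
  imageFromBuffer(photo),                            // base64 source, MIME type auto-detected
  imageFromUrl("https://example.com/other.jpg"),     // URL source (provider support varies; see the provider hunks below)
];
console.log(parts.length);
```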
@@ -402,7 +554,9 @@ var init_prompt_config = __esm({
402
554
  rules: () => [
403
555
  "Output ONLY plain text with the exact markers - never use function/tool calling",
404
556
  "You can invoke multiple gadgets in a single response",
405
- "For dependent gadgets, invoke the first one and wait for the result"
557
+ "Gadgets without dependencies execute immediately (in parallel if multiple)",
558
+ "Use :invocation_id:dep1,dep2 syntax when a gadget needs results from prior gadgets",
559
+ "If any dependency fails, dependent gadgets are automatically skipped"
406
560
  ],
407
561
  customExamples: null
408
562
  };
@@ -410,11 +564,24 @@ var init_prompt_config = __esm({
410
564
  });
411
565
 
412
566
  // src/core/messages.ts
567
+ function normalizeContent(content) {
568
+ if (typeof content === "string") {
569
+ return [{ type: "text", text: content }];
570
+ }
571
+ return content;
572
+ }
573
+ function extractText(content) {
574
+ if (typeof content === "string") {
575
+ return content;
576
+ }
577
+ return content.filter((part) => part.type === "text").map((part) => part.text).join("");
578
+ }
413
579
  var LLMMessageBuilder;
414
580
  var init_messages = __esm({
415
581
  "src/core/messages.ts"() {
416
582
  "use strict";
417
583
  init_constants();
584
+ init_input_content();
418
585
  init_prompt_config();
419
586
  LLMMessageBuilder = class {
420
587
  messages = [];
@@ -516,6 +683,10 @@ CRITICAL: ${criticalUsage}
516
683
  parts.push(`
517
684
  1. Start marker: ${this.startPrefix}gadget_name`);
518
685
  parts.push(`
686
+ With ID: ${this.startPrefix}gadget_name:my_id`);
687
+ parts.push(`
688
+ With dependencies: ${this.startPrefix}gadget_name:my_id:dep1,dep2`);
689
+ parts.push(`
519
690
  2. ${formatDescription}`);
520
691
  parts.push(`
521
692
  3. End marker: ${this.endPrefix}`);
@@ -565,6 +736,25 @@ ${this.endPrefix}`;
565
736
  EXAMPLE (Multiple Gadgets):
566
737
 
567
738
  ${multipleExample}`);
739
+ const dependencyExample = `${this.startPrefix}fetch_data:fetch_1
740
+ ${this.argPrefix}url
741
+ https://api.example.com/users
742
+ ${this.endPrefix}
743
+ ${this.startPrefix}fetch_data:fetch_2
744
+ ${this.argPrefix}url
745
+ https://api.example.com/orders
746
+ ${this.endPrefix}
747
+ ${this.startPrefix}merge_data:merge_1:fetch_1,fetch_2
748
+ ${this.argPrefix}format
749
+ json
750
+ ${this.endPrefix}`;
751
+ parts.push(`
752
+
753
+ EXAMPLE (With Dependencies):
754
+ merge_1 waits for fetch_1 AND fetch_2 to complete.
755
+ If either fails, merge_1 is automatically skipped.
756
+
757
+ ${dependencyExample}`);
568
758
  parts.push(`
569
759
 
570
760
  BLOCK FORMAT SYNTAX:
@@ -615,6 +805,25 @@ Produces: { "items": ["first", "second"] }`);
615
805
  }
616
806
  return parts.join("");
617
807
  }
808
+ /**
809
+ * Add a user message.
810
+ * Content can be a string (text only) or an array of content parts (multimodal).
811
+ *
812
+ * @param content - Message content
813
+ * @param metadata - Optional metadata
814
+ *
815
+ * @example
816
+ * ```typescript
817
+ * // Text only
818
+ * builder.addUser("Hello!");
819
+ *
820
+ * // Multimodal
821
+ * builder.addUser([
822
+ * text("What's in this image?"),
823
+ * imageFromBuffer(imageData),
824
+ * ]);
825
+ * ```
826
+ */
618
827
  addUser(content, metadata) {
619
828
  this.messages.push({ role: "user", content, metadata });
620
829
  return this;
@@ -623,6 +832,104 @@ Produces: { "items": ["first", "second"] }`);
623
832
  this.messages.push({ role: "assistant", content, metadata });
624
833
  return this;
625
834
  }
835
+ /**
836
+ * Add a user message with an image attachment.
837
+ *
838
+ * @param textContent - Text prompt
839
+ * @param imageData - Image data (Buffer, Uint8Array, or base64 string)
840
+ * @param mimeType - Optional MIME type (auto-detected if not provided)
841
+ *
842
+ * @example
843
+ * ```typescript
844
+ * builder.addUserWithImage(
845
+ * "What's in this image?",
846
+ * await fs.readFile("photo.jpg"),
847
+ * "image/jpeg" // Optional - auto-detected
848
+ * );
849
+ * ```
850
+ */
851
+ addUserWithImage(textContent, imageData, mimeType) {
852
+ const imageBuffer = typeof imageData === "string" ? Buffer.from(imageData, "base64") : imageData;
853
+ const detectedMime = mimeType ?? detectImageMimeType(imageBuffer);
854
+ if (!detectedMime) {
855
+ throw new Error(
856
+ "Could not detect image MIME type. Please provide the mimeType parameter explicitly."
857
+ );
858
+ }
859
+ const content = [
860
+ text(textContent),
861
+ {
862
+ type: "image",
863
+ source: {
864
+ type: "base64",
865
+ mediaType: detectedMime,
866
+ data: toBase64(imageBuffer)
867
+ }
868
+ }
869
+ ];
870
+ this.messages.push({ role: "user", content });
871
+ return this;
872
+ }
873
+ /**
874
+ * Add a user message with an image URL (OpenAI only).
875
+ *
876
+ * @param textContent - Text prompt
877
+ * @param imageUrl - URL to the image
878
+ *
879
+ * @example
880
+ * ```typescript
881
+ * builder.addUserWithImageUrl(
882
+ * "What's in this image?",
883
+ * "https://example.com/image.jpg"
884
+ * );
885
+ * ```
886
+ */
887
+ addUserWithImageUrl(textContent, imageUrl) {
888
+ const content = [text(textContent), imageFromUrl(imageUrl)];
889
+ this.messages.push({ role: "user", content });
890
+ return this;
891
+ }
892
+ /**
893
+ * Add a user message with an audio attachment (Gemini only).
894
+ *
895
+ * @param textContent - Text prompt
896
+ * @param audioData - Audio data (Buffer, Uint8Array, or base64 string)
897
+ * @param mimeType - Optional MIME type (auto-detected if not provided)
898
+ *
899
+ * @example
900
+ * ```typescript
901
+ * builder.addUserWithAudio(
902
+ * "Transcribe this audio",
903
+ * await fs.readFile("recording.mp3"),
904
+ * "audio/mp3" // Optional - auto-detected
905
+ * );
906
+ * ```
907
+ */
908
+ addUserWithAudio(textContent, audioData, mimeType) {
909
+ const audioBuffer = typeof audioData === "string" ? Buffer.from(audioData, "base64") : audioData;
910
+ const content = [text(textContent), audioFromBuffer(audioBuffer, mimeType)];
911
+ this.messages.push({ role: "user", content });
912
+ return this;
913
+ }
914
+ /**
915
+ * Add a user message with multiple content parts.
916
+ * Provides full flexibility for complex multimodal messages.
917
+ *
918
+ * @param parts - Array of content parts
919
+ *
920
+ * @example
921
+ * ```typescript
922
+ * builder.addUserMultimodal([
923
+ * text("Compare these images:"),
924
+ * imageFromBuffer(image1),
925
+ * imageFromBuffer(image2),
926
+ * ]);
927
+ * ```
928
+ */
929
+ addUserMultimodal(parts) {
930
+ this.messages.push({ role: "user", content: parts });
931
+ return this;
932
+ }
626
933
  addGadgetCall(gadget, parameters, result) {
627
934
  const paramStr = this.formatBlockParameters(parameters, "");
628
935
  this.messages.push({
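A hedged sketch of the new multimodal builder methods above (the `LLMMessageBuilder` export and file paths are assumptions for illustration):

```typescript
import { readFile } from "node:fs/promises";
import { LLMMessageBuilder, text, imageFromBuffer } from "llmist"; // LLMMessageBuilder export assumed

const builder = new LLMMessageBuilder()
  .addUserWithImage("What's in this image?", await readFile("photo.jpg"))     // MIME type auto-detected
  .addUserWithImageUrl("Describe this one", "https://example.com/image.jpg")  // image URLs: OpenAI only
  .addUserWithAudio("Transcribe this clip", await readFile("recording.mp3"))  // audio input: Gemini only
  .addUserMultimodal([text("Compare:"), imageFromBuffer(await readFile("chart.png"))]);
```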
@@ -1941,7 +2248,7 @@ var init_conversation_manager = __esm({
1941
2248
  if (msg.role === "user") {
1942
2249
  this.historyBuilder.addUser(msg.content);
1943
2250
  } else if (msg.role === "assistant") {
1944
- this.historyBuilder.addAssistant(msg.content);
2251
+ this.historyBuilder.addAssistant(extractText(msg.content));
1945
2252
  }
1946
2253
  }
1947
2254
  }
@@ -1962,8 +2269,10 @@ async function runWithHandlers(agentGenerator, handlers) {
1962
2269
  if (handlers.onGadgetCall) {
1963
2270
  await handlers.onGadgetCall({
1964
2271
  gadgetName: event.call.gadgetName,
2272
+ invocationId: event.call.invocationId,
1965
2273
  parameters: event.call.parameters,
1966
- parametersRaw: event.call.parametersRaw
2274
+ parametersRaw: event.call.parametersRaw,
2275
+ dependencies: event.call.dependencies
1967
2276
  });
1968
2277
  }
1969
2278
  break;
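An illustrative handler for the extended onGadgetCall payload shown above (`agent` is assumed to come from the agent builder shown later in this diff):

```typescript
import { runWithHandlers } from "llmist";

await runWithHandlers(agent.run(), {
  onGadgetCall: async ({ gadgetName, invocationId, parameters, dependencies }) => {
    console.log(`${invocationId} -> ${gadgetName}`, parameters);
    if (dependencies.length > 0) {
      console.log(`  waiting on: ${dependencies.join(", ")}`); // new in 2.5.0
    }
  },
});
```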
@@ -2840,15 +3149,37 @@ var init_parser = __esm({
2840
3149
  return segment.trim().length > 0 ? segment : void 0;
2841
3150
  }
2842
3151
  /**
2843
- * Parse gadget name, handling both old format (name:invocationId) and new format (just name).
2844
- * For new format, generates a unique invocation ID.
3152
+ * Parse gadget name with optional invocation ID and dependencies.
3153
+ *
3154
+ * Supported formats:
3155
+ * - `GadgetName` - Auto-generate ID, no dependencies
3156
+ * - `GadgetName:my_id` - Explicit ID, no dependencies
3157
+ * - `GadgetName:my_id:dep1,dep2` - Explicit ID with dependencies
3158
+ *
3159
+ * Dependencies must be comma-separated invocation IDs.
2845
3160
  */
2846
3161
  parseGadgetName(gadgetName) {
2847
- if (gadgetName.includes(":")) {
2848
- const parts = gadgetName.split(":");
2849
- return { actualName: parts[0], invocationId: parts[1] };
3162
+ const parts = gadgetName.split(":");
3163
+ if (parts.length === 1) {
3164
+ return {
3165
+ actualName: parts[0],
3166
+ invocationId: `gadget_${++globalInvocationCounter}`,
3167
+ dependencies: []
3168
+ };
3169
+ } else if (parts.length === 2) {
3170
+ return {
3171
+ actualName: parts[0],
3172
+ invocationId: parts[1].trim(),
3173
+ dependencies: []
3174
+ };
3175
+ } else {
3176
+ const deps = parts[2].split(",").map((d) => d.trim()).filter((d) => d.length > 0);
3177
+ return {
3178
+ actualName: parts[0],
3179
+ invocationId: parts[1].trim(),
3180
+ dependencies: deps
3181
+ };
2850
3182
  }
2851
- return { actualName: gadgetName, invocationId: `gadget_${++globalInvocationCounter}` };
2852
3183
  }
2853
3184
  /**
2854
3185
  * Extract the error message from a parse error.
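The three header formats documented above map to parsed fields as follows (gadget names and IDs are placeholders):

```typescript
const parsedExamples = {
  "FetchData":                         { actualName: "FetchData", invocationId: "gadget_1", dependencies: [] }, // ID auto-generated
  "FetchData:fetch_1":                 { actualName: "FetchData", invocationId: "fetch_1", dependencies: [] },
  "MergeData:merge_1:fetch_1,fetch_2": { actualName: "MergeData", invocationId: "merge_1", dependencies: ["fetch_1", "fetch_2"] },
};
```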
@@ -2884,39 +3215,20 @@ var init_parser = __esm({
2884
3215
  const metadataEndIndex = this.buffer.indexOf("\n", metadataStartIndex);
2885
3216
  if (metadataEndIndex === -1) break;
2886
3217
  const gadgetName = this.buffer.substring(metadataStartIndex, metadataEndIndex).trim();
2887
- const { actualName: actualGadgetName, invocationId } = this.parseGadgetName(gadgetName);
3218
+ const { actualName: actualGadgetName, invocationId, dependencies } = this.parseGadgetName(gadgetName);
2888
3219
  const contentStartIndex = metadataEndIndex + 1;
2889
3220
  let partEndIndex;
2890
3221
  let endMarkerLength = 0;
2891
- if (gadgetName.includes(":")) {
2892
- const oldEndMarker = `${this.endPrefix + actualGadgetName}:${invocationId}`;
2893
- partEndIndex = this.buffer.indexOf(oldEndMarker, contentStartIndex);
2894
- if (partEndIndex === -1) break;
2895
- endMarkerLength = oldEndMarker.length;
3222
+ const nextStartPos = this.buffer.indexOf(this.startPrefix, contentStartIndex);
3223
+ const endPos = this.buffer.indexOf(this.endPrefix, contentStartIndex);
3224
+ if (nextStartPos !== -1 && (endPos === -1 || nextStartPos < endPos)) {
3225
+ partEndIndex = nextStartPos;
3226
+ endMarkerLength = 0;
3227
+ } else if (endPos !== -1) {
3228
+ partEndIndex = endPos;
3229
+ endMarkerLength = this.endPrefix.length;
2896
3230
  } else {
2897
- const nextStartPos = this.buffer.indexOf(this.startPrefix, contentStartIndex);
2898
- let validEndPos = -1;
2899
- let searchPos = contentStartIndex;
2900
- while (true) {
2901
- const endPos = this.buffer.indexOf(this.endPrefix, searchPos);
2902
- if (endPos === -1) break;
2903
- const afterEnd = this.buffer.substring(endPos + this.endPrefix.length);
2904
- if (afterEnd.startsWith("\n") || afterEnd.startsWith("\r") || afterEnd.startsWith(this.startPrefix) || afterEnd.length === 0) {
2905
- validEndPos = endPos;
2906
- break;
2907
- } else {
2908
- searchPos = endPos + this.endPrefix.length;
2909
- }
2910
- }
2911
- if (nextStartPos !== -1 && (validEndPos === -1 || nextStartPos < validEndPos)) {
2912
- partEndIndex = nextStartPos;
2913
- endMarkerLength = 0;
2914
- } else if (validEndPos !== -1) {
2915
- partEndIndex = validEndPos;
2916
- endMarkerLength = this.endPrefix.length;
2917
- } else {
2918
- break;
2919
- }
3231
+ break;
2920
3232
  }
2921
3233
  const parametersRaw = this.buffer.substring(contentStartIndex, partEndIndex).trim();
2922
3234
  const { parameters, parseError } = this.parseParameters(parametersRaw);
@@ -2927,7 +3239,8 @@ var init_parser = __esm({
2927
3239
  invocationId,
2928
3240
  parametersRaw,
2929
3241
  parameters,
2930
- parseError
3242
+ parseError,
3243
+ dependencies
2931
3244
  }
2932
3245
  };
2933
3246
  startIndex = partEndIndex + endMarkerLength;
@@ -2950,7 +3263,7 @@ var init_parser = __esm({
2950
3263
  const metadataEndIndex = this.buffer.indexOf("\n", metadataStartIndex);
2951
3264
  if (metadataEndIndex !== -1) {
2952
3265
  const gadgetName = this.buffer.substring(metadataStartIndex, metadataEndIndex).trim();
2953
- const { actualName: actualGadgetName, invocationId } = this.parseGadgetName(gadgetName);
3266
+ const { actualName: actualGadgetName, invocationId, dependencies } = this.parseGadgetName(gadgetName);
2954
3267
  const contentStartIndex = metadataEndIndex + 1;
2955
3268
  const parametersRaw = this.buffer.substring(contentStartIndex).trim();
2956
3269
  const { parameters, parseError } = this.parseParameters(parametersRaw);
@@ -2961,7 +3274,8 @@ var init_parser = __esm({
2961
3274
  invocationId,
2962
3275
  parametersRaw,
2963
3276
  parameters,
2964
- parseError
3277
+ parseError,
3278
+ dependencies
2965
3279
  }
2966
3280
  };
2967
3281
  return;
@@ -3331,6 +3645,13 @@ var init_stream_processor = __esm({
3331
3645
  accumulatedText = "";
3332
3646
  shouldStopExecution = false;
3333
3647
  observerFailureCount = 0;
3648
+ // Dependency tracking for gadget execution DAG
3649
+ /** Gadgets waiting for their dependencies to complete */
3650
+ pendingGadgets = /* @__PURE__ */ new Map();
3651
+ /** Completed gadget results, keyed by invocation ID */
3652
+ completedResults = /* @__PURE__ */ new Map();
3653
+ /** Invocation IDs of gadgets that have failed (error or skipped due to dependency) */
3654
+ failedInvocations = /* @__PURE__ */ new Set();
3334
3655
  constructor(options) {
3335
3656
  this.iteration = options.iteration;
3336
3657
  this.registry = options.registry;
@@ -3431,6 +3752,16 @@ var init_stream_processor = __esm({
3431
3752
  }
3432
3753
  }
3433
3754
  }
3755
+ const finalPendingEvents = await this.processPendingGadgets();
3756
+ outputs.push(...finalPendingEvents);
3757
+ if (finalPendingEvents.some((e) => e.type === "gadget_result")) {
3758
+ didExecuteGadgets = true;
3759
+ }
3760
+ for (const evt of finalPendingEvents) {
3761
+ if (evt.type === "gadget_result" && evt.result.breaksLoop) {
3762
+ shouldBreakLoop = true;
3763
+ }
3764
+ }
3434
3765
  }
3435
3766
  let finalMessage = this.accumulatedText;
3436
3767
  if (this.hooks.interceptors?.interceptAssistantMessage) {
@@ -3482,7 +3813,11 @@ var init_stream_processor = __esm({
3482
3813
  return [{ type: "text", content }];
3483
3814
  }
3484
3815
  /**
3485
- * Process a gadget call through the full lifecycle.
3816
+ * Process a gadget call through the full lifecycle, handling dependencies.
3817
+ *
3818
+ * Gadgets without dependencies (or with all dependencies satisfied) execute immediately.
3819
+ * Gadgets with unsatisfied dependencies are queued for later execution.
3820
+ * After each execution, pending gadgets are checked to see if they can now run.
3486
3821
  */
3487
3822
  async processGadgetCall(call) {
3488
3823
  if (this.shouldStopExecution) {
@@ -3493,6 +3828,53 @@ var init_stream_processor = __esm({
3493
3828
  }
3494
3829
  const events = [];
3495
3830
  events.push({ type: "gadget_call", call });
3831
+ if (call.dependencies.length > 0) {
3832
+ if (call.dependencies.includes(call.invocationId)) {
3833
+ this.logger.warn("Gadget has self-referential dependency (depends on itself)", {
3834
+ gadgetName: call.gadgetName,
3835
+ invocationId: call.invocationId
3836
+ });
3837
+ this.failedInvocations.add(call.invocationId);
3838
+ const skipEvent = {
3839
+ type: "gadget_skipped",
3840
+ gadgetName: call.gadgetName,
3841
+ invocationId: call.invocationId,
3842
+ parameters: call.parameters ?? {},
3843
+ failedDependency: call.invocationId,
3844
+ failedDependencyError: `Gadget "${call.invocationId}" cannot depend on itself (self-referential dependency)`
3845
+ };
3846
+ events.push(skipEvent);
3847
+ return events;
3848
+ }
3849
+ const failedDep = call.dependencies.find((dep) => this.failedInvocations.has(dep));
3850
+ if (failedDep) {
3851
+ const skipEvents = await this.handleFailedDependency(call, failedDep);
3852
+ events.push(...skipEvents);
3853
+ return events;
3854
+ }
3855
+ const unsatisfied = call.dependencies.filter((dep) => !this.completedResults.has(dep));
3856
+ if (unsatisfied.length > 0) {
3857
+ this.logger.debug("Queueing gadget for later - waiting on dependencies", {
3858
+ gadgetName: call.gadgetName,
3859
+ invocationId: call.invocationId,
3860
+ waitingOn: unsatisfied
3861
+ });
3862
+ this.pendingGadgets.set(call.invocationId, call);
3863
+ return events;
3864
+ }
3865
+ }
3866
+ const executeEvents = await this.executeGadgetWithHooks(call);
3867
+ events.push(...executeEvents);
3868
+ const triggeredEvents = await this.processPendingGadgets();
3869
+ events.push(...triggeredEvents);
3870
+ return events;
3871
+ }
3872
+ /**
3873
+ * Execute a gadget through the full hook lifecycle.
3874
+ * This is the core execution logic, extracted from processGadgetCall.
3875
+ */
3876
+ async executeGadgetWithHooks(call) {
3877
+ const events = [];
3496
3878
  if (call.parseError) {
3497
3879
  this.logger.warn("Gadget has parse error", {
3498
3880
  gadgetName: call.gadgetName,
@@ -3623,6 +4005,10 @@ var init_stream_processor = __esm({
3623
4005
  });
3624
4006
  }
3625
4007
  await this.runObserversInParallel(completeObservers);
4008
+ this.completedResults.set(result.invocationId, result);
4009
+ if (result.error) {
4010
+ this.failedInvocations.add(result.invocationId);
4011
+ }
3626
4012
  events.push({ type: "gadget_result", result });
3627
4013
  if (result.error) {
3628
4014
  const errorType = this.determineErrorType(call, result);
@@ -3638,6 +4024,162 @@ var init_stream_processor = __esm({
3638
4024
  }
3639
4025
  return events;
3640
4026
  }
4027
+ /**
4028
+ * Handle a gadget that cannot execute because a dependency failed.
4029
+ * Calls the onDependencySkipped controller to allow customization.
4030
+ */
4031
+ async handleFailedDependency(call, failedDep) {
4032
+ const events = [];
4033
+ const depResult = this.completedResults.get(failedDep);
4034
+ const depError = depResult?.error ?? "Dependency failed";
4035
+ let action = { action: "skip" };
4036
+ if (this.hooks.controllers?.onDependencySkipped) {
4037
+ const context = {
4038
+ iteration: this.iteration,
4039
+ gadgetName: call.gadgetName,
4040
+ invocationId: call.invocationId,
4041
+ parameters: call.parameters ?? {},
4042
+ failedDependency: failedDep,
4043
+ failedDependencyError: depError,
4044
+ logger: this.logger
4045
+ };
4046
+ action = await this.hooks.controllers.onDependencySkipped(context);
4047
+ }
4048
+ if (action.action === "skip") {
4049
+ this.failedInvocations.add(call.invocationId);
4050
+ const skipEvent = {
4051
+ type: "gadget_skipped",
4052
+ gadgetName: call.gadgetName,
4053
+ invocationId: call.invocationId,
4054
+ parameters: call.parameters ?? {},
4055
+ failedDependency: failedDep,
4056
+ failedDependencyError: depError
4057
+ };
4058
+ events.push(skipEvent);
4059
+ if (this.hooks.observers?.onGadgetSkipped) {
4060
+ const observeContext = {
4061
+ iteration: this.iteration,
4062
+ gadgetName: call.gadgetName,
4063
+ invocationId: call.invocationId,
4064
+ parameters: call.parameters ?? {},
4065
+ failedDependency: failedDep,
4066
+ failedDependencyError: depError,
4067
+ logger: this.logger
4068
+ };
4069
+ await this.safeObserve(() => this.hooks.observers.onGadgetSkipped(observeContext));
4070
+ }
4071
+ this.logger.info("Gadget skipped due to failed dependency", {
4072
+ gadgetName: call.gadgetName,
4073
+ invocationId: call.invocationId,
4074
+ failedDependency: failedDep
4075
+ });
4076
+ } else if (action.action === "execute_anyway") {
4077
+ this.logger.info("Executing gadget despite failed dependency (controller override)", {
4078
+ gadgetName: call.gadgetName,
4079
+ invocationId: call.invocationId,
4080
+ failedDependency: failedDep
4081
+ });
4082
+ const executeEvents = await this.executeGadgetWithHooks(call);
4083
+ events.push(...executeEvents);
4084
+ } else if (action.action === "use_fallback") {
4085
+ const fallbackResult = {
4086
+ gadgetName: call.gadgetName,
4087
+ invocationId: call.invocationId,
4088
+ parameters: call.parameters ?? {},
4089
+ result: action.fallbackResult,
4090
+ executionTimeMs: 0
4091
+ };
4092
+ this.completedResults.set(call.invocationId, fallbackResult);
4093
+ events.push({ type: "gadget_result", result: fallbackResult });
4094
+ this.logger.info("Using fallback result for gadget with failed dependency", {
4095
+ gadgetName: call.gadgetName,
4096
+ invocationId: call.invocationId,
4097
+ failedDependency: failedDep
4098
+ });
4099
+ }
4100
+ return events;
4101
+ }
4102
+ /**
4103
+ * Process pending gadgets whose dependencies are now satisfied.
4104
+ * Executes ready gadgets in parallel and continues until no more can be triggered.
4105
+ */
4106
+ async processPendingGadgets() {
4107
+ const events = [];
4108
+ let progress = true;
4109
+ while (progress && this.pendingGadgets.size > 0) {
4110
+ progress = false;
4111
+ const readyToExecute = [];
4112
+ const readyToSkip = [];
4113
+ for (const [invocationId, call] of this.pendingGadgets) {
4114
+ const failedDep = call.dependencies.find((dep) => this.failedInvocations.has(dep));
4115
+ if (failedDep) {
4116
+ readyToSkip.push({ call, failedDep });
4117
+ continue;
4118
+ }
4119
+ const allSatisfied = call.dependencies.every((dep) => this.completedResults.has(dep));
4120
+ if (allSatisfied) {
4121
+ readyToExecute.push(call);
4122
+ }
4123
+ }
4124
+ for (const { call, failedDep } of readyToSkip) {
4125
+ this.pendingGadgets.delete(call.invocationId);
4126
+ const skipEvents = await this.handleFailedDependency(call, failedDep);
4127
+ events.push(...skipEvents);
4128
+ progress = true;
4129
+ }
4130
+ if (readyToExecute.length > 0) {
4131
+ this.logger.debug("Executing ready gadgets in parallel", {
4132
+ count: readyToExecute.length,
4133
+ invocationIds: readyToExecute.map((c) => c.invocationId)
4134
+ });
4135
+ for (const call of readyToExecute) {
4136
+ this.pendingGadgets.delete(call.invocationId);
4137
+ }
4138
+ const executePromises = readyToExecute.map((call) => this.executeGadgetWithHooks(call));
4139
+ const results = await Promise.all(executePromises);
4140
+ for (const executeEvents of results) {
4141
+ events.push(...executeEvents);
4142
+ }
4143
+ progress = true;
4144
+ }
4145
+ }
4146
+ if (this.pendingGadgets.size > 0) {
4147
+ const pendingIds = new Set(this.pendingGadgets.keys());
4148
+ for (const [invocationId, call] of this.pendingGadgets) {
4149
+ const missingDeps = call.dependencies.filter((dep) => !this.completedResults.has(dep));
4150
+ const circularDeps = missingDeps.filter((dep) => pendingIds.has(dep));
4151
+ const trulyMissingDeps = missingDeps.filter((dep) => !pendingIds.has(dep));
4152
+ let errorMessage;
4153
+ let logLevel = "warn";
4154
+ if (circularDeps.length > 0 && trulyMissingDeps.length > 0) {
4155
+ errorMessage = `Dependencies unresolvable: circular=[${circularDeps.join(", ")}], missing=[${trulyMissingDeps.join(", ")}]`;
4156
+ logLevel = "error";
4157
+ } else if (circularDeps.length > 0) {
4158
+ errorMessage = `Circular dependency detected: "${invocationId}" depends on "${circularDeps[0]}" which also depends on "${invocationId}" (directly or indirectly)`;
4159
+ } else {
4160
+ errorMessage = `Dependency "${missingDeps[0]}" was never executed - check that the invocation ID exists and is spelled correctly`;
4161
+ }
4162
+ this.logger[logLevel]("Gadget has unresolvable dependencies", {
4163
+ gadgetName: call.gadgetName,
4164
+ invocationId,
4165
+ circularDependencies: circularDeps,
4166
+ missingDependencies: trulyMissingDeps
4167
+ });
4168
+ this.failedInvocations.add(invocationId);
4169
+ const skipEvent = {
4170
+ type: "gadget_skipped",
4171
+ gadgetName: call.gadgetName,
4172
+ invocationId,
4173
+ parameters: call.parameters ?? {},
4174
+ failedDependency: missingDeps[0],
4175
+ failedDependencyError: errorMessage
4176
+ };
4177
+ events.push(skipEvent);
4178
+ }
4179
+ this.pendingGadgets.clear();
4180
+ }
4181
+ return events;
4182
+ }
3641
4183
  /**
3642
4184
  * Safely execute an observer, catching and logging any errors.
3643
4185
  * Observers are non-critical, so errors are logged but don't crash the system.
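A sketch of the onDependencySkipped controller contract implied by handleFailedDependency above (the context type is reconstructed from this hunk and registration of the controller is not shown here, so treat the wiring as an assumption):

```typescript
type DependencySkippedContext = {
  gadgetName: string;
  invocationId: string;
  failedDependency: string;
  failedDependencyError: string;
  logger: { warn(msg: string, meta?: unknown): void };
};

// The return value decides what happens to a gadget whose dependency failed.
const onDependencySkipped = async (ctx: DependencySkippedContext) => {
  ctx.logger.warn(
    `${ctx.gadgetName}/${ctx.invocationId} lost dependency ${ctx.failedDependency}: ${ctx.failedDependencyError}`
  );
  if (ctx.gadgetName === "merge_data") {
    return { action: "use_fallback", fallbackResult: "[]" }; // recorded as a completed result
  }
  if (ctx.invocationId.startsWith("optional_")) {
    return { action: "execute_anyway" };                     // run despite the failed dependency
  }
  return { action: "skip" };                                  // default: emits a gadget_skipped event
};
```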
@@ -4075,9 +4617,9 @@ var init_agent = __esm({
4075
4617
  if (msg.role === "user") {
4076
4618
  this.conversation.addUserMessage(msg.content);
4077
4619
  } else if (msg.role === "assistant") {
4078
- this.conversation.addAssistantMessage(msg.content);
4620
+ this.conversation.addAssistantMessage(extractText(msg.content));
4079
4621
  } else if (msg.role === "system") {
4080
- this.conversation.addUserMessage(`[System] ${msg.content}`);
4622
+ this.conversation.addUserMessage(`[System] ${extractText(msg.content)}`);
4081
4623
  }
4082
4624
  }
4083
4625
  }
@@ -4656,6 +5198,7 @@ var init_anthropic = __esm({
4656
5198
  "src/providers/anthropic.ts"() {
4657
5199
  "use strict";
4658
5200
  import_sdk = __toESM(require("@anthropic-ai/sdk"), 1);
5201
+ init_messages();
4659
5202
  init_anthropic_models();
4660
5203
  init_base_provider();
4661
5204
  init_constants2();
@@ -4694,7 +5237,7 @@ var init_anthropic = __esm({
4694
5237
  const systemMessages = messages.filter((message) => message.role === "system");
4695
5238
  const system = systemMessages.length > 0 ? systemMessages.map((m, index) => ({
4696
5239
  type: "text",
4697
- text: m.content,
5240
+ text: extractText(m.content),
4698
5241
  // Add cache_control to the LAST system message block
4699
5242
  ...index === systemMessages.length - 1 ? { cache_control: { type: "ephemeral" } } : {}
4700
5243
  })) : void 0;
@@ -4707,14 +5250,10 @@ var init_anthropic = __esm({
4707
5250
  );
4708
5251
  const conversation = nonSystemMessages.map((message, index) => ({
4709
5252
  role: message.role,
4710
- content: [
4711
- {
4712
- type: "text",
4713
- text: message.content,
4714
- // Add cache_control to the LAST user message
4715
- ...message.role === "user" && index === lastUserIndex ? { cache_control: { type: "ephemeral" } } : {}
4716
- }
4717
- ]
5253
+ content: this.convertToAnthropicContent(
5254
+ message.content,
5255
+ message.role === "user" && index === lastUserIndex
5256
+ )
4718
5257
  }));
4719
5258
  const defaultMaxTokens = spec?.maxOutputTokens ?? ANTHROPIC_DEFAULT_MAX_OUTPUT_TOKENS;
4720
5259
  const payload = {
@@ -4730,6 +5269,52 @@ var init_anthropic = __esm({
4730
5269
  };
4731
5270
  return payload;
4732
5271
  }
5272
+ /**
5273
+ * Convert llmist content to Anthropic's content block format.
5274
+ * Handles text, images (base64 only), and applies cache_control.
5275
+ */
5276
+ convertToAnthropicContent(content, addCacheControl) {
5277
+ const parts = normalizeContent(content);
5278
+ return parts.map((part, index) => {
5279
+ const isLastPart = index === parts.length - 1;
5280
+ const cacheControl = addCacheControl && isLastPart ? { cache_control: { type: "ephemeral" } } : {};
5281
+ if (part.type === "text") {
5282
+ return {
5283
+ type: "text",
5284
+ text: part.text,
5285
+ ...cacheControl
5286
+ };
5287
+ }
5288
+ if (part.type === "image") {
5289
+ return this.convertImagePart(part, cacheControl);
5290
+ }
5291
+ if (part.type === "audio") {
5292
+ throw new Error(
5293
+ "Anthropic does not support audio input. Use Google Gemini for audio processing."
5294
+ );
5295
+ }
5296
+ throw new Error(`Unsupported content type: ${part.type}`);
5297
+ });
5298
+ }
5299
+ /**
5300
+ * Convert an image content part to Anthropic's image block format.
5301
+ */
5302
+ convertImagePart(part, cacheControl) {
5303
+ if (part.source.type === "url") {
5304
+ throw new Error(
5305
+ "Anthropic does not support image URLs. Please provide base64-encoded image data instead."
5306
+ );
5307
+ }
5308
+ return {
5309
+ type: "image",
5310
+ source: {
5311
+ type: "base64",
5312
+ media_type: part.source.mediaType,
5313
+ data: part.source.data
5314
+ },
5315
+ ...cacheControl
5316
+ };
5317
+ }
4733
5318
  async executeStreamRequest(payload, signal) {
4734
5319
  const client = this.client;
4735
5320
  const stream2 = await client.messages.create(payload, signal ? { signal } : void 0);
@@ -4812,17 +5397,12 @@ var init_anthropic = __esm({
4812
5397
  async countTokens(messages, descriptor, _spec) {
4813
5398
  const client = this.client;
4814
5399
  const systemMessages = messages.filter((message) => message.role === "system");
4815
- const system = systemMessages.length > 0 ? systemMessages.map((m) => m.content).join("\n\n") : void 0;
5400
+ const system = systemMessages.length > 0 ? systemMessages.map((m) => extractText(m.content)).join("\n\n") : void 0;
4816
5401
  const conversation = messages.filter(
4817
5402
  (message) => message.role !== "system"
4818
5403
  ).map((message) => ({
4819
5404
  role: message.role,
4820
- content: [
4821
- {
4822
- type: "text",
4823
- text: message.content
4824
- }
4825
- ]
5405
+ content: this.convertToAnthropicContent(message.content, false)
4826
5406
  }));
4827
5407
  try {
4828
5408
  const response = await client.messages.countTokens({
@@ -4836,8 +5416,19 @@ var init_anthropic = __esm({
4836
5416
  `Token counting failed for ${descriptor.name}, using fallback estimation:`,
4837
5417
  error
4838
5418
  );
4839
- const totalChars = messages.reduce((sum, msg) => sum + (msg.content?.length ?? 0), 0);
4840
- return Math.ceil(totalChars / FALLBACK_CHARS_PER_TOKEN);
5419
+ let totalChars = 0;
5420
+ let imageCount = 0;
5421
+ for (const msg of messages) {
5422
+ const parts = normalizeContent(msg.content);
5423
+ for (const part of parts) {
5424
+ if (part.type === "text") {
5425
+ totalChars += part.text.length;
5426
+ } else if (part.type === "image") {
5427
+ imageCount++;
5428
+ }
5429
+ }
5430
+ }
5431
+ return Math.ceil(totalChars / FALLBACK_CHARS_PER_TOKEN) + imageCount * 1e3;
4841
5432
  }
4842
5433
  }
4843
5434
  };
@@ -5366,6 +5957,7 @@ var init_gemini = __esm({
5366
5957
  "src/providers/gemini.ts"() {
5367
5958
  "use strict";
5368
5959
  import_genai = require("@google/genai");
5960
+ init_messages();
5369
5961
  init_base_provider();
5370
5962
  init_constants2();
5371
5963
  init_gemini_image_models();
@@ -5535,7 +6127,7 @@ var init_gemini = __esm({
5535
6127
  };
5536
6128
  return {
5537
6129
  model: descriptor.name,
5538
- contents: this.convertContentsForNewSDK(contents),
6130
+ contents,
5539
6131
  config
5540
6132
  };
5541
6133
  }
@@ -5570,18 +6162,25 @@ var init_gemini = __esm({
5570
6162
  if (message.role === "system") {
5571
6163
  expandedMessages.push({
5572
6164
  role: "user",
5573
- content: message.content
6165
+ content: extractText(message.content)
5574
6166
  });
5575
6167
  expandedMessages.push({
5576
6168
  role: "assistant",
5577
6169
  content: "Understood."
5578
6170
  });
5579
6171
  } else {
5580
- expandedMessages.push(message);
6172
+ expandedMessages.push({
6173
+ role: message.role,
6174
+ content: message.content
6175
+ });
5581
6176
  }
5582
6177
  }
5583
6178
  return this.mergeConsecutiveMessages(expandedMessages);
5584
6179
  }
6180
+ /**
6181
+ * Merge consecutive messages with the same role (required by Gemini).
6182
+ * Handles multimodal content by converting to Gemini's part format.
6183
+ */
5585
6184
  mergeConsecutiveMessages(messages) {
5586
6185
  if (messages.length === 0) {
5587
6186
  return [];
@@ -5590,15 +6189,16 @@ var init_gemini = __esm({
5590
6189
  let currentGroup = null;
5591
6190
  for (const message of messages) {
5592
6191
  const geminiRole = GEMINI_ROLE_MAP[message.role];
6192
+ const geminiParts = this.convertToGeminiParts(message.content);
5593
6193
  if (currentGroup && currentGroup.role === geminiRole) {
5594
- currentGroup.parts.push({ text: message.content });
6194
+ currentGroup.parts.push(...geminiParts);
5595
6195
  } else {
5596
6196
  if (currentGroup) {
5597
6197
  result.push(currentGroup);
5598
6198
  }
5599
6199
  currentGroup = {
5600
6200
  role: geminiRole,
5601
- parts: [{ text: message.content }]
6201
+ parts: geminiParts
5602
6202
  };
5603
6203
  }
5604
6204
  }
@@ -5607,11 +6207,39 @@ var init_gemini = __esm({
5607
6207
  }
5608
6208
  return result;
5609
6209
  }
5610
- convertContentsForNewSDK(contents) {
5611
- return contents.map((content) => ({
5612
- role: content.role,
5613
- parts: content.parts.map((part) => ({ text: part.text }))
5614
- }));
6210
+ /**
6211
+ * Convert llmist content to Gemini's part format.
6212
+ * Handles text, images, and audio (Gemini supports all three).
6213
+ */
6214
+ convertToGeminiParts(content) {
6215
+ const parts = normalizeContent(content);
6216
+ return parts.map((part) => {
6217
+ if (part.type === "text") {
6218
+ return { text: part.text };
6219
+ }
6220
+ if (part.type === "image") {
6221
+ if (part.source.type === "url") {
6222
+ throw new Error(
6223
+ "Gemini does not support image URLs directly. Please provide base64-encoded image data."
6224
+ );
6225
+ }
6226
+ return {
6227
+ inlineData: {
6228
+ mimeType: part.source.mediaType,
6229
+ data: part.source.data
6230
+ }
6231
+ };
6232
+ }
6233
+ if (part.type === "audio") {
6234
+ return {
6235
+ inlineData: {
6236
+ mimeType: part.source.mediaType,
6237
+ data: part.source.data
6238
+ }
6239
+ };
6240
+ }
6241
+ throw new Error(`Unsupported content type: ${part.type}`);
6242
+ });
5615
6243
  }
5616
6244
  buildGenerationConfig(options) {
5617
6245
  const config = {};
@@ -5632,9 +6260,9 @@ var init_gemini = __esm({
5632
6260
  async *wrapStream(iterable) {
5633
6261
  const stream2 = iterable;
5634
6262
  for await (const chunk of stream2) {
5635
- const text = this.extractText(chunk);
5636
- if (text) {
5637
- yield { text, rawEvent: chunk };
6263
+ const text3 = this.extractText(chunk);
6264
+ if (text3) {
6265
+ yield { text: text3, rawEvent: chunk };
5638
6266
  }
5639
6267
  const finishReason = this.extractFinishReason(chunk);
5640
6268
  const usage = this.extractUsage(chunk);
@@ -5695,7 +6323,7 @@ var init_gemini = __esm({
5695
6323
  try {
5696
6324
  const response = await client.models.countTokens({
5697
6325
  model: descriptor.name,
5698
- contents: this.convertContentsForNewSDK(contents)
6326
+ contents
5699
6327
  // Note: systemInstruction not used - it's not supported by countTokens()
5700
6328
  // and would cause a 2100% token counting error
5701
6329
  });
@@ -5705,8 +6333,19 @@ var init_gemini = __esm({
5705
6333
  `Token counting failed for ${descriptor.name}, using fallback estimation:`,
5706
6334
  error
5707
6335
  );
5708
- const totalChars = messages.reduce((sum, msg) => sum + (msg.content?.length ?? 0), 0);
5709
- return Math.ceil(totalChars / FALLBACK_CHARS_PER_TOKEN);
6336
+ let totalChars = 0;
6337
+ let mediaCount = 0;
6338
+ for (const msg of messages) {
6339
+ const parts = normalizeContent(msg.content);
6340
+ for (const part of parts) {
6341
+ if (part.type === "text") {
6342
+ totalChars += part.text.length;
6343
+ } else if (part.type === "image" || part.type === "audio") {
6344
+ mediaCount++;
6345
+ }
6346
+ }
6347
+ }
6348
+ return Math.ceil(totalChars / FALLBACK_CHARS_PER_TOKEN) + mediaCount * 258;
5710
6349
  }
5711
6350
  }
5712
6351
  };
@@ -6349,6 +6988,7 @@ var init_openai = __esm({
6349
6988
  "use strict";
6350
6989
  import_openai = __toESM(require("openai"), 1);
6351
6990
  import_tiktoken = require("tiktoken");
6991
+ init_messages();
6352
6992
  init_base_provider();
6353
6993
  init_constants2();
6354
6994
  init_openai_image_models();
@@ -6456,11 +7096,7 @@ var init_openai = __esm({
6456
7096
  const sanitizedExtra = sanitizeExtra(extra, shouldIncludeTemperature);
6457
7097
  return {
6458
7098
  model: descriptor.name,
6459
- messages: messages.map((message) => ({
6460
- role: ROLE_MAP[message.role],
6461
- content: message.content,
6462
- name: message.name
6463
- })),
7099
+ messages: messages.map((message) => this.convertToOpenAIMessage(message)),
6464
7100
  // Only set max_completion_tokens if explicitly provided
6465
7101
  // Otherwise let the API use "as much as fits" in the context window
6466
7102
  ...maxTokens !== void 0 ? { max_completion_tokens: maxTokens } : {},
@@ -6472,6 +7108,77 @@ var init_openai = __esm({
6472
7108
  ...shouldIncludeTemperature ? { temperature } : {}
6473
7109
  };
6474
7110
  }
7111
+ /**
7112
+ * Convert an LLMMessage to OpenAI's ChatCompletionMessageParam.
7113
+ * Handles role-specific content type requirements:
7114
+ * - system/assistant: string content only
7115
+ * - user: string or multimodal array content
7116
+ */
7117
+ convertToOpenAIMessage(message) {
7118
+ const role = ROLE_MAP[message.role];
7119
+ if (role === "user") {
7120
+ const content = this.convertToOpenAIContent(message.content);
7121
+ return {
7122
+ role: "user",
7123
+ content,
7124
+ ...message.name ? { name: message.name } : {}
7125
+ };
7126
+ }
7127
+ const textContent = typeof message.content === "string" ? message.content : extractText(message.content);
7128
+ if (role === "system") {
7129
+ return {
7130
+ role: "system",
7131
+ content: textContent,
7132
+ ...message.name ? { name: message.name } : {}
7133
+ };
7134
+ }
7135
+ return {
7136
+ role: "assistant",
7137
+ content: textContent,
7138
+ ...message.name ? { name: message.name } : {}
7139
+ };
7140
+ }
7141
+ /**
7142
+ * Convert llmist content to OpenAI's content format.
7143
+ * Optimizes by returning string for text-only content, array for multimodal.
7144
+ */
7145
+ convertToOpenAIContent(content) {
7146
+ if (typeof content === "string") {
7147
+ return content;
7148
+ }
7149
+ return content.map((part) => {
7150
+ if (part.type === "text") {
7151
+ return { type: "text", text: part.text };
7152
+ }
7153
+ if (part.type === "image") {
7154
+ return this.convertImagePart(part);
7155
+ }
7156
+ if (part.type === "audio") {
7157
+ throw new Error(
7158
+ "OpenAI chat completions do not support audio input. Use Whisper for transcription or Gemini for audio understanding."
7159
+ );
7160
+ }
7161
+ throw new Error(`Unsupported content type: ${part.type}`);
7162
+ });
7163
+ }
7164
+ /**
7165
+ * Convert an image content part to OpenAI's image_url format.
7166
+ * Supports both URLs and base64 data URLs.
7167
+ */
7168
+ convertImagePart(part) {
7169
+ if (part.source.type === "url") {
7170
+ return {
7171
+ type: "image_url",
7172
+ image_url: { url: part.source.url }
7173
+ };
7174
+ }
7175
+ return {
7176
+ type: "image_url",
7177
+ image_url: {
7178
+ url: `data:${part.source.mediaType};base64,${part.source.data}`
7179
+ }
7180
+ };
7181
+ }
6475
7182
  async executeStreamRequest(payload, signal) {
6476
7183
  const client = this.client;
6477
7184
  const stream2 = await client.chat.completions.create(payload, signal ? { signal } : void 0);
@@ -6480,9 +7187,9 @@ var init_openai = __esm({
6480
7187
  async *wrapStream(iterable) {
6481
7188
  const stream2 = iterable;
6482
7189
  for await (const chunk of stream2) {
6483
- const text = chunk.choices.map((choice) => choice.delta?.content ?? "").join("");
6484
- if (text) {
6485
- yield { text, rawEvent: chunk };
7190
+ const text3 = chunk.choices.map((choice) => choice.delta?.content ?? "").join("");
7191
+ if (text3) {
7192
+ yield { text: text3, rawEvent: chunk };
6486
7193
  }
6487
7194
  const finishReason = chunk.choices.find((choice) => choice.finish_reason)?.finish_reason;
6488
7195
  const usage = chunk.usage ? {
@@ -6530,17 +7237,26 @@ var init_openai = __esm({
6530
7237
  }
6531
7238
  try {
6532
7239
  let tokenCount = 0;
7240
+ let imageCount = 0;
6533
7241
  for (const message of messages) {
6534
7242
  tokenCount += OPENAI_MESSAGE_OVERHEAD_TOKENS;
6535
7243
  const roleText = ROLE_MAP[message.role];
6536
7244
  tokenCount += encoding.encode(roleText).length;
6537
- tokenCount += encoding.encode(message.content ?? "").length;
7245
+ const textContent = extractText(message.content);
7246
+ tokenCount += encoding.encode(textContent).length;
7247
+ const parts = normalizeContent(message.content);
7248
+ for (const part of parts) {
7249
+ if (part.type === "image") {
7250
+ imageCount++;
7251
+ }
7252
+ }
6538
7253
  if (message.name) {
6539
7254
  tokenCount += encoding.encode(message.name).length;
6540
7255
  tokenCount += OPENAI_NAME_FIELD_OVERHEAD_TOKENS;
6541
7256
  }
6542
7257
  }
6543
7258
  tokenCount += OPENAI_REPLY_PRIMING_TOKENS;
7259
+ tokenCount += imageCount * 765;
6544
7260
  return tokenCount;
6545
7261
  } finally {
6546
7262
  encoding.free();
@@ -6550,8 +7266,19 @@ var init_openai = __esm({
6550
7266
  `Token counting failed for ${descriptor.name}, using fallback estimation:`,
6551
7267
  error
6552
7268
  );
6553
- const totalChars = messages.reduce((sum, msg) => sum + (msg.content?.length ?? 0), 0);
6554
- return Math.ceil(totalChars / FALLBACK_CHARS_PER_TOKEN);
7269
+ let totalChars = 0;
7270
+ let imageCount = 0;
7271
+ for (const msg of messages) {
7272
+ const parts = normalizeContent(msg.content);
7273
+ for (const part of parts) {
7274
+ if (part.type === "text") {
7275
+ totalChars += part.text.length;
7276
+ } else if (part.type === "image") {
7277
+ imageCount++;
7278
+ }
7279
+ }
7280
+ }
7281
+ return Math.ceil(totalChars / FALLBACK_CHARS_PER_TOKEN) + imageCount * 765;
6555
7282
  }
6556
7283
  }
6557
7284
  };
@@ -6974,6 +7701,138 @@ var init_text = __esm({
6974
7701
  }
6975
7702
  });
6976
7703
 
7704
+ // src/core/namespaces/vision.ts
7705
+ var VisionNamespace;
7706
+ var init_vision = __esm({
7707
+ "src/core/namespaces/vision.ts"() {
7708
+ "use strict";
7709
+ init_input_content();
7710
+ init_messages();
7711
+ VisionNamespace = class {
7712
+ constructor(client) {
7713
+ this.client = client;
7714
+ }
7715
+ /**
7716
+ * Build a message builder with the image content attached.
7717
+ * Handles URLs, data URLs, base64 strings, and binary buffers.
7718
+ */
7719
+ buildImageMessage(options) {
7720
+ const builder = new LLMMessageBuilder();
7721
+ if (options.systemPrompt) {
7722
+ builder.addSystem(options.systemPrompt);
7723
+ }
7724
+ if (typeof options.image === "string") {
7725
+ if (options.image.startsWith("http://") || options.image.startsWith("https://")) {
7726
+ builder.addUserWithImageUrl(options.prompt, options.image);
7727
+ } else if (isDataUrl(options.image)) {
7728
+ const parsed = parseDataUrl(options.image);
7729
+ if (!parsed) {
7730
+ throw new Error("Invalid data URL format");
7731
+ }
7732
+ builder.addUserWithImage(
7733
+ options.prompt,
7734
+ parsed.data,
7735
+ parsed.mimeType
7736
+ );
7737
+ } else {
7738
+ const buffer = Buffer.from(options.image, "base64");
7739
+ builder.addUserWithImage(options.prompt, buffer, options.mimeType);
7740
+ }
7741
+ } else {
7742
+ builder.addUserWithImage(options.prompt, options.image, options.mimeType);
7743
+ }
7744
+ return builder;
7745
+ }
7746
+ /**
7747
+ * Stream the response and collect text and usage information.
7748
+ */
7749
+ async streamAndCollect(options, builder) {
7750
+ let response = "";
7751
+ let finalUsage;
7752
+ for await (const chunk of this.client.stream({
7753
+ model: options.model,
7754
+ messages: builder.build(),
7755
+ maxTokens: options.maxTokens,
7756
+ temperature: options.temperature
7757
+ })) {
7758
+ response += chunk.text;
7759
+ if (chunk.usage) {
7760
+ finalUsage = {
7761
+ inputTokens: chunk.usage.inputTokens,
7762
+ outputTokens: chunk.usage.outputTokens,
7763
+ totalTokens: chunk.usage.totalTokens
7764
+ };
7765
+ }
7766
+ }
7767
+ return { text: response.trim(), usage: finalUsage };
7768
+ }
7769
+ /**
7770
+ * Analyze an image with a vision-capable model.
7771
+ * Returns the analysis as a string.
7772
+ *
7773
+ * @param options - Vision analysis options
7774
+ * @returns Promise resolving to the analysis text
7775
+ * @throws Error if the image format is unsupported or model doesn't support vision
7776
+ *
7777
+ * @example
7778
+ * ```typescript
7779
+ * // From file
7780
+ * const result = await llmist.vision.analyze({
7781
+ * model: "gpt-4o",
7782
+ * image: await fs.readFile("photo.jpg"),
7783
+ * prompt: "What's in this image?",
7784
+ * });
7785
+ *
7786
+ * // From URL (OpenAI only)
7787
+ * const result = await llmist.vision.analyze({
7788
+ * model: "gpt-4o",
7789
+ * image: "https://example.com/image.jpg",
7790
+ * prompt: "Describe this image",
7791
+ * });
7792
+ * ```
7793
+ */
7794
+ async analyze(options) {
7795
+ const builder = this.buildImageMessage(options);
7796
+ const { text: text3 } = await this.streamAndCollect(options, builder);
7797
+ return text3;
7798
+ }
7799
+ /**
7800
+ * Analyze an image and return detailed result with usage info.
7801
+ *
7802
+ * @param options - Vision analysis options
7803
+ * @returns Promise resolving to the analysis result with usage info
7804
+ */
7805
+ async analyzeWithUsage(options) {
7806
+ const builder = this.buildImageMessage(options);
7807
+ const { text: text3, usage } = await this.streamAndCollect(options, builder);
7808
+ return {
7809
+ text: text3,
7810
+ model: options.model,
7811
+ usage
7812
+ };
7813
+ }
7814
+ /**
7815
+ * Check if a model supports vision/image input.
7816
+ *
7817
+ * @param modelId - Model ID to check
7818
+ * @returns True if the model supports vision
7819
+ */
7820
+ supportsModel(modelId) {
7821
+ const spec = this.client.modelRegistry.getModelSpec(modelId);
7822
+ return spec?.features?.vision === true;
7823
+ }
7824
+ /**
7825
+ * List all models that support vision.
7826
+ *
7827
+ * @returns Array of model IDs that support vision
7828
+ */
7829
+ listModels() {
7830
+ return this.client.modelRegistry.listModels().filter((spec) => spec.features?.vision === true).map((spec) => spec.modelId);
7831
+ }
7832
+ };
7833
+ }
7834
+ });
7835
+
6977
7836
  // src/core/options.ts
6978
7837
  var ModelIdentifierParser;
6979
7838
  var init_options = __esm({
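A hedged sketch of the new vision namespace in use (client construction, the model ID, and the file path are assumptions for illustration):

```typescript
import { LLMist } from "llmist";
import { readFile } from "node:fs/promises";

const client = new LLMist();                      // assumes provider API keys in the environment
console.log(client.vision.listModels());          // model IDs whose spec sets features.vision

const result = await client.vision.analyzeWithUsage({
  model: "gpt-4o",                                 // any vision-capable model
  image: await readFile("photo.jpg"),              // Buffer, base64 string, data URL, or http(s) URL
  prompt: "What's in this image?",
});
console.log(result.text, result.usage?.totalTokens);
```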
@@ -7018,6 +7877,7 @@ var init_client = __esm({
7018
7877
  init_image();
7019
7878
  init_speech();
7020
7879
  init_text();
7880
+ init_vision();
7021
7881
  init_options();
7022
7882
  init_quick_methods();
7023
7883
  LLMist = class _LLMist {
@@ -7029,6 +7889,7 @@ var init_client = __esm({
7029
7889
  text;
7030
7890
  image;
7031
7891
  speech;
7892
+ vision;
7032
7893
  constructor(...args) {
7033
7894
  let adapters = [];
7034
7895
  let defaultProvider;
@@ -7079,6 +7940,7 @@ var init_client = __esm({
7079
7940
  this.text = new TextNamespace(this);
7080
7941
  this.image = new ImageNamespace(this.adapters, this.defaultProvider);
7081
7942
  this.speech = new SpeechNamespace(this.adapters, this.defaultProvider);
7943
+ this.vision = new VisionNamespace(this);
7082
7944
  }
7083
7945
  stream(options) {
7084
7946
  const descriptor = this.parser.parse(options.model);
@@ -7263,6 +8125,7 @@ var init_builder = __esm({
7263
8125
  "src/agent/builder.ts"() {
7264
8126
  "use strict";
7265
8127
  init_constants();
8128
+ init_input_content();
7266
8129
  init_model_shortcuts();
7267
8130
  init_registry();
7268
8131
  init_agent();
@@ -7910,13 +8773,17 @@ ${endPrefix}`
7910
8773
  * }
7911
8774
  * ```
7912
8775
  */
7913
- ask(userPrompt) {
8776
+ /**
8777
+ * Build AgentOptions with the given user prompt.
8778
+ * Centralizes options construction for ask(), askWithImage(), and askWithContent().
8779
+ */
8780
+ buildAgentOptions(userPrompt) {
7914
8781
  if (!this.client) {
7915
8782
  const { LLMist: LLMistClass } = (init_client(), __toCommonJS(client_exports));
7916
8783
  this.client = new LLMistClass();
7917
8784
  }
7918
8785
  const registry = GadgetRegistry.from(this.gadgets);
7919
- const options = {
8786
+ return {
7920
8787
  client: this.client,
7921
8788
  model: this.model ?? "openai:gpt-5-nano",
7922
8789
  systemPrompt: this.systemPrompt,
@@ -7942,6 +8809,83 @@ ${endPrefix}`
7942
8809
  compactionConfig: this.compactionConfig,
7943
8810
  signal: this.signal
7944
8811
  };
8812
+ }
8813
+ ask(userPrompt) {
8814
+ const options = this.buildAgentOptions(userPrompt);
8815
+ return new Agent(AGENT_INTERNAL_KEY, options);
8816
+ }
8817
+ /**
8818
+ * Build and create the agent with a multimodal user prompt (text + image).
8819
+ * Returns the Agent instance ready to run.
8820
+ *
8821
+ * @param textPrompt - Text prompt describing what to do with the image
8822
+ * @param imageData - Image data (Buffer, Uint8Array, or base64 string)
8823
+ * @param mimeType - Optional MIME type (auto-detected if not provided)
8824
+ * @returns Configured Agent instance
8825
+ *
8826
+ * @example
8827
+ * ```typescript
8828
+ * const agent = LLMist.createAgent()
8829
+ * .withModel("gpt-4o")
8830
+ * .withSystem("You analyze images")
8831
+ * .askWithImage(
8832
+ * "What's in this image?",
8833
+ * await fs.readFile("photo.jpg")
8834
+ * );
8835
+ *
8836
+ * for await (const event of agent.run()) {
8837
+ * // handle events
8838
+ * }
8839
+ * ```
8840
+ */
8841
+ askWithImage(textPrompt, imageData, mimeType) {
8842
+ const imageBuffer = typeof imageData === "string" ? Buffer.from(imageData, "base64") : imageData;
8843
+ const detectedMime = mimeType ?? detectImageMimeType(imageBuffer);
8844
+ if (!detectedMime) {
8845
+ throw new Error(
8846
+ "Could not detect image MIME type. Please provide the mimeType parameter explicitly."
8847
+ );
8848
+ }
8849
+ const userContent = [
8850
+ text(textPrompt),
8851
+ {
8852
+ type: "image",
8853
+ source: {
8854
+ type: "base64",
8855
+ mediaType: detectedMime,
8856
+ data: toBase64(imageBuffer)
8857
+ }
8858
+ }
8859
+ ];
8860
+ const options = this.buildAgentOptions(userContent);
8861
+ return new Agent(AGENT_INTERNAL_KEY, options);
8862
+ }
8863
+ /**
8864
+ * Build and return an Agent configured with multimodal content.
8865
+ * More flexible than askWithImage - accepts any combination of content parts.
8866
+ *
8867
+ * @param content - Array of content parts (text, images, audio)
8868
+ * @returns A configured Agent ready for execution
8869
+ *
8870
+ * @example
8871
+ * ```typescript
8872
+ * import { text, imageFromBuffer, audioFromBuffer } from "llmist";
8873
+ *
8874
+ * const agent = LLMist.createAgent()
8875
+ * .withModel("gemini:gemini-2.5-flash")
8876
+ * .askWithContent([
8877
+ * text("Describe this image and transcribe the audio:"),
8878
+ * imageFromBuffer(imageData),
8879
+ * audioFromBuffer(audioData),
8880
+ * ]);
8881
+ *
8882
+ * for await (const event of agent.run()) {
8883
+ * // handle events
8884
+ * }
8885
+ * ```
8886
+ */
8887
+ askWithContent(content) {
8888
+ const options = this.buildAgentOptions(content);
7945
8889
  return new Agent(AGENT_INTERNAL_KEY, options);
7946
8890
  }
7947
8891
  /**
@@ -8087,6 +9031,8 @@ __export(index_exports, {
8087
9031
  StreamParser: () => StreamParser,
8088
9032
  StreamProcessor: () => StreamProcessor,
8089
9033
  SummarizationStrategy: () => SummarizationStrategy,
9034
+ audioFromBase64: () => audioFromBase64,
9035
+ audioFromBuffer: () => audioFromBuffer,
8090
9036
  collectEvents: () => collectEvents,
8091
9037
  collectText: () => collectText,
8092
9038
  complete: () => complete,
@@ -8102,20 +9048,34 @@ __export(index_exports, {
8102
9048
  createOpenAIProviderFromEnv: () => createOpenAIProviderFromEnv,
8103
9049
  createTextMockStream: () => createTextMockStream,
8104
9050
  defaultLogger: () => defaultLogger,
9051
+ detectAudioMimeType: () => detectAudioMimeType,
9052
+ detectImageMimeType: () => detectImageMimeType,
8105
9053
  discoverProviderAdapters: () => discoverProviderAdapters,
9054
+ extractText: () => extractText,
8106
9055
  getMockManager: () => getMockManager,
8107
9056
  getModelId: () => getModelId,
8108
9057
  getProvider: () => getProvider,
8109
9058
  hasProviderPrefix: () => hasProviderPrefix,
9059
+ imageFromBase64: () => imageFromBase64,
9060
+ imageFromBuffer: () => imageFromBuffer,
9061
+ imageFromUrl: () => imageFromUrl,
9062
+ isAudioPart: () => isAudioPart,
9063
+ isDataUrl: () => isDataUrl,
9064
+ isImagePart: () => isImagePart,
9065
+ isTextPart: () => isTextPart,
8110
9066
  iterationProgressHint: () => iterationProgressHint,
8111
9067
  mockLLM: () => mockLLM,
9068
+ normalizeContent: () => normalizeContent,
8112
9069
  parallelGadgetHint: () => parallelGadgetHint,
9070
+ parseDataUrl: () => parseDataUrl,
8113
9071
  resolveHintTemplate: () => resolveHintTemplate,
8114
9072
  resolveModel: () => resolveModel,
8115
9073
  resolvePromptTemplate: () => resolvePromptTemplate,
8116
9074
  resolveRulesTemplate: () => resolveRulesTemplate,
8117
9075
  runWithHandlers: () => runWithHandlers,
8118
9076
  stream: () => stream,
9077
+ text: () => text,
9078
+ toBase64: () => toBase64,
8119
9079
  validateAndApplyDefaults: () => validateAndApplyDefaults,
8120
9080
  validateGadgetParams: () => validateGadgetParams,
8121
9081
  z: () => import_zod2.z
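The export hunk above makes the new content helpers part of the public API. A small sketch exercising a few of them; the file name is illustrative and the file is assumed to be a real PNG:

```typescript
// Sketch: the helpers exported above, used standalone.
import { readFile } from "node:fs/promises";
import { text, imageFromBuffer, detectImageMimeType, isImagePart } from "llmist";

const photo = await readFile("photo.png");      // assumed to contain PNG bytes
console.log(detectImageMimeType(photo));        // "image/png" via magic-byte sniffing

const parts = [text("Caption this photo:"), imageFromBuffer(photo)];
console.log(parts.filter(isImagePart).length);  // 1
```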
@@ -9016,6 +9976,7 @@ function createHints(config) {
9016
9976
 
9017
9977
  // src/index.ts
9018
9978
  init_client();
9979
+ init_input_content();
9019
9980
  init_messages();
9020
9981
  init_model_registry();
9021
9982
  init_model_shortcuts();
@@ -9263,9 +10224,9 @@ function sleep(ms) {
9263
10224
  function generateInvocationId() {
9264
10225
  return `inv-${Date.now()}-${Math.random().toString(36).substring(2, 9)}`;
9265
10226
  }
9266
- function splitIntoChunks(text, minChunkSize = 5, maxChunkSize = 30) {
10227
+ function splitIntoChunks(text3, minChunkSize = 5, maxChunkSize = 30) {
9267
10228
  const chunks = [];
9268
- let remaining = text;
10229
+ let remaining = text3;
9269
10230
  while (remaining.length > 0) {
9270
10231
  const chunkSize = Math.min(
9271
10232
  Math.floor(Math.random() * (maxChunkSize - minChunkSize + 1)) + minChunkSize,
@@ -9324,17 +10285,17 @@ ${String(value)}
9324
10285
  return result;
9325
10286
  }
9326
10287
  function formatGadgetCalls(gadgetCalls) {
9327
- let text = "";
10288
+ let text3 = "";
9328
10289
  const calls = [];
9329
10290
  for (const call of gadgetCalls) {
9330
10291
  const invocationId = call.invocationId ?? generateInvocationId();
9331
10292
  calls.push({ name: call.gadgetName, invocationId });
9332
10293
  const blockParams = serializeToBlockFormat(call.parameters);
9333
- text += `
10294
+ text3 += `
9334
10295
  ${GADGET_START_PREFIX}${call.gadgetName}
9335
10296
  ${blockParams}${GADGET_END_PREFIX}`;
9336
10297
  }
9337
- return { text, calls };
10298
+ return { text: text3, calls };
9338
10299
  }
9339
10300
  async function* createMockStream(response) {
9340
10301
  if (response.delayMs) {
@@ -9374,9 +10335,9 @@ async function* createMockStream(response) {
9374
10335
  };
9375
10336
  }
9376
10337
  }
9377
- function createTextMockStream(text, options) {
10338
+ function createTextMockStream(text3, options) {
9378
10339
  return createMockStream({
9379
- text,
10340
+ text: text3,
9380
10341
  delayMs: options?.delayMs,
9381
10342
  streamDelayMs: options?.streamDelayMs,
9382
10343
  usage: options?.usage,
@@ -9393,10 +10354,10 @@ var MockProviderAdapter = class {
9393
10354
  constructor(options) {
9394
10355
  this.mockManager = getMockManager(options);
9395
10356
  }
9396
- supports(descriptor) {
10357
+ supports(_descriptor) {
9397
10358
  return true;
9398
10359
  }
9399
- stream(options, descriptor, spec) {
10360
+ stream(options, descriptor, _spec) {
9400
10361
  const context = {
9401
10362
  model: options.model,
9402
10363
  provider: descriptor.provider,
@@ -9407,20 +10368,154 @@ var MockProviderAdapter = class {
9407
10368
  return this.createMockStreamFromContext(context);
9408
10369
  }
9409
10370
  async *createMockStreamFromContext(context) {
9410
- try {
9411
- const mockResponse = await this.mockManager.findMatch(context);
9412
- if (!mockResponse) {
9413
- yield {
9414
- text: "",
9415
- finishReason: "stop",
9416
- usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
9417
- };
9418
- return;
9419
- }
9420
- yield* createMockStream(mockResponse);
9421
- } catch (error) {
9422
- throw error;
10371
+ const mockResponse = await this.mockManager.findMatch(context);
10372
+ if (!mockResponse) {
10373
+ yield {
10374
+ text: "",
10375
+ finishReason: "stop",
10376
+ usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
10377
+ };
10378
+ return;
10379
+ }
10380
+ yield* createMockStream(mockResponse);
10381
+ }
10382
+ // ==========================================================================
10383
+ // Image Generation Support
10384
+ // ==========================================================================
10385
+ /**
10386
+ * Check if this adapter supports image generation for a given model.
10387
+ * Always returns true; the actual mock lookup (and any missing-mock error) happens in generateImage.
10388
+ */
10389
+ supportsImageGeneration(_modelId) {
10390
+ return true;
10391
+ }
10392
+ /**
10393
+ * Generate mock images based on registered mocks.
10394
+ *
10395
+ * @param options - Image generation options
10396
+ * @returns Mock image generation result
10397
+ */
10398
+ async generateImage(options) {
10399
+ const context = {
10400
+ model: options.model,
10401
+ provider: "mock",
10402
+ modelName: options.model,
10403
+ options: {
10404
+ model: options.model,
10405
+ messages: [{ role: "user", content: options.prompt }]
10406
+ },
10407
+ messages: [{ role: "user", content: options.prompt }]
10408
+ };
10409
+ const mockResponse = await this.mockManager.findMatch(context);
10410
+ if (!mockResponse?.images || mockResponse.images.length === 0) {
10411
+ throw new Error(
10412
+ `No mock registered for image generation with model "${options.model}". Use mockLLM().forModel("${options.model}").returnsImage(...).register() to add one.`
10413
+ );
10414
+ }
10415
+ return this.createImageResult(options, mockResponse);
10416
+ }
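The error message above documents the intended workflow: register an image mock first, then generateImage resolves it with zero cost. A minimal sketch with placeholder base64 data; it assumes createMockAdapter (defined later in this bundle) is reachable from the package entry point, which is not visible in this hunk:

```typescript
// Sketch only: the model name and base64 payload are placeholders, and the
// createMockAdapter export is an assumption.
import { mockLLM, createMockAdapter } from "llmist";

mockLLM()
  .forModel("dall-e-3")
  .returnsImage("iVBORw0KGgo=", "image/png") // base64 strings need an explicit MIME type
  .register();

const adapter = createMockAdapter();
const result = await adapter.generateImage({ model: "dall-e-3", prompt: "a sunset over water" });
console.log(result.usage.imagesGenerated, result.cost); // 1 0
```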
10417
+ /**
10418
+ * Transform mock response into ImageGenerationResult format.
10419
+ *
10420
+ * @param options - Original image generation options
10421
+ * @param mockResponse - Mock response containing image data
10422
+ * @returns ImageGenerationResult with mock data and zero cost
10423
+ */
10424
+ createImageResult(options, mockResponse) {
10425
+ const images = mockResponse.images ?? [];
10426
+ return {
10427
+ images: images.map((img) => ({
10428
+ b64Json: img.data,
10429
+ revisedPrompt: img.revisedPrompt
10430
+ })),
10431
+ model: options.model,
10432
+ usage: {
10433
+ imagesGenerated: images.length,
10434
+ size: options.size ?? "1024x1024",
10435
+ quality: options.quality ?? "standard"
10436
+ },
10437
+ cost: 0
10438
+ // Mock cost is always 0
10439
+ };
10440
+ }
10441
+ // ==========================================================================
10442
+ // Speech Generation Support
10443
+ // ==========================================================================
10444
+ /**
10445
+ * Check if this adapter supports speech generation for a given model.
10446
+ * Always returns true; the actual mock lookup (and any missing-mock error) happens in generateSpeech.
10447
+ */
10448
+ supportsSpeechGeneration(_modelId) {
10449
+ return true;
10450
+ }
10451
+ /**
10452
+ * Generate mock speech based on registered mocks.
10453
+ *
10454
+ * @param options - Speech generation options
10455
+ * @returns Mock speech generation result
10456
+ */
10457
+ async generateSpeech(options) {
10458
+ const context = {
10459
+ model: options.model,
10460
+ provider: "mock",
10461
+ modelName: options.model,
10462
+ options: {
10463
+ model: options.model,
10464
+ messages: [{ role: "user", content: options.input }]
10465
+ },
10466
+ messages: [{ role: "user", content: options.input }]
10467
+ };
10468
+ const mockResponse = await this.mockManager.findMatch(context);
10469
+ if (!mockResponse?.audio) {
10470
+ throw new Error(
10471
+ `No mock registered for speech generation with model "${options.model}". Use mockLLM().forModel("${options.model}").returnsAudio(...).register() to add one.`
10472
+ );
9423
10473
  }
10474
+ return this.createSpeechResult(options, mockResponse);
10475
+ }
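generateSpeech follows the same pattern, with the mock's base64 audio decoded into an ArrayBuffer by createSpeechResult below. A sketch under the same assumptions as the image example; "UklGRg==" is a four-byte placeholder:

```typescript
// Sketch only: the model name and audio bytes are placeholders; createMockAdapter
// is assumed to be exported from the package entry point.
import { mockLLM, createMockAdapter } from "llmist";

mockLLM()
  .forModel("tts-1")
  .returnsAudio("UklGRg==", "audio/wav") // explicit MIME type required for base64 strings
  .register();

const adapter = createMockAdapter();
const speech = await adapter.generateSpeech({ model: "tts-1", input: "Hello there" });
console.log(speech.format, speech.usage.characterCount, speech.cost); // "wav" 11 0
```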
10476
+ /**
10477
+ * Transform mock response into SpeechGenerationResult format.
10478
+ * Converts base64 audio data to ArrayBuffer.
10479
+ *
10480
+ * @param options - Original speech generation options
10481
+ * @param mockResponse - Mock response containing audio data
10482
+ * @returns SpeechGenerationResult with mock data and zero cost
10483
+ */
10484
+ createSpeechResult(options, mockResponse) {
10485
+ const audio = mockResponse.audio;
10486
+ const binaryString = atob(audio.data);
10487
+ const bytes = new Uint8Array(binaryString.length);
10488
+ for (let i = 0; i < binaryString.length; i++) {
10489
+ bytes[i] = binaryString.charCodeAt(i);
10490
+ }
10491
+ const format = this.mimeTypeToAudioFormat(audio.mimeType);
10492
+ return {
10493
+ audio: bytes.buffer,
10494
+ model: options.model,
10495
+ usage: {
10496
+ characterCount: options.input.length
10497
+ },
10498
+ cost: 0,
10499
+ // Mock cost is always 0
10500
+ format
10501
+ };
10502
+ }
10503
+ /**
10504
+ * Map MIME type to audio format for SpeechGenerationResult.
10505
+ * Defaults to "mp3" for unknown MIME types.
10506
+ *
10507
+ * @param mimeType - Audio MIME type string
10508
+ * @returns Audio format identifier
10509
+ */
10510
+ mimeTypeToAudioFormat(mimeType) {
10511
+ const mapping = {
10512
+ "audio/mp3": "mp3",
10513
+ "audio/mpeg": "mp3",
10514
+ "audio/wav": "wav",
10515
+ "audio/webm": "opus",
10516
+ "audio/ogg": "opus"
10517
+ };
10518
+ return mapping[mimeType] ?? "mp3";
9424
10519
  }
9425
10520
  };
9426
10521
  function createMockAdapter(options) {
@@ -9428,6 +10523,20 @@ function createMockAdapter(options) {
9428
10523
  }
9429
10524
 
9430
10525
  // src/testing/mock-builder.ts
10526
+ init_input_content();
10527
+ init_messages();
10528
+ function hasImageContent(content) {
10529
+ if (typeof content === "string") return false;
10530
+ return content.some((part) => isImagePart(part));
10531
+ }
10532
+ function hasAudioContent(content) {
10533
+ if (typeof content === "string") return false;
10534
+ return content.some((part) => isAudioPart(part));
10535
+ }
10536
+ function countImages(content) {
10537
+ if (typeof content === "string") return 0;
10538
+ return content.filter((part) => isImagePart(part)).length;
10539
+ }
9431
10540
  var MockBuilder = class {
9432
10541
  matchers = [];
9433
10542
  response = {};
@@ -9490,9 +10599,9 @@ var MockBuilder = class {
9490
10599
  * @example
9491
10600
  * mockLLM().whenMessageContains('hello')
9492
10601
  */
9493
- whenMessageContains(text) {
10602
+ whenMessageContains(text3) {
9494
10603
  this.matchers.push(
9495
- (ctx) => ctx.messages.some((msg) => msg.content?.toLowerCase().includes(text.toLowerCase()))
10604
+ (ctx) => ctx.messages.some((msg) => extractText(msg.content).toLowerCase().includes(text3.toLowerCase()))
9496
10605
  );
9497
10606
  return this;
9498
10607
  }
@@ -9502,10 +10611,11 @@ var MockBuilder = class {
9502
10611
  * @example
9503
10612
  * mockLLM().whenLastMessageContains('goodbye')
9504
10613
  */
9505
- whenLastMessageContains(text) {
10614
+ whenLastMessageContains(text3) {
9506
10615
  this.matchers.push((ctx) => {
9507
10616
  const lastMsg = ctx.messages[ctx.messages.length - 1];
9508
- return lastMsg?.content?.toLowerCase().includes(text.toLowerCase()) ?? false;
10617
+ if (!lastMsg) return false;
10618
+ return extractText(lastMsg.content).toLowerCase().includes(text3.toLowerCase());
9509
10619
  });
9510
10620
  return this;
9511
10621
  }
@@ -9516,7 +10626,7 @@ var MockBuilder = class {
9516
10626
  * mockLLM().whenMessageMatches(/calculate \d+/)
9517
10627
  */
9518
10628
  whenMessageMatches(regex) {
9519
- this.matchers.push((ctx) => ctx.messages.some((msg) => regex.test(msg.content ?? "")));
10629
+ this.matchers.push((ctx) => ctx.messages.some((msg) => regex.test(extractText(msg.content))));
9520
10630
  return this;
9521
10631
  }
9522
10632
  /**
@@ -9525,10 +10635,10 @@ var MockBuilder = class {
9525
10635
  * @example
9526
10636
  * mockLLM().whenRoleContains('system', 'You are a helpful assistant')
9527
10637
  */
9528
- whenRoleContains(role, text) {
10638
+ whenRoleContains(role, text3) {
9529
10639
  this.matchers.push(
9530
10640
  (ctx) => ctx.messages.some(
9531
- (msg) => msg.role === role && msg.content?.toLowerCase().includes(text.toLowerCase())
10641
+ (msg) => msg.role === role && extractText(msg.content).toLowerCase().includes(text3.toLowerCase())
9532
10642
  )
9533
10643
  );
9534
10644
  return this;
@@ -9556,6 +10666,43 @@ var MockBuilder = class {
9556
10666
  this.matchers.push(matcher);
9557
10667
  return this;
9558
10668
  }
10669
+ // ==========================================================================
10670
+ // Multimodal Matchers
10671
+ // ==========================================================================
10672
+ /**
10673
+ * Match when any message contains an image.
10674
+ *
10675
+ * @example
10676
+ * mockLLM().whenMessageHasImage().returns("I see an image of a sunset.")
10677
+ */
10678
+ whenMessageHasImage() {
10679
+ this.matchers.push((ctx) => ctx.messages.some((msg) => hasImageContent(msg.content)));
10680
+ return this;
10681
+ }
10682
+ /**
10683
+ * Match when any message contains audio.
10684
+ *
10685
+ * @example
10686
+ * mockLLM().whenMessageHasAudio().returns("I hear music playing.")
10687
+ */
10688
+ whenMessageHasAudio() {
10689
+ this.matchers.push((ctx) => ctx.messages.some((msg) => hasAudioContent(msg.content)));
10690
+ return this;
10691
+ }
10692
+ /**
10693
+ * Match based on the number of images in the last message.
10694
+ *
10695
+ * @example
10696
+ * mockLLM().whenImageCount((n) => n >= 2).returns("Comparing multiple images...")
10697
+ */
10698
+ whenImageCount(predicate) {
10699
+ this.matchers.push((ctx) => {
10700
+ const lastMsg = ctx.messages[ctx.messages.length - 1];
10701
+ if (!lastMsg) return false;
10702
+ return predicate(countImages(lastMsg.content));
10703
+ });
10704
+ return this;
10705
+ }
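The three matchers compose with the existing builder chain. A short sketch of the registration side only; how MockProviderAdapter is wired into a client or agent under test is not shown in this hunk:

```typescript
// Sketch: stacking multimodal matchers on mock registrations.
import { mockLLM } from "llmist";

mockLLM()
  .whenMessageHasImage()
  .whenImageCount((n) => n >= 2)
  .returns("Comparing the attached images...")
  .register();

mockLLM()
  .whenMessageHasAudio()
  .returns("Transcription: (mock)")
  .register();
```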
9559
10706
  /**
9560
10707
  * Set the text response to return.
9561
10708
  * Can be a static string or a function that returns a string dynamically.
@@ -9565,17 +10712,17 @@ var MockBuilder = class {
9565
10712
  * mockLLM().returns(() => `Response at ${Date.now()}`)
9566
10713
  * mockLLM().returns((ctx) => `You said: ${ctx.messages[0]?.content}`)
9567
10714
  */
9568
- returns(text) {
9569
- if (typeof text === "function") {
10715
+ returns(text3) {
10716
+ if (typeof text3 === "function") {
9570
10717
  this.response = async (ctx) => {
9571
- const resolvedText = await Promise.resolve().then(() => text(ctx));
10718
+ const resolvedText = await Promise.resolve().then(() => text3(ctx));
9572
10719
  return { text: resolvedText };
9573
10720
  };
9574
10721
  } else {
9575
10722
  if (typeof this.response === "function") {
9576
10723
  throw new Error("Cannot use returns() after withResponse() with a function");
9577
10724
  }
9578
- this.response.text = text;
10725
+ this.response.text = text3;
9579
10726
  }
9580
10727
  return this;
9581
10728
  }
@@ -9612,6 +10759,112 @@ var MockBuilder = class {
9612
10759
  this.response.gadgetCalls.push({ gadgetName, parameters });
9613
10760
  return this;
9614
10761
  }
10762
+ // ==========================================================================
10763
+ // Multimodal Response Helpers
10764
+ // ==========================================================================
10765
+ /**
10766
+ * Return a single image in the response.
10767
+ * Useful for mocking image generation endpoints.
10768
+ *
10769
+ * @param data - Image data (base64 string or Buffer)
10770
+ * @param mimeType - MIME type (auto-detected if Buffer provided without type)
10771
+ *
10772
+ * @example
10773
+ * mockLLM()
10774
+ * .forModel('dall-e-3')
10775
+ * .returnsImage(pngBuffer)
10776
+ * .register();
10777
+ */
10778
+ returnsImage(data, mimeType) {
10779
+ if (typeof this.response === "function") {
10780
+ throw new Error("Cannot use returnsImage() after withResponse() with a function");
10781
+ }
10782
+ let imageData;
10783
+ let imageMime;
10784
+ if (typeof data === "string") {
10785
+ imageData = data;
10786
+ if (!mimeType) {
10787
+ throw new Error("MIME type is required when providing base64 string data");
10788
+ }
10789
+ imageMime = mimeType;
10790
+ } else {
10791
+ imageData = toBase64(data);
10792
+ const detected = mimeType ?? detectImageMimeType(data);
10793
+ if (!detected) {
10794
+ throw new Error(
10795
+ "Could not detect image MIME type. Please provide the mimeType parameter explicitly."
10796
+ );
10797
+ }
10798
+ imageMime = detected;
10799
+ }
10800
+ if (!this.response.images) {
10801
+ this.response.images = [];
10802
+ }
10803
+ this.response.images.push({ data: imageData, mimeType: imageMime });
10804
+ return this;
10805
+ }
10806
+ /**
10807
+ * Return multiple images in the response.
10808
+ *
10809
+ * @example
10810
+ * mockLLM()
10811
+ * .forModel('dall-e-3')
10812
+ * .returnsImages([
10813
+ * { data: pngBuffer1 },
10814
+ * { data: pngBuffer2 },
10815
+ * ])
10816
+ * .register();
10817
+ */
10818
+ returnsImages(images) {
10819
+ for (const img of images) {
10820
+ this.returnsImage(img.data, img.mimeType);
10821
+ if (img.revisedPrompt && this.response && typeof this.response !== "function") {
10822
+ const lastImage = this.response.images?.[this.response.images.length - 1];
10823
+ if (lastImage) {
10824
+ lastImage.revisedPrompt = img.revisedPrompt;
10825
+ }
10826
+ }
10827
+ }
10828
+ return this;
10829
+ }
10830
+ /**
10831
+ * Return audio data in the response.
10832
+ * Useful for mocking speech synthesis endpoints.
10833
+ *
10834
+ * @param data - Audio data (base64 string or Buffer)
10835
+ * @param mimeType - MIME type (auto-detected if Buffer provided without type)
10836
+ *
10837
+ * @example
10838
+ * mockLLM()
10839
+ * .forModel('tts-1')
10840
+ * .returnsAudio(mp3Buffer)
10841
+ * .register();
10842
+ */
10843
+ returnsAudio(data, mimeType) {
10844
+ if (typeof this.response === "function") {
10845
+ throw new Error("Cannot use returnsAudio() after withResponse() with a function");
10846
+ }
10847
+ let audioData;
10848
+ let audioMime;
10849
+ if (typeof data === "string") {
10850
+ audioData = data;
10851
+ if (!mimeType) {
10852
+ throw new Error("MIME type is required when providing base64 string data");
10853
+ }
10854
+ audioMime = mimeType;
10855
+ } else {
10856
+ audioData = toBase64(data);
10857
+ const detected = mimeType ?? detectAudioMimeType(data);
10858
+ if (!detected) {
10859
+ throw new Error(
10860
+ "Could not detect audio MIME type. Please provide the mimeType parameter explicitly."
10861
+ );
10862
+ }
10863
+ audioMime = detected;
10864
+ }
10865
+ this.response.audio = { data: audioData, mimeType: audioMime };
10866
+ return this;
10867
+ }
9615
10868
  /**
9616
10869
  * Set the complete mock response object.
9617
10870
  * This allows full control over all response properties.
@@ -9825,6 +11078,8 @@ var import_node_stream = require("stream");
9825
11078
  StreamParser,
9826
11079
  StreamProcessor,
9827
11080
  SummarizationStrategy,
11081
+ audioFromBase64,
11082
+ audioFromBuffer,
9828
11083
  collectEvents,
9829
11084
  collectText,
9830
11085
  complete,
@@ -9840,20 +11095,34 @@ var import_node_stream = require("stream");
9840
11095
  createOpenAIProviderFromEnv,
9841
11096
  createTextMockStream,
9842
11097
  defaultLogger,
11098
+ detectAudioMimeType,
11099
+ detectImageMimeType,
9843
11100
  discoverProviderAdapters,
11101
+ extractText,
9844
11102
  getMockManager,
9845
11103
  getModelId,
9846
11104
  getProvider,
9847
11105
  hasProviderPrefix,
11106
+ imageFromBase64,
11107
+ imageFromBuffer,
11108
+ imageFromUrl,
11109
+ isAudioPart,
11110
+ isDataUrl,
11111
+ isImagePart,
11112
+ isTextPart,
9848
11113
  iterationProgressHint,
9849
11114
  mockLLM,
11115
+ normalizeContent,
9850
11116
  parallelGadgetHint,
11117
+ parseDataUrl,
9851
11118
  resolveHintTemplate,
9852
11119
  resolveModel,
9853
11120
  resolvePromptTemplate,
9854
11121
  resolveRulesTemplate,
9855
11122
  runWithHandlers,
9856
11123
  stream,
11124
+ text,
11125
+ toBase64,
9857
11126
  validateAndApplyDefaults,
9858
11127
  validateGadgetParams,
9859
11128
  z