llmist 2.4.0 → 2.5.0

This diff shows the content of publicly available package versions released to one of the supported registries; it is provided for informational purposes only and reflects the changes between versions as they appear in those registries.
package/dist/index.cjs CHANGED
@@ -45,6 +45,158 @@ var init_constants = __esm({
45
45
  }
46
46
  });
47
47
 
48
+ // src/core/input-content.ts
49
+ function isTextPart(part) {
50
+ return part.type === "text";
51
+ }
52
+ function isImagePart(part) {
53
+ return part.type === "image";
54
+ }
55
+ function isAudioPart(part) {
56
+ return part.type === "audio";
57
+ }
58
+ function text(content) {
59
+ return { type: "text", text: content };
60
+ }
61
+ function imageFromBase64(data, mediaType) {
62
+ return {
63
+ type: "image",
64
+ source: { type: "base64", mediaType, data }
65
+ };
66
+ }
67
+ function imageFromUrl(url) {
68
+ return {
69
+ type: "image",
70
+ source: { type: "url", url }
71
+ };
72
+ }
73
+ function detectImageMimeType(data) {
74
+ const bytes = data instanceof Buffer ? data : Buffer.from(data);
75
+ for (const { bytes: magic, mimeType } of IMAGE_MAGIC_BYTES) {
76
+ if (bytes.length >= magic.length) {
77
+ let matches = true;
78
+ for (let i = 0; i < magic.length; i++) {
79
+ if (bytes[i] !== magic[i]) {
80
+ matches = false;
81
+ break;
82
+ }
83
+ }
84
+ if (matches) {
85
+ if (mimeType === "image/webp") {
86
+ if (bytes.length >= 12) {
87
+ const webpMarker = bytes[8] === 87 && bytes[9] === 69 && bytes[10] === 66 && bytes[11] === 80;
88
+ if (!webpMarker) continue;
89
+ }
90
+ }
91
+ return mimeType;
92
+ }
93
+ }
94
+ }
95
+ return null;
96
+ }
97
+ function detectAudioMimeType(data) {
98
+ const bytes = data instanceof Buffer ? data : Buffer.from(data);
99
+ for (const { bytes: magic, mimeType } of AUDIO_MAGIC_BYTES) {
100
+ if (bytes.length >= magic.length) {
101
+ let matches = true;
102
+ for (let i = 0; i < magic.length; i++) {
103
+ if (bytes[i] !== magic[i]) {
104
+ matches = false;
105
+ break;
106
+ }
107
+ }
108
+ if (matches) {
109
+ if (mimeType === "audio/wav") {
110
+ if (bytes.length >= 12) {
111
+ const waveMarker = bytes[8] === 87 && bytes[9] === 65 && bytes[10] === 86 && bytes[11] === 69;
112
+ if (!waveMarker) continue;
113
+ }
114
+ }
115
+ return mimeType;
116
+ }
117
+ }
118
+ }
119
+ return null;
120
+ }
121
+ function toBase64(data) {
122
+ if (typeof data === "string") {
123
+ return data;
124
+ }
125
+ return Buffer.from(data).toString("base64");
126
+ }
127
+ function imageFromBuffer(buffer, mediaType) {
128
+ const detectedType = mediaType ?? detectImageMimeType(buffer);
129
+ if (!detectedType) {
130
+ throw new Error(
131
+ "Could not detect image MIME type. Please provide the mediaType parameter explicitly."
132
+ );
133
+ }
134
+ return {
135
+ type: "image",
136
+ source: {
137
+ type: "base64",
138
+ mediaType: detectedType,
139
+ data: toBase64(buffer)
140
+ }
141
+ };
142
+ }
143
+ function audioFromBase64(data, mediaType) {
144
+ return {
145
+ type: "audio",
146
+ source: { type: "base64", mediaType, data }
147
+ };
148
+ }
149
+ function audioFromBuffer(buffer, mediaType) {
150
+ const detectedType = mediaType ?? detectAudioMimeType(buffer);
151
+ if (!detectedType) {
152
+ throw new Error(
153
+ "Could not detect audio MIME type. Please provide the mediaType parameter explicitly."
154
+ );
155
+ }
156
+ return {
157
+ type: "audio",
158
+ source: {
159
+ type: "base64",
160
+ mediaType: detectedType,
161
+ data: toBase64(buffer)
162
+ }
163
+ };
164
+ }
165
+ function isDataUrl(input) {
166
+ return input.startsWith("data:");
167
+ }
168
+ function parseDataUrl(url) {
169
+ const match = url.match(/^data:([^;]+);base64,(.+)$/);
170
+ if (!match) return null;
171
+ return { mimeType: match[1], data: match[2] };
172
+ }
173
+ var IMAGE_MAGIC_BYTES, AUDIO_MAGIC_BYTES;
174
+ var init_input_content = __esm({
175
+ "src/core/input-content.ts"() {
176
+ "use strict";
177
+ IMAGE_MAGIC_BYTES = [
178
+ { bytes: [255, 216, 255], mimeType: "image/jpeg" },
179
+ { bytes: [137, 80, 78, 71], mimeType: "image/png" },
180
+ { bytes: [71, 73, 70, 56], mimeType: "image/gif" },
181
+ // WebP starts with RIFF....WEBP
182
+ { bytes: [82, 73, 70, 70], mimeType: "image/webp" }
183
+ ];
184
+ AUDIO_MAGIC_BYTES = [
185
+ // MP3 frame sync
186
+ { bytes: [255, 251], mimeType: "audio/mp3" },
187
+ { bytes: [255, 250], mimeType: "audio/mp3" },
188
+ // ID3 tag (MP3)
189
+ { bytes: [73, 68, 51], mimeType: "audio/mp3" },
190
+ // OGG
191
+ { bytes: [79, 103, 103, 83], mimeType: "audio/ogg" },
192
+ // WAV (RIFF)
193
+ { bytes: [82, 73, 70, 70], mimeType: "audio/wav" },
194
+ // WebM
195
+ { bytes: [26, 69, 223, 163], mimeType: "audio/webm" }
196
+ ];
197
+ }
198
+ });
199
+
48
200
  // src/core/model-shortcuts.ts
49
201
  function isKnownModelPattern(model) {
50
202
  const normalized = model.toLowerCase();
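A minimal usage sketch for the content helpers added above (the imports assume the public exports listed later in this diff; the file path is a placeholder):

```typescript
import { readFile } from "node:fs/promises";
import { text, imageFromBuffer, imageFromUrl, detectImageMimeType } from "llmist";

const photo = await readFile("photo.jpg");           // placeholder path
console.log(detectImageMimeType(photo));             // e.g. "image/jpeg" when JPEG magic bytes match

const parts = [
  text("Compare these two images:"),
  imageFromBuffer(photo),                            // base64 source, MIME type auto-detected
  imageFromUrl("https://example.com/other.jpg"),     // URL source (provider support varies; see the provider hunks below)
];
console.log(parts.length);
```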
@@ -402,7 +554,9 @@ var init_prompt_config = __esm({
402
554
  rules: () => [
403
555
  "Output ONLY plain text with the exact markers - never use function/tool calling",
404
556
  "You can invoke multiple gadgets in a single response",
405
- "For dependent gadgets, invoke the first one and wait for the result"
557
+ "Gadgets without dependencies execute immediately (in parallel if multiple)",
558
+ "Use :invocation_id:dep1,dep2 syntax when a gadget needs results from prior gadgets",
559
+ "If any dependency fails, dependent gadgets are automatically skipped"
406
560
  ],
407
561
  customExamples: null
408
562
  };
@@ -410,11 +564,24 @@ var init_prompt_config = __esm({
410
564
  });
411
565
 
412
566
  // src/core/messages.ts
567
+ function normalizeContent(content) {
568
+ if (typeof content === "string") {
569
+ return [{ type: "text", text: content }];
570
+ }
571
+ return content;
572
+ }
573
+ function extractText(content) {
574
+ if (typeof content === "string") {
575
+ return content;
576
+ }
577
+ return content.filter((part) => part.type === "text").map((part) => part.text).join("");
578
+ }
413
579
  var LLMMessageBuilder;
414
580
  var init_messages = __esm({
415
581
  "src/core/messages.ts"() {
416
582
  "use strict";
417
583
  init_constants();
584
+ init_input_content();
418
585
  init_prompt_config();
419
586
  LLMMessageBuilder = class {
420
587
  messages = [];
@@ -516,6 +683,10 @@ CRITICAL: ${criticalUsage}
516
683
  parts.push(`
517
684
  1. Start marker: ${this.startPrefix}gadget_name`);
518
685
  parts.push(`
686
+ With ID: ${this.startPrefix}gadget_name:my_id`);
687
+ parts.push(`
688
+ With dependencies: ${this.startPrefix}gadget_name:my_id:dep1,dep2`);
689
+ parts.push(`
519
690
  2. ${formatDescription}`);
520
691
  parts.push(`
521
692
  3. End marker: ${this.endPrefix}`);
@@ -565,6 +736,25 @@ ${this.endPrefix}`;
565
736
  EXAMPLE (Multiple Gadgets):
566
737
 
567
738
  ${multipleExample}`);
739
+ const dependencyExample = `${this.startPrefix}fetch_data:fetch_1
740
+ ${this.argPrefix}url
741
+ https://api.example.com/users
742
+ ${this.endPrefix}
743
+ ${this.startPrefix}fetch_data:fetch_2
744
+ ${this.argPrefix}url
745
+ https://api.example.com/orders
746
+ ${this.endPrefix}
747
+ ${this.startPrefix}merge_data:merge_1:fetch_1,fetch_2
748
+ ${this.argPrefix}format
749
+ json
750
+ ${this.endPrefix}`;
751
+ parts.push(`
752
+
753
+ EXAMPLE (With Dependencies):
754
+ merge_1 waits for fetch_1 AND fetch_2 to complete.
755
+ If either fails, merge_1 is automatically skipped.
756
+
757
+ ${dependencyExample}`);
568
758
  parts.push(`
569
759
 
570
760
  BLOCK FORMAT SYNTAX:
@@ -615,6 +805,25 @@ Produces: { "items": ["first", "second"] }`);
615
805
  }
616
806
  return parts.join("");
617
807
  }
808
+ /**
809
+ * Add a user message.
810
+ * Content can be a string (text only) or an array of content parts (multimodal).
811
+ *
812
+ * @param content - Message content
813
+ * @param metadata - Optional metadata
814
+ *
815
+ * @example
816
+ * ```typescript
817
+ * // Text only
818
+ * builder.addUser("Hello!");
819
+ *
820
+ * // Multimodal
821
+ * builder.addUser([
822
+ * text("What's in this image?"),
823
+ * imageFromBuffer(imageData),
824
+ * ]);
825
+ * ```
826
+ */
618
827
  addUser(content, metadata) {
619
828
  this.messages.push({ role: "user", content, metadata });
620
829
  return this;
@@ -623,6 +832,104 @@ Produces: { "items": ["first", "second"] }`);
623
832
  this.messages.push({ role: "assistant", content, metadata });
624
833
  return this;
625
834
  }
835
+ /**
836
+ * Add a user message with an image attachment.
837
+ *
838
+ * @param textContent - Text prompt
839
+ * @param imageData - Image data (Buffer, Uint8Array, or base64 string)
840
+ * @param mimeType - Optional MIME type (auto-detected if not provided)
841
+ *
842
+ * @example
843
+ * ```typescript
844
+ * builder.addUserWithImage(
845
+ * "What's in this image?",
846
+ * await fs.readFile("photo.jpg"),
847
+ * "image/jpeg" // Optional - auto-detected
848
+ * );
849
+ * ```
850
+ */
851
+ addUserWithImage(textContent, imageData, mimeType) {
852
+ const imageBuffer = typeof imageData === "string" ? Buffer.from(imageData, "base64") : imageData;
853
+ const detectedMime = mimeType ?? detectImageMimeType(imageBuffer);
854
+ if (!detectedMime) {
855
+ throw new Error(
856
+ "Could not detect image MIME type. Please provide the mimeType parameter explicitly."
857
+ );
858
+ }
859
+ const content = [
860
+ text(textContent),
861
+ {
862
+ type: "image",
863
+ source: {
864
+ type: "base64",
865
+ mediaType: detectedMime,
866
+ data: toBase64(imageBuffer)
867
+ }
868
+ }
869
+ ];
870
+ this.messages.push({ role: "user", content });
871
+ return this;
872
+ }
873
+ /**
874
+ * Add a user message with an image URL (OpenAI only).
875
+ *
876
+ * @param textContent - Text prompt
877
+ * @param imageUrl - URL to the image
878
+ *
879
+ * @example
880
+ * ```typescript
881
+ * builder.addUserWithImageUrl(
882
+ * "What's in this image?",
883
+ * "https://example.com/image.jpg"
884
+ * );
885
+ * ```
886
+ */
887
+ addUserWithImageUrl(textContent, imageUrl) {
888
+ const content = [text(textContent), imageFromUrl(imageUrl)];
889
+ this.messages.push({ role: "user", content });
890
+ return this;
891
+ }
892
+ /**
893
+ * Add a user message with an audio attachment (Gemini only).
894
+ *
895
+ * @param textContent - Text prompt
896
+ * @param audioData - Audio data (Buffer, Uint8Array, or base64 string)
897
+ * @param mimeType - Optional MIME type (auto-detected if not provided)
898
+ *
899
+ * @example
900
+ * ```typescript
901
+ * builder.addUserWithAudio(
902
+ * "Transcribe this audio",
903
+ * await fs.readFile("recording.mp3"),
904
+ * "audio/mp3" // Optional - auto-detected
905
+ * );
906
+ * ```
907
+ */
908
+ addUserWithAudio(textContent, audioData, mimeType) {
909
+ const audioBuffer = typeof audioData === "string" ? Buffer.from(audioData, "base64") : audioData;
910
+ const content = [text(textContent), audioFromBuffer(audioBuffer, mimeType)];
911
+ this.messages.push({ role: "user", content });
912
+ return this;
913
+ }
914
+ /**
915
+ * Add a user message with multiple content parts.
916
+ * Provides full flexibility for complex multimodal messages.
917
+ *
918
+ * @param parts - Array of content parts
919
+ *
920
+ * @example
921
+ * ```typescript
922
+ * builder.addUserMultimodal([
923
+ * text("Compare these images:"),
924
+ * imageFromBuffer(image1),
925
+ * imageFromBuffer(image2),
926
+ * ]);
927
+ * ```
928
+ */
929
+ addUserMultimodal(parts) {
930
+ this.messages.push({ role: "user", content: parts });
931
+ return this;
932
+ }
626
933
  addGadgetCall(gadget, parameters, result) {
627
934
  const paramStr = this.formatBlockParameters(parameters, "");
628
935
  this.messages.push({
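A hedged sketch of the new multimodal builder methods above (the `LLMMessageBuilder` export and file paths are assumptions for illustration):

```typescript
import { readFile } from "node:fs/promises";
import { LLMMessageBuilder, text, imageFromBuffer } from "llmist"; // LLMMessageBuilder export assumed

const builder = new LLMMessageBuilder()
  .addUserWithImage("What's in this image?", await readFile("photo.jpg"))     // MIME type auto-detected
  .addUserWithImageUrl("Describe this one", "https://example.com/image.jpg")  // image URLs: OpenAI only
  .addUserWithAudio("Transcribe this clip", await readFile("recording.mp3"))  // audio input: Gemini only
  .addUserMultimodal([text("Compare:"), imageFromBuffer(await readFile("chart.png"))]);
```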
@@ -1941,7 +2248,7 @@ var init_conversation_manager = __esm({
1941
2248
  if (msg.role === "user") {
1942
2249
  this.historyBuilder.addUser(msg.content);
1943
2250
  } else if (msg.role === "assistant") {
1944
- this.historyBuilder.addAssistant(msg.content);
2251
+ this.historyBuilder.addAssistant(extractText(msg.content));
1945
2252
  }
1946
2253
  }
1947
2254
  }
@@ -1962,8 +2269,10 @@ async function runWithHandlers(agentGenerator, handlers) {
1962
2269
  if (handlers.onGadgetCall) {
1963
2270
  await handlers.onGadgetCall({
1964
2271
  gadgetName: event.call.gadgetName,
2272
+ invocationId: event.call.invocationId,
1965
2273
  parameters: event.call.parameters,
1966
- parametersRaw: event.call.parametersRaw
2274
+ parametersRaw: event.call.parametersRaw,
2275
+ dependencies: event.call.dependencies
1967
2276
  });
1968
2277
  }
1969
2278
  break;
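An illustrative handler for the extended onGadgetCall payload shown above (`agent` is assumed to come from the agent builder shown later in this diff):

```typescript
import { runWithHandlers } from "llmist";

await runWithHandlers(agent.run(), {
  onGadgetCall: async ({ gadgetName, invocationId, parameters, dependencies }) => {
    console.log(`${invocationId} -> ${gadgetName}`, parameters);
    if (dependencies.length > 0) {
      console.log(`  waiting on: ${dependencies.join(", ")}`); // new in 2.5.0
    }
  },
});
```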
@@ -2840,15 +3149,37 @@ var init_parser = __esm({
2840
3149
  return segment.trim().length > 0 ? segment : void 0;
2841
3150
  }
2842
3151
  /**
2843
- * Parse gadget name, handling both old format (name:invocationId) and new format (just name).
2844
- * For new format, generates a unique invocation ID.
3152
+ * Parse gadget name with optional invocation ID and dependencies.
3153
+ *
3154
+ * Supported formats:
3155
+ * - `GadgetName` - Auto-generate ID, no dependencies
3156
+ * - `GadgetName:my_id` - Explicit ID, no dependencies
3157
+ * - `GadgetName:my_id:dep1,dep2` - Explicit ID with dependencies
3158
+ *
3159
+ * Dependencies must be comma-separated invocation IDs.
2845
3160
  */
2846
3161
  parseGadgetName(gadgetName) {
2847
- if (gadgetName.includes(":")) {
2848
- const parts = gadgetName.split(":");
2849
- return { actualName: parts[0], invocationId: parts[1] };
3162
+ const parts = gadgetName.split(":");
3163
+ if (parts.length === 1) {
3164
+ return {
3165
+ actualName: parts[0],
3166
+ invocationId: `gadget_${++globalInvocationCounter}`,
3167
+ dependencies: []
3168
+ };
3169
+ } else if (parts.length === 2) {
3170
+ return {
3171
+ actualName: parts[0],
3172
+ invocationId: parts[1].trim(),
3173
+ dependencies: []
3174
+ };
3175
+ } else {
3176
+ const deps = parts[2].split(",").map((d) => d.trim()).filter((d) => d.length > 0);
3177
+ return {
3178
+ actualName: parts[0],
3179
+ invocationId: parts[1].trim(),
3180
+ dependencies: deps
3181
+ };
2850
3182
  }
2851
- return { actualName: gadgetName, invocationId: `gadget_${++globalInvocationCounter}` };
2852
3183
  }
2853
3184
  /**
2854
3185
  * Extract the error message from a parse error.
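The three header formats documented above map to parsed fields as follows (gadget names and IDs are placeholders):

```typescript
const parsedExamples = {
  "FetchData":                         { actualName: "FetchData", invocationId: "gadget_1", dependencies: [] }, // ID auto-generated
  "FetchData:fetch_1":                 { actualName: "FetchData", invocationId: "fetch_1", dependencies: [] },
  "MergeData:merge_1:fetch_1,fetch_2": { actualName: "MergeData", invocationId: "merge_1", dependencies: ["fetch_1", "fetch_2"] },
};
```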
@@ -2884,39 +3215,20 @@ var init_parser = __esm({
2884
3215
  const metadataEndIndex = this.buffer.indexOf("\n", metadataStartIndex);
2885
3216
  if (metadataEndIndex === -1) break;
2886
3217
  const gadgetName = this.buffer.substring(metadataStartIndex, metadataEndIndex).trim();
2887
- const { actualName: actualGadgetName, invocationId } = this.parseGadgetName(gadgetName);
3218
+ const { actualName: actualGadgetName, invocationId, dependencies } = this.parseGadgetName(gadgetName);
2888
3219
  const contentStartIndex = metadataEndIndex + 1;
2889
3220
  let partEndIndex;
2890
3221
  let endMarkerLength = 0;
2891
- if (gadgetName.includes(":")) {
2892
- const oldEndMarker = `${this.endPrefix + actualGadgetName}:${invocationId}`;
2893
- partEndIndex = this.buffer.indexOf(oldEndMarker, contentStartIndex);
2894
- if (partEndIndex === -1) break;
2895
- endMarkerLength = oldEndMarker.length;
3222
+ const nextStartPos = this.buffer.indexOf(this.startPrefix, contentStartIndex);
3223
+ const endPos = this.buffer.indexOf(this.endPrefix, contentStartIndex);
3224
+ if (nextStartPos !== -1 && (endPos === -1 || nextStartPos < endPos)) {
3225
+ partEndIndex = nextStartPos;
3226
+ endMarkerLength = 0;
3227
+ } else if (endPos !== -1) {
3228
+ partEndIndex = endPos;
3229
+ endMarkerLength = this.endPrefix.length;
2896
3230
  } else {
2897
- const nextStartPos = this.buffer.indexOf(this.startPrefix, contentStartIndex);
2898
- let validEndPos = -1;
2899
- let searchPos = contentStartIndex;
2900
- while (true) {
2901
- const endPos = this.buffer.indexOf(this.endPrefix, searchPos);
2902
- if (endPos === -1) break;
2903
- const afterEnd = this.buffer.substring(endPos + this.endPrefix.length);
2904
- if (afterEnd.startsWith("\n") || afterEnd.startsWith("\r") || afterEnd.startsWith(this.startPrefix) || afterEnd.length === 0) {
2905
- validEndPos = endPos;
2906
- break;
2907
- } else {
2908
- searchPos = endPos + this.endPrefix.length;
2909
- }
2910
- }
2911
- if (nextStartPos !== -1 && (validEndPos === -1 || nextStartPos < validEndPos)) {
2912
- partEndIndex = nextStartPos;
2913
- endMarkerLength = 0;
2914
- } else if (validEndPos !== -1) {
2915
- partEndIndex = validEndPos;
2916
- endMarkerLength = this.endPrefix.length;
2917
- } else {
2918
- break;
2919
- }
3231
+ break;
2920
3232
  }
2921
3233
  const parametersRaw = this.buffer.substring(contentStartIndex, partEndIndex).trim();
2922
3234
  const { parameters, parseError } = this.parseParameters(parametersRaw);
@@ -2927,7 +3239,8 @@ var init_parser = __esm({
2927
3239
  invocationId,
2928
3240
  parametersRaw,
2929
3241
  parameters,
2930
- parseError
3242
+ parseError,
3243
+ dependencies
2931
3244
  }
2932
3245
  };
2933
3246
  startIndex = partEndIndex + endMarkerLength;
@@ -2950,7 +3263,7 @@ var init_parser = __esm({
2950
3263
  const metadataEndIndex = this.buffer.indexOf("\n", metadataStartIndex);
2951
3264
  if (metadataEndIndex !== -1) {
2952
3265
  const gadgetName = this.buffer.substring(metadataStartIndex, metadataEndIndex).trim();
2953
- const { actualName: actualGadgetName, invocationId } = this.parseGadgetName(gadgetName);
3266
+ const { actualName: actualGadgetName, invocationId, dependencies } = this.parseGadgetName(gadgetName);
2954
3267
  const contentStartIndex = metadataEndIndex + 1;
2955
3268
  const parametersRaw = this.buffer.substring(contentStartIndex).trim();
2956
3269
  const { parameters, parseError } = this.parseParameters(parametersRaw);
@@ -2961,7 +3274,8 @@ var init_parser = __esm({
2961
3274
  invocationId,
2962
3275
  parametersRaw,
2963
3276
  parameters,
2964
- parseError
3277
+ parseError,
3278
+ dependencies
2965
3279
  }
2966
3280
  };
2967
3281
  return;
@@ -3331,6 +3645,13 @@ var init_stream_processor = __esm({
3331
3645
  accumulatedText = "";
3332
3646
  shouldStopExecution = false;
3333
3647
  observerFailureCount = 0;
3648
+ // Dependency tracking for gadget execution DAG
3649
+ /** Gadgets waiting for their dependencies to complete */
3650
+ pendingGadgets = /* @__PURE__ */ new Map();
3651
+ /** Completed gadget results, keyed by invocation ID */
3652
+ completedResults = /* @__PURE__ */ new Map();
3653
+ /** Invocation IDs of gadgets that have failed (error or skipped due to dependency) */
3654
+ failedInvocations = /* @__PURE__ */ new Set();
3334
3655
  constructor(options) {
3335
3656
  this.iteration = options.iteration;
3336
3657
  this.registry = options.registry;
@@ -3431,6 +3752,16 @@ var init_stream_processor = __esm({
3431
3752
  }
3432
3753
  }
3433
3754
  }
3755
+ const finalPendingEvents = await this.processPendingGadgets();
3756
+ outputs.push(...finalPendingEvents);
3757
+ if (finalPendingEvents.some((e) => e.type === "gadget_result")) {
3758
+ didExecuteGadgets = true;
3759
+ }
3760
+ for (const evt of finalPendingEvents) {
3761
+ if (evt.type === "gadget_result" && evt.result.breaksLoop) {
3762
+ shouldBreakLoop = true;
3763
+ }
3764
+ }
3434
3765
  }
3435
3766
  let finalMessage = this.accumulatedText;
3436
3767
  if (this.hooks.interceptors?.interceptAssistantMessage) {
@@ -3482,7 +3813,11 @@ var init_stream_processor = __esm({
3482
3813
  return [{ type: "text", content }];
3483
3814
  }
3484
3815
  /**
3485
- * Process a gadget call through the full lifecycle.
3816
+ * Process a gadget call through the full lifecycle, handling dependencies.
3817
+ *
3818
+ * Gadgets without dependencies (or with all dependencies satisfied) execute immediately.
3819
+ * Gadgets with unsatisfied dependencies are queued for later execution.
3820
+ * After each execution, pending gadgets are checked to see if they can now run.
3486
3821
  */
3487
3822
  async processGadgetCall(call) {
3488
3823
  if (this.shouldStopExecution) {
@@ -3493,6 +3828,53 @@ var init_stream_processor = __esm({
3493
3828
  }
3494
3829
  const events = [];
3495
3830
  events.push({ type: "gadget_call", call });
3831
+ if (call.dependencies.length > 0) {
3832
+ if (call.dependencies.includes(call.invocationId)) {
3833
+ this.logger.warn("Gadget has self-referential dependency (depends on itself)", {
3834
+ gadgetName: call.gadgetName,
3835
+ invocationId: call.invocationId
3836
+ });
3837
+ this.failedInvocations.add(call.invocationId);
3838
+ const skipEvent = {
3839
+ type: "gadget_skipped",
3840
+ gadgetName: call.gadgetName,
3841
+ invocationId: call.invocationId,
3842
+ parameters: call.parameters ?? {},
3843
+ failedDependency: call.invocationId,
3844
+ failedDependencyError: `Gadget "${call.invocationId}" cannot depend on itself (self-referential dependency)`
3845
+ };
3846
+ events.push(skipEvent);
3847
+ return events;
3848
+ }
3849
+ const failedDep = call.dependencies.find((dep) => this.failedInvocations.has(dep));
3850
+ if (failedDep) {
3851
+ const skipEvents = await this.handleFailedDependency(call, failedDep);
3852
+ events.push(...skipEvents);
3853
+ return events;
3854
+ }
3855
+ const unsatisfied = call.dependencies.filter((dep) => !this.completedResults.has(dep));
3856
+ if (unsatisfied.length > 0) {
3857
+ this.logger.debug("Queueing gadget for later - waiting on dependencies", {
3858
+ gadgetName: call.gadgetName,
3859
+ invocationId: call.invocationId,
3860
+ waitingOn: unsatisfied
3861
+ });
3862
+ this.pendingGadgets.set(call.invocationId, call);
3863
+ return events;
3864
+ }
3865
+ }
3866
+ const executeEvents = await this.executeGadgetWithHooks(call);
3867
+ events.push(...executeEvents);
3868
+ const triggeredEvents = await this.processPendingGadgets();
3869
+ events.push(...triggeredEvents);
3870
+ return events;
3871
+ }
3872
+ /**
3873
+ * Execute a gadget through the full hook lifecycle.
3874
+ * This is the core execution logic, extracted from processGadgetCall.
3875
+ */
3876
+ async executeGadgetWithHooks(call) {
3877
+ const events = [];
3496
3878
  if (call.parseError) {
3497
3879
  this.logger.warn("Gadget has parse error", {
3498
3880
  gadgetName: call.gadgetName,
@@ -3623,6 +4005,10 @@ var init_stream_processor = __esm({
3623
4005
  });
3624
4006
  }
3625
4007
  await this.runObserversInParallel(completeObservers);
4008
+ this.completedResults.set(result.invocationId, result);
4009
+ if (result.error) {
4010
+ this.failedInvocations.add(result.invocationId);
4011
+ }
3626
4012
  events.push({ type: "gadget_result", result });
3627
4013
  if (result.error) {
3628
4014
  const errorType = this.determineErrorType(call, result);
@@ -3638,6 +4024,162 @@ var init_stream_processor = __esm({
3638
4024
  }
3639
4025
  return events;
3640
4026
  }
4027
+ /**
4028
+ * Handle a gadget that cannot execute because a dependency failed.
4029
+ * Calls the onDependencySkipped controller to allow customization.
4030
+ */
4031
+ async handleFailedDependency(call, failedDep) {
4032
+ const events = [];
4033
+ const depResult = this.completedResults.get(failedDep);
4034
+ const depError = depResult?.error ?? "Dependency failed";
4035
+ let action = { action: "skip" };
4036
+ if (this.hooks.controllers?.onDependencySkipped) {
4037
+ const context = {
4038
+ iteration: this.iteration,
4039
+ gadgetName: call.gadgetName,
4040
+ invocationId: call.invocationId,
4041
+ parameters: call.parameters ?? {},
4042
+ failedDependency: failedDep,
4043
+ failedDependencyError: depError,
4044
+ logger: this.logger
4045
+ };
4046
+ action = await this.hooks.controllers.onDependencySkipped(context);
4047
+ }
4048
+ if (action.action === "skip") {
4049
+ this.failedInvocations.add(call.invocationId);
4050
+ const skipEvent = {
4051
+ type: "gadget_skipped",
4052
+ gadgetName: call.gadgetName,
4053
+ invocationId: call.invocationId,
4054
+ parameters: call.parameters ?? {},
4055
+ failedDependency: failedDep,
4056
+ failedDependencyError: depError
4057
+ };
4058
+ events.push(skipEvent);
4059
+ if (this.hooks.observers?.onGadgetSkipped) {
4060
+ const observeContext = {
4061
+ iteration: this.iteration,
4062
+ gadgetName: call.gadgetName,
4063
+ invocationId: call.invocationId,
4064
+ parameters: call.parameters ?? {},
4065
+ failedDependency: failedDep,
4066
+ failedDependencyError: depError,
4067
+ logger: this.logger
4068
+ };
4069
+ await this.safeObserve(() => this.hooks.observers.onGadgetSkipped(observeContext));
4070
+ }
4071
+ this.logger.info("Gadget skipped due to failed dependency", {
4072
+ gadgetName: call.gadgetName,
4073
+ invocationId: call.invocationId,
4074
+ failedDependency: failedDep
4075
+ });
4076
+ } else if (action.action === "execute_anyway") {
4077
+ this.logger.info("Executing gadget despite failed dependency (controller override)", {
4078
+ gadgetName: call.gadgetName,
4079
+ invocationId: call.invocationId,
4080
+ failedDependency: failedDep
4081
+ });
4082
+ const executeEvents = await this.executeGadgetWithHooks(call);
4083
+ events.push(...executeEvents);
4084
+ } else if (action.action === "use_fallback") {
4085
+ const fallbackResult = {
4086
+ gadgetName: call.gadgetName,
4087
+ invocationId: call.invocationId,
4088
+ parameters: call.parameters ?? {},
4089
+ result: action.fallbackResult,
4090
+ executionTimeMs: 0
4091
+ };
4092
+ this.completedResults.set(call.invocationId, fallbackResult);
4093
+ events.push({ type: "gadget_result", result: fallbackResult });
4094
+ this.logger.info("Using fallback result for gadget with failed dependency", {
4095
+ gadgetName: call.gadgetName,
4096
+ invocationId: call.invocationId,
4097
+ failedDependency: failedDep
4098
+ });
4099
+ }
4100
+ return events;
4101
+ }
4102
+ /**
4103
+ * Process pending gadgets whose dependencies are now satisfied.
4104
+ * Executes ready gadgets in parallel and continues until no more can be triggered.
4105
+ */
4106
+ async processPendingGadgets() {
4107
+ const events = [];
4108
+ let progress = true;
4109
+ while (progress && this.pendingGadgets.size > 0) {
4110
+ progress = false;
4111
+ const readyToExecute = [];
4112
+ const readyToSkip = [];
4113
+ for (const [invocationId, call] of this.pendingGadgets) {
4114
+ const failedDep = call.dependencies.find((dep) => this.failedInvocations.has(dep));
4115
+ if (failedDep) {
4116
+ readyToSkip.push({ call, failedDep });
4117
+ continue;
4118
+ }
4119
+ const allSatisfied = call.dependencies.every((dep) => this.completedResults.has(dep));
4120
+ if (allSatisfied) {
4121
+ readyToExecute.push(call);
4122
+ }
4123
+ }
4124
+ for (const { call, failedDep } of readyToSkip) {
4125
+ this.pendingGadgets.delete(call.invocationId);
4126
+ const skipEvents = await this.handleFailedDependency(call, failedDep);
4127
+ events.push(...skipEvents);
4128
+ progress = true;
4129
+ }
4130
+ if (readyToExecute.length > 0) {
4131
+ this.logger.debug("Executing ready gadgets in parallel", {
4132
+ count: readyToExecute.length,
4133
+ invocationIds: readyToExecute.map((c) => c.invocationId)
4134
+ });
4135
+ for (const call of readyToExecute) {
4136
+ this.pendingGadgets.delete(call.invocationId);
4137
+ }
4138
+ const executePromises = readyToExecute.map((call) => this.executeGadgetWithHooks(call));
4139
+ const results = await Promise.all(executePromises);
4140
+ for (const executeEvents of results) {
4141
+ events.push(...executeEvents);
4142
+ }
4143
+ progress = true;
4144
+ }
4145
+ }
4146
+ if (this.pendingGadgets.size > 0) {
4147
+ const pendingIds = new Set(this.pendingGadgets.keys());
4148
+ for (const [invocationId, call] of this.pendingGadgets) {
4149
+ const missingDeps = call.dependencies.filter((dep) => !this.completedResults.has(dep));
4150
+ const circularDeps = missingDeps.filter((dep) => pendingIds.has(dep));
4151
+ const trulyMissingDeps = missingDeps.filter((dep) => !pendingIds.has(dep));
4152
+ let errorMessage;
4153
+ let logLevel = "warn";
4154
+ if (circularDeps.length > 0 && trulyMissingDeps.length > 0) {
4155
+ errorMessage = `Dependencies unresolvable: circular=[${circularDeps.join(", ")}], missing=[${trulyMissingDeps.join(", ")}]`;
4156
+ logLevel = "error";
4157
+ } else if (circularDeps.length > 0) {
4158
+ errorMessage = `Circular dependency detected: "${invocationId}" depends on "${circularDeps[0]}" which also depends on "${invocationId}" (directly or indirectly)`;
4159
+ } else {
4160
+ errorMessage = `Dependency "${missingDeps[0]}" was never executed - check that the invocation ID exists and is spelled correctly`;
4161
+ }
4162
+ this.logger[logLevel]("Gadget has unresolvable dependencies", {
4163
+ gadgetName: call.gadgetName,
4164
+ invocationId,
4165
+ circularDependencies: circularDeps,
4166
+ missingDependencies: trulyMissingDeps
4167
+ });
4168
+ this.failedInvocations.add(invocationId);
4169
+ const skipEvent = {
4170
+ type: "gadget_skipped",
4171
+ gadgetName: call.gadgetName,
4172
+ invocationId,
4173
+ parameters: call.parameters ?? {},
4174
+ failedDependency: missingDeps[0],
4175
+ failedDependencyError: errorMessage
4176
+ };
4177
+ events.push(skipEvent);
4178
+ }
4179
+ this.pendingGadgets.clear();
4180
+ }
4181
+ return events;
4182
+ }
3641
4183
  /**
3642
4184
  * Safely execute an observer, catching and logging any errors.
3643
4185
  * Observers are non-critical, so errors are logged but don't crash the system.
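A sketch of the onDependencySkipped controller contract implied by handleFailedDependency above (the context type is reconstructed from this hunk and registration of the controller is not shown here, so treat the wiring as an assumption):

```typescript
type DependencySkippedContext = {
  gadgetName: string;
  invocationId: string;
  failedDependency: string;
  failedDependencyError: string;
  logger: { warn(msg: string, meta?: unknown): void };
};

// The return value decides what happens to a gadget whose dependency failed.
const onDependencySkipped = async (ctx: DependencySkippedContext) => {
  ctx.logger.warn(
    `${ctx.gadgetName}/${ctx.invocationId} lost dependency ${ctx.failedDependency}: ${ctx.failedDependencyError}`
  );
  if (ctx.gadgetName === "merge_data") {
    return { action: "use_fallback", fallbackResult: "[]" }; // recorded as a completed result
  }
  if (ctx.invocationId.startsWith("optional_")) {
    return { action: "execute_anyway" };                     // run despite the failed dependency
  }
  return { action: "skip" };                                  // default: emits a gadget_skipped event
};
```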
@@ -4075,9 +4617,9 @@ var init_agent = __esm({
4075
4617
  if (msg.role === "user") {
4076
4618
  this.conversation.addUserMessage(msg.content);
4077
4619
  } else if (msg.role === "assistant") {
4078
- this.conversation.addAssistantMessage(msg.content);
4620
+ this.conversation.addAssistantMessage(extractText(msg.content));
4079
4621
  } else if (msg.role === "system") {
4080
- this.conversation.addUserMessage(`[System] ${msg.content}`);
4622
+ this.conversation.addUserMessage(`[System] ${extractText(msg.content)}`);
4081
4623
  }
4082
4624
  }
4083
4625
  }
@@ -4656,6 +5198,7 @@ var init_anthropic = __esm({
4656
5198
  "src/providers/anthropic.ts"() {
4657
5199
  "use strict";
4658
5200
  import_sdk = __toESM(require("@anthropic-ai/sdk"), 1);
5201
+ init_messages();
4659
5202
  init_anthropic_models();
4660
5203
  init_base_provider();
4661
5204
  init_constants2();
@@ -4694,7 +5237,7 @@ var init_anthropic = __esm({
4694
5237
  const systemMessages = messages.filter((message) => message.role === "system");
4695
5238
  const system = systemMessages.length > 0 ? systemMessages.map((m, index) => ({
4696
5239
  type: "text",
4697
- text: m.content,
5240
+ text: extractText(m.content),
4698
5241
  // Add cache_control to the LAST system message block
4699
5242
  ...index === systemMessages.length - 1 ? { cache_control: { type: "ephemeral" } } : {}
4700
5243
  })) : void 0;
@@ -4707,14 +5250,10 @@ var init_anthropic = __esm({
4707
5250
  );
4708
5251
  const conversation = nonSystemMessages.map((message, index) => ({
4709
5252
  role: message.role,
4710
- content: [
4711
- {
4712
- type: "text",
4713
- text: message.content,
4714
- // Add cache_control to the LAST user message
4715
- ...message.role === "user" && index === lastUserIndex ? { cache_control: { type: "ephemeral" } } : {}
4716
- }
4717
- ]
5253
+ content: this.convertToAnthropicContent(
5254
+ message.content,
5255
+ message.role === "user" && index === lastUserIndex
5256
+ )
4718
5257
  }));
4719
5258
  const defaultMaxTokens = spec?.maxOutputTokens ?? ANTHROPIC_DEFAULT_MAX_OUTPUT_TOKENS;
4720
5259
  const payload = {
@@ -4730,6 +5269,52 @@ var init_anthropic = __esm({
4730
5269
  };
4731
5270
  return payload;
4732
5271
  }
5272
+ /**
5273
+ * Convert llmist content to Anthropic's content block format.
5274
+ * Handles text, images (base64 only), and applies cache_control.
5275
+ */
5276
+ convertToAnthropicContent(content, addCacheControl) {
5277
+ const parts = normalizeContent(content);
5278
+ return parts.map((part, index) => {
5279
+ const isLastPart = index === parts.length - 1;
5280
+ const cacheControl = addCacheControl && isLastPart ? { cache_control: { type: "ephemeral" } } : {};
5281
+ if (part.type === "text") {
5282
+ return {
5283
+ type: "text",
5284
+ text: part.text,
5285
+ ...cacheControl
5286
+ };
5287
+ }
5288
+ if (part.type === "image") {
5289
+ return this.convertImagePart(part, cacheControl);
5290
+ }
5291
+ if (part.type === "audio") {
5292
+ throw new Error(
5293
+ "Anthropic does not support audio input. Use Google Gemini for audio processing."
5294
+ );
5295
+ }
5296
+ throw new Error(`Unsupported content type: ${part.type}`);
5297
+ });
5298
+ }
5299
+ /**
5300
+ * Convert an image content part to Anthropic's image block format.
5301
+ */
5302
+ convertImagePart(part, cacheControl) {
5303
+ if (part.source.type === "url") {
5304
+ throw new Error(
5305
+ "Anthropic does not support image URLs. Please provide base64-encoded image data instead."
5306
+ );
5307
+ }
5308
+ return {
5309
+ type: "image",
5310
+ source: {
5311
+ type: "base64",
5312
+ media_type: part.source.mediaType,
5313
+ data: part.source.data
5314
+ },
5315
+ ...cacheControl
5316
+ };
5317
+ }
4733
5318
  async executeStreamRequest(payload, signal) {
4734
5319
  const client = this.client;
4735
5320
  const stream2 = await client.messages.create(payload, signal ? { signal } : void 0);
@@ -4812,17 +5397,12 @@ var init_anthropic = __esm({
4812
5397
  async countTokens(messages, descriptor, _spec) {
4813
5398
  const client = this.client;
4814
5399
  const systemMessages = messages.filter((message) => message.role === "system");
4815
- const system = systemMessages.length > 0 ? systemMessages.map((m) => m.content).join("\n\n") : void 0;
5400
+ const system = systemMessages.length > 0 ? systemMessages.map((m) => extractText(m.content)).join("\n\n") : void 0;
4816
5401
  const conversation = messages.filter(
4817
5402
  (message) => message.role !== "system"
4818
5403
  ).map((message) => ({
4819
5404
  role: message.role,
4820
- content: [
4821
- {
4822
- type: "text",
4823
- text: message.content
4824
- }
4825
- ]
5405
+ content: this.convertToAnthropicContent(message.content, false)
4826
5406
  }));
4827
5407
  try {
4828
5408
  const response = await client.messages.countTokens({
@@ -4836,8 +5416,19 @@ var init_anthropic = __esm({
4836
5416
  `Token counting failed for ${descriptor.name}, using fallback estimation:`,
4837
5417
  error
4838
5418
  );
4839
- const totalChars = messages.reduce((sum, msg) => sum + (msg.content?.length ?? 0), 0);
4840
- return Math.ceil(totalChars / FALLBACK_CHARS_PER_TOKEN);
5419
+ let totalChars = 0;
5420
+ let imageCount = 0;
5421
+ for (const msg of messages) {
5422
+ const parts = normalizeContent(msg.content);
5423
+ for (const part of parts) {
5424
+ if (part.type === "text") {
5425
+ totalChars += part.text.length;
5426
+ } else if (part.type === "image") {
5427
+ imageCount++;
5428
+ }
5429
+ }
5430
+ }
5431
+ return Math.ceil(totalChars / FALLBACK_CHARS_PER_TOKEN) + imageCount * 1e3;
4841
5432
  }
4842
5433
  }
4843
5434
  };
@@ -5366,6 +5957,7 @@ var init_gemini = __esm({
5366
5957
  "src/providers/gemini.ts"() {
5367
5958
  "use strict";
5368
5959
  import_genai = require("@google/genai");
5960
+ init_messages();
5369
5961
  init_base_provider();
5370
5962
  init_constants2();
5371
5963
  init_gemini_image_models();
@@ -5535,7 +6127,7 @@ var init_gemini = __esm({
5535
6127
  };
5536
6128
  return {
5537
6129
  model: descriptor.name,
5538
- contents: this.convertContentsForNewSDK(contents),
6130
+ contents,
5539
6131
  config
5540
6132
  };
5541
6133
  }
@@ -5570,18 +6162,25 @@ var init_gemini = __esm({
5570
6162
  if (message.role === "system") {
5571
6163
  expandedMessages.push({
5572
6164
  role: "user",
5573
- content: message.content
6165
+ content: extractText(message.content)
5574
6166
  });
5575
6167
  expandedMessages.push({
5576
6168
  role: "assistant",
5577
6169
  content: "Understood."
5578
6170
  });
5579
6171
  } else {
5580
- expandedMessages.push(message);
6172
+ expandedMessages.push({
6173
+ role: message.role,
6174
+ content: message.content
6175
+ });
5581
6176
  }
5582
6177
  }
5583
6178
  return this.mergeConsecutiveMessages(expandedMessages);
5584
6179
  }
6180
+ /**
6181
+ * Merge consecutive messages with the same role (required by Gemini).
6182
+ * Handles multimodal content by converting to Gemini's part format.
6183
+ */
5585
6184
  mergeConsecutiveMessages(messages) {
5586
6185
  if (messages.length === 0) {
5587
6186
  return [];
@@ -5590,15 +6189,16 @@ var init_gemini = __esm({
5590
6189
  let currentGroup = null;
5591
6190
  for (const message of messages) {
5592
6191
  const geminiRole = GEMINI_ROLE_MAP[message.role];
6192
+ const geminiParts = this.convertToGeminiParts(message.content);
5593
6193
  if (currentGroup && currentGroup.role === geminiRole) {
5594
- currentGroup.parts.push({ text: message.content });
6194
+ currentGroup.parts.push(...geminiParts);
5595
6195
  } else {
5596
6196
  if (currentGroup) {
5597
6197
  result.push(currentGroup);
5598
6198
  }
5599
6199
  currentGroup = {
5600
6200
  role: geminiRole,
5601
- parts: [{ text: message.content }]
6201
+ parts: geminiParts
5602
6202
  };
5603
6203
  }
5604
6204
  }
@@ -5607,11 +6207,39 @@ var init_gemini = __esm({
5607
6207
  }
5608
6208
  return result;
5609
6209
  }
5610
- convertContentsForNewSDK(contents) {
5611
- return contents.map((content) => ({
5612
- role: content.role,
5613
- parts: content.parts.map((part) => ({ text: part.text }))
5614
- }));
6210
+ /**
6211
+ * Convert llmist content to Gemini's part format.
6212
+ * Handles text, images, and audio (Gemini supports all three).
6213
+ */
6214
+ convertToGeminiParts(content) {
6215
+ const parts = normalizeContent(content);
6216
+ return parts.map((part) => {
6217
+ if (part.type === "text") {
6218
+ return { text: part.text };
6219
+ }
6220
+ if (part.type === "image") {
6221
+ if (part.source.type === "url") {
6222
+ throw new Error(
6223
+ "Gemini does not support image URLs directly. Please provide base64-encoded image data."
6224
+ );
6225
+ }
6226
+ return {
6227
+ inlineData: {
6228
+ mimeType: part.source.mediaType,
6229
+ data: part.source.data
6230
+ }
6231
+ };
6232
+ }
6233
+ if (part.type === "audio") {
6234
+ return {
6235
+ inlineData: {
6236
+ mimeType: part.source.mediaType,
6237
+ data: part.source.data
6238
+ }
6239
+ };
6240
+ }
6241
+ throw new Error(`Unsupported content type: ${part.type}`);
6242
+ });
5615
6243
  }
5616
6244
  buildGenerationConfig(options) {
5617
6245
  const config = {};
@@ -5632,9 +6260,9 @@ var init_gemini = __esm({
5632
6260
  async *wrapStream(iterable) {
5633
6261
  const stream2 = iterable;
5634
6262
  for await (const chunk of stream2) {
5635
- const text = this.extractText(chunk);
5636
- if (text) {
5637
- yield { text, rawEvent: chunk };
6263
+ const text3 = this.extractText(chunk);
6264
+ if (text3) {
6265
+ yield { text: text3, rawEvent: chunk };
5638
6266
  }
5639
6267
  const finishReason = this.extractFinishReason(chunk);
5640
6268
  const usage = this.extractUsage(chunk);
@@ -5695,7 +6323,7 @@ var init_gemini = __esm({
5695
6323
  try {
5696
6324
  const response = await client.models.countTokens({
5697
6325
  model: descriptor.name,
5698
- contents: this.convertContentsForNewSDK(contents)
6326
+ contents
5699
6327
  // Note: systemInstruction not used - it's not supported by countTokens()
5700
6328
  // and would cause a 2100% token counting error
5701
6329
  });
@@ -5705,8 +6333,19 @@ var init_gemini = __esm({
5705
6333
  `Token counting failed for ${descriptor.name}, using fallback estimation:`,
5706
6334
  error
5707
6335
  );
5708
- const totalChars = messages.reduce((sum, msg) => sum + (msg.content?.length ?? 0), 0);
5709
- return Math.ceil(totalChars / FALLBACK_CHARS_PER_TOKEN);
6336
+ let totalChars = 0;
6337
+ let mediaCount = 0;
6338
+ for (const msg of messages) {
6339
+ const parts = normalizeContent(msg.content);
6340
+ for (const part of parts) {
6341
+ if (part.type === "text") {
6342
+ totalChars += part.text.length;
6343
+ } else if (part.type === "image" || part.type === "audio") {
6344
+ mediaCount++;
6345
+ }
6346
+ }
6347
+ }
6348
+ return Math.ceil(totalChars / FALLBACK_CHARS_PER_TOKEN) + mediaCount * 258;
5710
6349
  }
5711
6350
  }
5712
6351
  };
@@ -6349,6 +6988,7 @@ var init_openai = __esm({
6349
6988
  "use strict";
6350
6989
  import_openai = __toESM(require("openai"), 1);
6351
6990
  import_tiktoken = require("tiktoken");
6991
+ init_messages();
6352
6992
  init_base_provider();
6353
6993
  init_constants2();
6354
6994
  init_openai_image_models();
@@ -6456,11 +7096,7 @@ var init_openai = __esm({
6456
7096
  const sanitizedExtra = sanitizeExtra(extra, shouldIncludeTemperature);
6457
7097
  return {
6458
7098
  model: descriptor.name,
6459
- messages: messages.map((message) => ({
6460
- role: ROLE_MAP[message.role],
6461
- content: message.content,
6462
- name: message.name
6463
- })),
7099
+ messages: messages.map((message) => this.convertToOpenAIMessage(message)),
6464
7100
  // Only set max_completion_tokens if explicitly provided
6465
7101
  // Otherwise let the API use "as much as fits" in the context window
6466
7102
  ...maxTokens !== void 0 ? { max_completion_tokens: maxTokens } : {},
@@ -6472,6 +7108,77 @@ var init_openai = __esm({
6472
7108
  ...shouldIncludeTemperature ? { temperature } : {}
6473
7109
  };
6474
7110
  }
7111
+ /**
7112
+ * Convert an LLMMessage to OpenAI's ChatCompletionMessageParam.
7113
+ * Handles role-specific content type requirements:
7114
+ * - system/assistant: string content only
7115
+ * - user: string or multimodal array content
7116
+ */
7117
+ convertToOpenAIMessage(message) {
7118
+ const role = ROLE_MAP[message.role];
7119
+ if (role === "user") {
7120
+ const content = this.convertToOpenAIContent(message.content);
7121
+ return {
7122
+ role: "user",
7123
+ content,
7124
+ ...message.name ? { name: message.name } : {}
7125
+ };
7126
+ }
7127
+ const textContent = typeof message.content === "string" ? message.content : extractText(message.content);
7128
+ if (role === "system") {
7129
+ return {
7130
+ role: "system",
7131
+ content: textContent,
7132
+ ...message.name ? { name: message.name } : {}
7133
+ };
7134
+ }
7135
+ return {
7136
+ role: "assistant",
7137
+ content: textContent,
7138
+ ...message.name ? { name: message.name } : {}
7139
+ };
7140
+ }
7141
+ /**
7142
+ * Convert llmist content to OpenAI's content format.
7143
+ * Optimizes by returning string for text-only content, array for multimodal.
7144
+ */
7145
+ convertToOpenAIContent(content) {
7146
+ if (typeof content === "string") {
7147
+ return content;
7148
+ }
7149
+ return content.map((part) => {
7150
+ if (part.type === "text") {
7151
+ return { type: "text", text: part.text };
7152
+ }
7153
+ if (part.type === "image") {
7154
+ return this.convertImagePart(part);
7155
+ }
7156
+ if (part.type === "audio") {
7157
+ throw new Error(
7158
+ "OpenAI chat completions do not support audio input. Use Whisper for transcription or Gemini for audio understanding."
7159
+ );
7160
+ }
7161
+ throw new Error(`Unsupported content type: ${part.type}`);
7162
+ });
7163
+ }
7164
+ /**
7165
+ * Convert an image content part to OpenAI's image_url format.
7166
+ * Supports both URLs and base64 data URLs.
7167
+ */
7168
+ convertImagePart(part) {
7169
+ if (part.source.type === "url") {
7170
+ return {
7171
+ type: "image_url",
7172
+ image_url: { url: part.source.url }
7173
+ };
7174
+ }
7175
+ return {
7176
+ type: "image_url",
7177
+ image_url: {
7178
+ url: `data:${part.source.mediaType};base64,${part.source.data}`
7179
+ }
7180
+ };
7181
+ }
6475
7182
  async executeStreamRequest(payload, signal) {
6476
7183
  const client = this.client;
6477
7184
  const stream2 = await client.chat.completions.create(payload, signal ? { signal } : void 0);
@@ -6480,9 +7187,9 @@ var init_openai = __esm({
6480
7187
  async *wrapStream(iterable) {
6481
7188
  const stream2 = iterable;
6482
7189
  for await (const chunk of stream2) {
6483
- const text = chunk.choices.map((choice) => choice.delta?.content ?? "").join("");
6484
- if (text) {
6485
- yield { text, rawEvent: chunk };
7190
+ const text3 = chunk.choices.map((choice) => choice.delta?.content ?? "").join("");
7191
+ if (text3) {
7192
+ yield { text: text3, rawEvent: chunk };
6486
7193
  }
6487
7194
  const finishReason = chunk.choices.find((choice) => choice.finish_reason)?.finish_reason;
6488
7195
  const usage = chunk.usage ? {
@@ -6530,17 +7237,26 @@ var init_openai = __esm({
6530
7237
  }
6531
7238
  try {
6532
7239
  let tokenCount = 0;
7240
+ let imageCount = 0;
6533
7241
  for (const message of messages) {
6534
7242
  tokenCount += OPENAI_MESSAGE_OVERHEAD_TOKENS;
6535
7243
  const roleText = ROLE_MAP[message.role];
6536
7244
  tokenCount += encoding.encode(roleText).length;
6537
- tokenCount += encoding.encode(message.content ?? "").length;
7245
+ const textContent = extractText(message.content);
7246
+ tokenCount += encoding.encode(textContent).length;
7247
+ const parts = normalizeContent(message.content);
7248
+ for (const part of parts) {
7249
+ if (part.type === "image") {
7250
+ imageCount++;
7251
+ }
7252
+ }
6538
7253
  if (message.name) {
6539
7254
  tokenCount += encoding.encode(message.name).length;
6540
7255
  tokenCount += OPENAI_NAME_FIELD_OVERHEAD_TOKENS;
6541
7256
  }
6542
7257
  }
6543
7258
  tokenCount += OPENAI_REPLY_PRIMING_TOKENS;
7259
+ tokenCount += imageCount * 765;
6544
7260
  return tokenCount;
6545
7261
  } finally {
6546
7262
  encoding.free();
@@ -6550,8 +7266,19 @@ var init_openai = __esm({
6550
7266
  `Token counting failed for ${descriptor.name}, using fallback estimation:`,
6551
7267
  error
6552
7268
  );
6553
- const totalChars = messages.reduce((sum, msg) => sum + (msg.content?.length ?? 0), 0);
6554
- return Math.ceil(totalChars / FALLBACK_CHARS_PER_TOKEN);
7269
+ let totalChars = 0;
7270
+ let imageCount = 0;
7271
+ for (const msg of messages) {
7272
+ const parts = normalizeContent(msg.content);
7273
+ for (const part of parts) {
7274
+ if (part.type === "text") {
7275
+ totalChars += part.text.length;
7276
+ } else if (part.type === "image") {
7277
+ imageCount++;
7278
+ }
7279
+ }
7280
+ }
7281
+ return Math.ceil(totalChars / FALLBACK_CHARS_PER_TOKEN) + imageCount * 765;
6555
7282
  }
6556
7283
  }
6557
7284
  };
@@ -6974,6 +7701,138 @@ var init_text = __esm({
6974
7701
  }
6975
7702
  });
6976
7703
 
7704
+ // src/core/namespaces/vision.ts
7705
+ var VisionNamespace;
7706
+ var init_vision = __esm({
7707
+ "src/core/namespaces/vision.ts"() {
7708
+ "use strict";
7709
+ init_input_content();
7710
+ init_messages();
7711
+ VisionNamespace = class {
7712
+ constructor(client) {
7713
+ this.client = client;
7714
+ }
7715
+ /**
7716
+ * Build a message builder with the image content attached.
7717
+ * Handles URLs, data URLs, base64 strings, and binary buffers.
7718
+ */
7719
+ buildImageMessage(options) {
7720
+ const builder = new LLMMessageBuilder();
7721
+ if (options.systemPrompt) {
7722
+ builder.addSystem(options.systemPrompt);
7723
+ }
7724
+ if (typeof options.image === "string") {
7725
+ if (options.image.startsWith("http://") || options.image.startsWith("https://")) {
7726
+ builder.addUserWithImageUrl(options.prompt, options.image);
7727
+ } else if (isDataUrl(options.image)) {
7728
+ const parsed = parseDataUrl(options.image);
7729
+ if (!parsed) {
7730
+ throw new Error("Invalid data URL format");
7731
+ }
7732
+ builder.addUserWithImage(
7733
+ options.prompt,
7734
+ parsed.data,
7735
+ parsed.mimeType
7736
+ );
7737
+ } else {
7738
+ const buffer = Buffer.from(options.image, "base64");
7739
+ builder.addUserWithImage(options.prompt, buffer, options.mimeType);
7740
+ }
7741
+ } else {
7742
+ builder.addUserWithImage(options.prompt, options.image, options.mimeType);
7743
+ }
7744
+ return builder;
7745
+ }
7746
+ /**
7747
+ * Stream the response and collect text and usage information.
7748
+ */
7749
+ async streamAndCollect(options, builder) {
7750
+ let response = "";
7751
+ let finalUsage;
7752
+ for await (const chunk of this.client.stream({
7753
+ model: options.model,
7754
+ messages: builder.build(),
7755
+ maxTokens: options.maxTokens,
7756
+ temperature: options.temperature
7757
+ })) {
7758
+ response += chunk.text;
7759
+ if (chunk.usage) {
7760
+ finalUsage = {
7761
+ inputTokens: chunk.usage.inputTokens,
7762
+ outputTokens: chunk.usage.outputTokens,
7763
+ totalTokens: chunk.usage.totalTokens
7764
+ };
7765
+ }
7766
+ }
7767
+ return { text: response.trim(), usage: finalUsage };
7768
+ }
7769
+ /**
7770
+ * Analyze an image with a vision-capable model.
7771
+ * Returns the analysis as a string.
7772
+ *
7773
+ * @param options - Vision analysis options
7774
+ * @returns Promise resolving to the analysis text
7775
+ * @throws Error if the image format is unsupported or model doesn't support vision
7776
+ *
7777
+ * @example
7778
+ * ```typescript
7779
+ * // From file
7780
+ * const result = await llmist.vision.analyze({
7781
+ * model: "gpt-4o",
7782
+ * image: await fs.readFile("photo.jpg"),
7783
+ * prompt: "What's in this image?",
7784
+ * });
7785
+ *
7786
+ * // From URL (OpenAI only)
7787
+ * const result = await llmist.vision.analyze({
7788
+ * model: "gpt-4o",
7789
+ * image: "https://example.com/image.jpg",
7790
+ * prompt: "Describe this image",
7791
+ * });
7792
+ * ```
7793
+ */
7794
+ async analyze(options) {
7795
+ const builder = this.buildImageMessage(options);
7796
+ const { text: text3 } = await this.streamAndCollect(options, builder);
7797
+ return text3;
7798
+ }
7799
+ /**
7800
+ * Analyze an image and return detailed result with usage info.
7801
+ *
7802
+ * @param options - Vision analysis options
7803
+ * @returns Promise resolving to the analysis result with usage info
7804
+ */
7805
+ async analyzeWithUsage(options) {
7806
+ const builder = this.buildImageMessage(options);
7807
+ const { text: text3, usage } = await this.streamAndCollect(options, builder);
7808
+ return {
7809
+ text: text3,
7810
+ model: options.model,
7811
+ usage
7812
+ };
7813
+ }
7814
+ /**
7815
+ * Check if a model supports vision/image input.
7816
+ *
7817
+ * @param modelId - Model ID to check
7818
+ * @returns True if the model supports vision
7819
+ */
7820
+ supportsModel(modelId) {
7821
+ const spec = this.client.modelRegistry.getModelSpec(modelId);
7822
+ return spec?.features?.vision === true;
7823
+ }
7824
+ /**
7825
+ * List all models that support vision.
7826
+ *
7827
+ * @returns Array of model IDs that support vision
7828
+ */
7829
+ listModels() {
7830
+ return this.client.modelRegistry.listModels().filter((spec) => spec.features?.vision === true).map((spec) => spec.modelId);
7831
+ }
7832
+ };
7833
+ }
7834
+ });
7835
+
6977
7836
  // src/core/options.ts
6978
7837
  var ModelIdentifierParser;
6979
7838
  var init_options = __esm({
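A hedged sketch of the new vision namespace in use (client construction, the model ID, and the file path are assumptions for illustration):

```typescript
import { LLMist } from "llmist";
import { readFile } from "node:fs/promises";

const client = new LLMist();                      // assumes provider API keys in the environment
console.log(client.vision.listModels());          // model IDs whose spec sets features.vision

const result = await client.vision.analyzeWithUsage({
  model: "gpt-4o",                                 // any vision-capable model
  image: await readFile("photo.jpg"),              // Buffer, base64 string, data URL, or http(s) URL
  prompt: "What's in this image?",
});
console.log(result.text, result.usage?.totalTokens);
```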
@@ -7018,6 +7877,7 @@ var init_client = __esm({
7018
7877
  init_image();
7019
7878
  init_speech();
7020
7879
  init_text();
7880
+ init_vision();
7021
7881
  init_options();
7022
7882
  init_quick_methods();
7023
7883
  LLMist = class _LLMist {
@@ -7029,6 +7889,7 @@ var init_client = __esm({
7029
7889
  text;
7030
7890
  image;
7031
7891
  speech;
7892
+ vision;
7032
7893
  constructor(...args) {
7033
7894
  let adapters = [];
7034
7895
  let defaultProvider;
@@ -7079,6 +7940,7 @@ var init_client = __esm({
7079
7940
  this.text = new TextNamespace(this);
7080
7941
  this.image = new ImageNamespace(this.adapters, this.defaultProvider);
7081
7942
  this.speech = new SpeechNamespace(this.adapters, this.defaultProvider);
7943
+ this.vision = new VisionNamespace(this);
7082
7944
  }
7083
7945
  stream(options) {
7084
7946
  const descriptor = this.parser.parse(options.model);
@@ -7263,6 +8125,7 @@ var init_builder = __esm({
7263
8125
  "src/agent/builder.ts"() {
7264
8126
  "use strict";
7265
8127
  init_constants();
8128
+ init_input_content();
7266
8129
  init_model_shortcuts();
7267
8130
  init_registry();
7268
8131
  init_agent();
@@ -7910,13 +8773,17 @@ ${endPrefix}`
7910
8773
  * }
7911
8774
  * ```
7912
8775
  */
7913
- ask(userPrompt) {
8776
+ /**
8777
+ * Build AgentOptions with the given user prompt.
8778
+ * Centralizes options construction for ask(), askWithImage(), and askWithContent().
8779
+ */
8780
+ buildAgentOptions(userPrompt) {
7914
8781
  if (!this.client) {
7915
8782
  const { LLMist: LLMistClass } = (init_client(), __toCommonJS(client_exports));
7916
8783
  this.client = new LLMistClass();
7917
8784
  }
7918
8785
  const registry = GadgetRegistry.from(this.gadgets);
7919
- const options = {
8786
+ return {
7920
8787
  client: this.client,
7921
8788
  model: this.model ?? "openai:gpt-5-nano",
7922
8789
  systemPrompt: this.systemPrompt,
@@ -7942,6 +8809,83 @@ ${endPrefix}`
7942
8809
  compactionConfig: this.compactionConfig,
7943
8810
  signal: this.signal
7944
8811
  };
8812
+ }
8813
+ ask(userPrompt) {
8814
+ const options = this.buildAgentOptions(userPrompt);
8815
+ return new Agent(AGENT_INTERNAL_KEY, options);
8816
+ }
8817
+ /**
8818
+ * Build and create the agent with a multimodal user prompt (text + image).
8819
+ * Returns the Agent instance ready to run.
8820
+ *
8821
+ * @param textPrompt - Text prompt describing what to do with the image
8822
+ * @param imageData - Image data (Buffer, Uint8Array, or base64 string)
8823
+ * @param mimeType - Optional MIME type (auto-detected if not provided)
8824
+ * @returns Configured Agent instance
8825
+ *
8826
+ * @example
8827
+ * ```typescript
8828
+ * const agent = LLMist.createAgent()
8829
+ * .withModel("gpt-4o")
8830
+ * .withSystem("You analyze images")
8831
+ * .askWithImage(
8832
+ * "What's in this image?",
8833
+ * await fs.readFile("photo.jpg")
8834
+ * );
8835
+ *
8836
+ * for await (const event of agent.run()) {
8837
+ * // handle events
8838
+ * }
8839
+ * ```
8840
+ */
8841
+ askWithImage(textPrompt, imageData, mimeType) {
8842
+ const imageBuffer = typeof imageData === "string" ? Buffer.from(imageData, "base64") : imageData;
8843
+ const detectedMime = mimeType ?? detectImageMimeType(imageBuffer);
8844
+ if (!detectedMime) {
8845
+ throw new Error(
8846
+ "Could not detect image MIME type. Please provide the mimeType parameter explicitly."
8847
+ );
8848
+ }
8849
+ const userContent = [
8850
+ text(textPrompt),
8851
+ {
8852
+ type: "image",
8853
+ source: {
8854
+ type: "base64",
8855
+ mediaType: detectedMime,
8856
+ data: toBase64(imageBuffer)
8857
+ }
8858
+ }
8859
+ ];
8860
+ const options = this.buildAgentOptions(userContent);
8861
+ return new Agent(AGENT_INTERNAL_KEY, options);
8862
+ }
8863
+ /**
8864
+ * Build and return an Agent configured with multimodal content.
8865
+ * More flexible than askWithImage - accepts any combination of content parts.
8866
+ *
8867
+ * @param content - Array of content parts (text, images, audio)
8868
+ * @returns A configured Agent ready for execution
8869
+ *
8870
+ * @example
8871
+ * ```typescript
8872
+ * import { text, imageFromBuffer, audioFromBuffer } from "llmist";
8873
+ *
8874
+ * const agent = LLMist.createAgent()
8875
+ * .withModel("gemini:gemini-2.5-flash")
8876
+ * .askWithContent([
8877
+ * text("Describe this image and transcribe the audio:"),
8878
+ * imageFromBuffer(imageData),
8879
+ * audioFromBuffer(audioData),
8880
+ * ]);
8881
+ *
8882
+ * for await (const event of agent.run()) {
8883
+ * // handle events
8884
+ * }
8885
+ * ```
8886
+ */
8887
+ askWithContent(content) {
8888
+ const options = this.buildAgentOptions(content);
7945
8889
  return new Agent(AGENT_INTERNAL_KEY, options);
7946
8890
  }
7947
8891
  /**
@@ -8087,6 +9031,8 @@ __export(index_exports, {
8087
9031
  StreamParser: () => StreamParser,
8088
9032
  StreamProcessor: () => StreamProcessor,
8089
9033
  SummarizationStrategy: () => SummarizationStrategy,
9034
+ audioFromBase64: () => audioFromBase64,
9035
+ audioFromBuffer: () => audioFromBuffer,
8090
9036
  collectEvents: () => collectEvents,
8091
9037
  collectText: () => collectText,
8092
9038
  complete: () => complete,
@@ -8102,20 +9048,34 @@ __export(index_exports, {
8102
9048
  createOpenAIProviderFromEnv: () => createOpenAIProviderFromEnv,
8103
9049
  createTextMockStream: () => createTextMockStream,
8104
9050
  defaultLogger: () => defaultLogger,
9051
+ detectAudioMimeType: () => detectAudioMimeType,
9052
+ detectImageMimeType: () => detectImageMimeType,
8105
9053
  discoverProviderAdapters: () => discoverProviderAdapters,
9054
+ extractText: () => extractText,
8106
9055
  getMockManager: () => getMockManager,
8107
9056
  getModelId: () => getModelId,
8108
9057
  getProvider: () => getProvider,
8109
9058
  hasProviderPrefix: () => hasProviderPrefix,
9059
+ imageFromBase64: () => imageFromBase64,
9060
+ imageFromBuffer: () => imageFromBuffer,
9061
+ imageFromUrl: () => imageFromUrl,
9062
+ isAudioPart: () => isAudioPart,
9063
+ isDataUrl: () => isDataUrl,
9064
+ isImagePart: () => isImagePart,
9065
+ isTextPart: () => isTextPart,
8110
9066
  iterationProgressHint: () => iterationProgressHint,
8111
9067
  mockLLM: () => mockLLM,
9068
+ normalizeContent: () => normalizeContent,
8112
9069
  parallelGadgetHint: () => parallelGadgetHint,
9070
+ parseDataUrl: () => parseDataUrl,
8113
9071
  resolveHintTemplate: () => resolveHintTemplate,
8114
9072
  resolveModel: () => resolveModel,
8115
9073
  resolvePromptTemplate: () => resolvePromptTemplate,
8116
9074
  resolveRulesTemplate: () => resolveRulesTemplate,
8117
9075
  runWithHandlers: () => runWithHandlers,
8118
9076
  stream: () => stream,
9077
+ text: () => text,
9078
+ toBase64: () => toBase64,
8119
9079
  validateAndApplyDefaults: () => validateAndApplyDefaults,
8120
9080
  validateGadgetParams: () => validateGadgetParams,
8121
9081
  z: () => import_zod2.z
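The export hunk above makes the new content helpers part of the public API. A small sketch exercising a few of them; the file name is illustrative and the file is assumed to be a real PNG:

```typescript
// Sketch: the helpers exported above, used standalone.
import { readFile } from "node:fs/promises";
import { text, imageFromBuffer, detectImageMimeType, isImagePart } from "llmist";

const photo = await readFile("photo.png");      // assumed to contain PNG bytes
console.log(detectImageMimeType(photo));        // "image/png" via magic-byte sniffing

const parts = [text("Caption this photo:"), imageFromBuffer(photo)];
console.log(parts.filter(isImagePart).length);  // 1
```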
@@ -9016,6 +9976,7 @@ function createHints(config) {
9016
9976
 
9017
9977
  // src/index.ts
9018
9978
  init_client();
9979
+ init_input_content();
9019
9980
  init_messages();
9020
9981
  init_model_registry();
9021
9982
  init_model_shortcuts();
@@ -9263,9 +10224,9 @@ function sleep(ms) {
9263
10224
  function generateInvocationId() {
9264
10225
  return `inv-${Date.now()}-${Math.random().toString(36).substring(2, 9)}`;
9265
10226
  }
9266
- function splitIntoChunks(text, minChunkSize = 5, maxChunkSize = 30) {
10227
+ function splitIntoChunks(text3, minChunkSize = 5, maxChunkSize = 30) {
9267
10228
  const chunks = [];
9268
- let remaining = text;
10229
+ let remaining = text3;
9269
10230
  while (remaining.length > 0) {
9270
10231
  const chunkSize = Math.min(
9271
10232
  Math.floor(Math.random() * (maxChunkSize - minChunkSize + 1)) + minChunkSize,
@@ -9324,17 +10285,17 @@ ${String(value)}
9324
10285
  return result;
9325
10286
  }
9326
10287
  function formatGadgetCalls(gadgetCalls) {
9327
- let text = "";
10288
+ let text3 = "";
9328
10289
  const calls = [];
9329
10290
  for (const call of gadgetCalls) {
9330
10291
  const invocationId = call.invocationId ?? generateInvocationId();
9331
10292
  calls.push({ name: call.gadgetName, invocationId });
9332
10293
  const blockParams = serializeToBlockFormat(call.parameters);
9333
- text += `
10294
+ text3 += `
9334
10295
  ${GADGET_START_PREFIX}${call.gadgetName}
9335
10296
  ${blockParams}${GADGET_END_PREFIX}`;
9336
10297
  }
9337
- return { text, calls };
10298
+ return { text: text3, calls };
9338
10299
  }
9339
10300
  async function* createMockStream(response) {
9340
10301
  if (response.delayMs) {
@@ -9374,9 +10335,9 @@ async function* createMockStream(response) {
9374
10335
  };
9375
10336
  }
9376
10337
  }
9377
- function createTextMockStream(text, options) {
10338
+ function createTextMockStream(text3, options) {
9378
10339
  return createMockStream({
9379
- text,
10340
+ text: text3,
9380
10341
  delayMs: options?.delayMs,
9381
10342
  streamDelayMs: options?.streamDelayMs,
9382
10343
  usage: options?.usage,
@@ -9393,10 +10354,10 @@ var MockProviderAdapter = class {
9393
10354
  constructor(options) {
9394
10355
  this.mockManager = getMockManager(options);
9395
10356
  }
9396
- supports(descriptor) {
10357
+ supports(_descriptor) {
9397
10358
  return true;
9398
10359
  }
9399
- stream(options, descriptor, spec) {
10360
+ stream(options, descriptor, _spec) {
9400
10361
  const context = {
9401
10362
  model: options.model,
9402
10363
  provider: descriptor.provider,
@@ -9407,20 +10368,154 @@ var MockProviderAdapter = class {
9407
10368
  return this.createMockStreamFromContext(context);
9408
10369
  }
9409
10370
  async *createMockStreamFromContext(context) {
9410
- try {
9411
- const mockResponse = await this.mockManager.findMatch(context);
9412
- if (!mockResponse) {
9413
- yield {
9414
- text: "",
9415
- finishReason: "stop",
9416
- usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
9417
- };
9418
- return;
9419
- }
9420
- yield* createMockStream(mockResponse);
9421
- } catch (error) {
9422
- throw error;
10371
+ const mockResponse = await this.mockManager.findMatch(context);
10372
+ if (!mockResponse) {
10373
+ yield {
10374
+ text: "",
10375
+ finishReason: "stop",
10376
+ usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
10377
+ };
10378
+ return;
10379
+ }
10380
+ yield* createMockStream(mockResponse);
10381
+ }
10382
+ // ==========================================================================
10383
+ // Image Generation Support
10384
+ // ==========================================================================
10385
+ /**
10386
+ * Check if this adapter supports image generation for a given model.
10387
+ * Always returns true; the actual mock lookup (and any missing-mock error) happens in generateImage.
10388
+ */
10389
+ supportsImageGeneration(_modelId) {
10390
+ return true;
10391
+ }
10392
+ /**
10393
+ * Generate mock images based on registered mocks.
10394
+ *
10395
+ * @param options - Image generation options
10396
+ * @returns Mock image generation result
10397
+ */
10398
+ async generateImage(options) {
10399
+ const context = {
10400
+ model: options.model,
10401
+ provider: "mock",
10402
+ modelName: options.model,
10403
+ options: {
10404
+ model: options.model,
10405
+ messages: [{ role: "user", content: options.prompt }]
10406
+ },
10407
+ messages: [{ role: "user", content: options.prompt }]
10408
+ };
10409
+ const mockResponse = await this.mockManager.findMatch(context);
10410
+ if (!mockResponse?.images || mockResponse.images.length === 0) {
10411
+ throw new Error(
10412
+ `No mock registered for image generation with model "${options.model}". Use mockLLM().forModel("${options.model}").returnsImage(...).register() to add one.`
10413
+ );
10414
+ }
10415
+ return this.createImageResult(options, mockResponse);
10416
+ }
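The error message above documents the intended workflow: register an image mock first, then generateImage resolves it with zero cost. A minimal sketch with placeholder base64 data; it assumes createMockAdapter (defined later in this bundle) is reachable from the package entry point, which is not visible in this hunk:

```typescript
// Sketch only: the model name and base64 payload are placeholders, and the
// createMockAdapter export is an assumption.
import { mockLLM, createMockAdapter } from "llmist";

mockLLM()
  .forModel("dall-e-3")
  .returnsImage("iVBORw0KGgo=", "image/png") // base64 strings need an explicit MIME type
  .register();

const adapter = createMockAdapter();
const result = await adapter.generateImage({ model: "dall-e-3", prompt: "a sunset over water" });
console.log(result.usage.imagesGenerated, result.cost); // 1 0
```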
10417
+ /**
10418
+ * Transform mock response into ImageGenerationResult format.
10419
+ *
10420
+ * @param options - Original image generation options
10421
+ * @param mockResponse - Mock response containing image data
10422
+ * @returns ImageGenerationResult with mock data and zero cost
10423
+ */
10424
+ createImageResult(options, mockResponse) {
10425
+ const images = mockResponse.images ?? [];
10426
+ return {
10427
+ images: images.map((img) => ({
10428
+ b64Json: img.data,
10429
+ revisedPrompt: img.revisedPrompt
10430
+ })),
10431
+ model: options.model,
10432
+ usage: {
10433
+ imagesGenerated: images.length,
10434
+ size: options.size ?? "1024x1024",
10435
+ quality: options.quality ?? "standard"
10436
+ },
10437
+ cost: 0
10438
+ // Mock cost is always 0
10439
+ };
10440
+ }
10441
+ // ==========================================================================
10442
+ // Speech Generation Support
10443
+ // ==========================================================================
10444
+ /**
10445
+ * Check if this adapter supports speech generation for a given model.
10446
+ * Always returns true; the actual mock lookup (and any missing-mock error) happens in generateSpeech.
10447
+ */
10448
+ supportsSpeechGeneration(_modelId) {
10449
+ return true;
10450
+ }
10451
+ /**
10452
+ * Generate mock speech based on registered mocks.
10453
+ *
10454
+ * @param options - Speech generation options
10455
+ * @returns Mock speech generation result
10456
+ */
10457
+ async generateSpeech(options) {
10458
+ const context = {
10459
+ model: options.model,
10460
+ provider: "mock",
10461
+ modelName: options.model,
10462
+ options: {
10463
+ model: options.model,
10464
+ messages: [{ role: "user", content: options.input }]
10465
+ },
10466
+ messages: [{ role: "user", content: options.input }]
10467
+ };
10468
+ const mockResponse = await this.mockManager.findMatch(context);
10469
+ if (!mockResponse?.audio) {
10470
+ throw new Error(
10471
+ `No mock registered for speech generation with model "${options.model}". Use mockLLM().forModel("${options.model}").returnsAudio(...).register() to add one.`
10472
+ );
9423
10473
  }
10474
+ return this.createSpeechResult(options, mockResponse);
10475
+ }
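generateSpeech follows the same pattern, with the mock's base64 audio decoded into an ArrayBuffer by createSpeechResult below. A sketch under the same assumptions as the image example; "UklGRg==" is a four-byte placeholder:

```typescript
// Sketch only: the model name and audio bytes are placeholders; createMockAdapter
// is assumed to be exported from the package entry point.
import { mockLLM, createMockAdapter } from "llmist";

mockLLM()
  .forModel("tts-1")
  .returnsAudio("UklGRg==", "audio/wav") // explicit MIME type required for base64 strings
  .register();

const adapter = createMockAdapter();
const speech = await adapter.generateSpeech({ model: "tts-1", input: "Hello there" });
console.log(speech.format, speech.usage.characterCount, speech.cost); // "wav" 11 0
```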
10476
+ /**
10477
+ * Transform mock response into SpeechGenerationResult format.
10478
+ * Converts base64 audio data to ArrayBuffer.
10479
+ *
10480
+ * @param options - Original speech generation options
10481
+ * @param mockResponse - Mock response containing audio data
10482
+ * @returns SpeechGenerationResult with mock data and zero cost
10483
+ */
10484
+ createSpeechResult(options, mockResponse) {
10485
+ const audio = mockResponse.audio;
10486
+ const binaryString = atob(audio.data);
10487
+ const bytes = new Uint8Array(binaryString.length);
10488
+ for (let i = 0; i < binaryString.length; i++) {
10489
+ bytes[i] = binaryString.charCodeAt(i);
10490
+ }
10491
+ const format = this.mimeTypeToAudioFormat(audio.mimeType);
10492
+ return {
10493
+ audio: bytes.buffer,
10494
+ model: options.model,
10495
+ usage: {
10496
+ characterCount: options.input.length
10497
+ },
10498
+ cost: 0,
10499
+ // Mock cost is always 0
10500
+ format
10501
+ };
10502
+ }
10503
+ /**
10504
+ * Map MIME type to audio format for SpeechGenerationResult.
10505
+ * Defaults to "mp3" for unknown MIME types.
10506
+ *
10507
+ * @param mimeType - Audio MIME type string
10508
+ * @returns Audio format identifier
10509
+ */
10510
+ mimeTypeToAudioFormat(mimeType) {
10511
+ const mapping = {
10512
+ "audio/mp3": "mp3",
10513
+ "audio/mpeg": "mp3",
10514
+ "audio/wav": "wav",
10515
+ "audio/webm": "opus",
10516
+ "audio/ogg": "opus"
10517
+ };
10518
+ return mapping[mimeType] ?? "mp3";
9424
10519
  }
9425
10520
  };
9426
10521
  function createMockAdapter(options) {
@@ -9428,6 +10523,20 @@ function createMockAdapter(options) {
9428
10523
  }
9429
10524
 
9430
10525
  // src/testing/mock-builder.ts
10526
+ init_input_content();
10527
+ init_messages();
10528
+ function hasImageContent(content) {
10529
+ if (typeof content === "string") return false;
10530
+ return content.some((part) => isImagePart(part));
10531
+ }
10532
+ function hasAudioContent(content) {
10533
+ if (typeof content === "string") return false;
10534
+ return content.some((part) => isAudioPart(part));
10535
+ }
10536
+ function countImages(content) {
10537
+ if (typeof content === "string") return 0;
10538
+ return content.filter((part) => isImagePart(part)).length;
10539
+ }
9431
10540
  var MockBuilder = class {
9432
10541
  matchers = [];
9433
10542
  response = {};
@@ -9490,9 +10599,9 @@ var MockBuilder = class {
9490
10599
  * @example
9491
10600
  * mockLLM().whenMessageContains('hello')
9492
10601
  */
9493
- whenMessageContains(text) {
10602
+ whenMessageContains(text3) {
9494
10603
  this.matchers.push(
9495
- (ctx) => ctx.messages.some((msg) => msg.content?.toLowerCase().includes(text.toLowerCase()))
10604
+ (ctx) => ctx.messages.some((msg) => extractText(msg.content).toLowerCase().includes(text3.toLowerCase()))
9496
10605
  );
9497
10606
  return this;
9498
10607
  }
@@ -9502,10 +10611,11 @@ var MockBuilder = class {
9502
10611
  * @example
9503
10612
  * mockLLM().whenLastMessageContains('goodbye')
9504
10613
  */
9505
- whenLastMessageContains(text) {
10614
+ whenLastMessageContains(text3) {
9506
10615
  this.matchers.push((ctx) => {
9507
10616
  const lastMsg = ctx.messages[ctx.messages.length - 1];
9508
- return lastMsg?.content?.toLowerCase().includes(text.toLowerCase()) ?? false;
10617
+ if (!lastMsg) return false;
10618
+ return extractText(lastMsg.content).toLowerCase().includes(text3.toLowerCase());
9509
10619
  });
9510
10620
  return this;
9511
10621
  }
@@ -9516,7 +10626,7 @@ var MockBuilder = class {
9516
10626
  * mockLLM().whenMessageMatches(/calculate \d+/)
9517
10627
  */
9518
10628
  whenMessageMatches(regex) {
9519
- this.matchers.push((ctx) => ctx.messages.some((msg) => regex.test(msg.content ?? "")));
10629
+ this.matchers.push((ctx) => ctx.messages.some((msg) => regex.test(extractText(msg.content))));
9520
10630
  return this;
9521
10631
  }
9522
10632
  /**
@@ -9525,10 +10635,10 @@ var MockBuilder = class {
9525
10635
  * @example
9526
10636
  * mockLLM().whenRoleContains('system', 'You are a helpful assistant')
9527
10637
  */
9528
- whenRoleContains(role, text) {
10638
+ whenRoleContains(role, text3) {
9529
10639
  this.matchers.push(
9530
10640
  (ctx) => ctx.messages.some(
9531
- (msg) => msg.role === role && msg.content?.toLowerCase().includes(text.toLowerCase())
10641
+ (msg) => msg.role === role && extractText(msg.content).toLowerCase().includes(text3.toLowerCase())
9532
10642
  )
9533
10643
  );
9534
10644
  return this;
@@ -9556,6 +10666,43 @@ var MockBuilder = class {
9556
10666
  this.matchers.push(matcher);
9557
10667
  return this;
9558
10668
  }
10669
+ // ==========================================================================
10670
+ // Multimodal Matchers
10671
+ // ==========================================================================
10672
+ /**
10673
+ * Match when any message contains an image.
10674
+ *
10675
+ * @example
10676
+ * mockLLM().whenMessageHasImage().returns("I see an image of a sunset.")
10677
+ */
10678
+ whenMessageHasImage() {
10679
+ this.matchers.push((ctx) => ctx.messages.some((msg) => hasImageContent(msg.content)));
10680
+ return this;
10681
+ }
10682
+ /**
10683
+ * Match when any message contains audio.
10684
+ *
10685
+ * @example
10686
+ * mockLLM().whenMessageHasAudio().returns("I hear music playing.")
10687
+ */
10688
+ whenMessageHasAudio() {
10689
+ this.matchers.push((ctx) => ctx.messages.some((msg) => hasAudioContent(msg.content)));
10690
+ return this;
10691
+ }
10692
+ /**
10693
+ * Match based on the number of images in the last message.
10694
+ *
10695
+ * @example
10696
+ * mockLLM().whenImageCount((n) => n >= 2).returns("Comparing multiple images...")
10697
+ */
10698
+ whenImageCount(predicate) {
10699
+ this.matchers.push((ctx) => {
10700
+ const lastMsg = ctx.messages[ctx.messages.length - 1];
10701
+ if (!lastMsg) return false;
10702
+ return predicate(countImages(lastMsg.content));
10703
+ });
10704
+ return this;
10705
+ }
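The three matchers compose with the existing builder chain. A short sketch of the registration side only; how MockProviderAdapter is wired into a client or agent under test is not shown in this hunk:

```typescript
// Sketch: stacking multimodal matchers on mock registrations.
import { mockLLM } from "llmist";

mockLLM()
  .whenMessageHasImage()
  .whenImageCount((n) => n >= 2)
  .returns("Comparing the attached images...")
  .register();

mockLLM()
  .whenMessageHasAudio()
  .returns("Transcription: (mock)")
  .register();
```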
9559
10706
  /**
9560
10707
  * Set the text response to return.
9561
10708
  * Can be a static string or a function that returns a string dynamically.
@@ -9565,17 +10712,17 @@ var MockBuilder = class {
9565
10712
  * mockLLM().returns(() => `Response at ${Date.now()}`)
9566
10713
  * mockLLM().returns((ctx) => `You said: ${ctx.messages[0]?.content}`)
9567
10714
  */
9568
- returns(text) {
9569
- if (typeof text === "function") {
10715
+ returns(text3) {
10716
+ if (typeof text3 === "function") {
9570
10717
  this.response = async (ctx) => {
9571
- const resolvedText = await Promise.resolve().then(() => text(ctx));
10718
+ const resolvedText = await Promise.resolve().then(() => text3(ctx));
9572
10719
  return { text: resolvedText };
9573
10720
  };
9574
10721
  } else {
9575
10722
  if (typeof this.response === "function") {
9576
10723
  throw new Error("Cannot use returns() after withResponse() with a function");
9577
10724
  }
9578
- this.response.text = text;
10725
+ this.response.text = text3;
9579
10726
  }
9580
10727
  return this;
9581
10728
  }
@@ -9612,6 +10759,112 @@ var MockBuilder = class {
9612
10759
  this.response.gadgetCalls.push({ gadgetName, parameters });
9613
10760
  return this;
9614
10761
  }
10762
+ // ==========================================================================
10763
+ // Multimodal Response Helpers
10764
+ // ==========================================================================
10765
+ /**
10766
+ * Return a single image in the response.
10767
+ * Useful for mocking image generation endpoints.
10768
+ *
10769
+ * @param data - Image data (base64 string or Buffer)
10770
+ * @param mimeType - MIME type (auto-detected if Buffer provided without type)
10771
+ *
10772
+ * @example
10773
+ * mockLLM()
10774
+ * .forModel('dall-e-3')
10775
+ * .returnsImage(pngBuffer)
10776
+ * .register();
10777
+ */
10778
+ returnsImage(data, mimeType) {
10779
+ if (typeof this.response === "function") {
10780
+ throw new Error("Cannot use returnsImage() after withResponse() with a function");
10781
+ }
10782
+ let imageData;
10783
+ let imageMime;
10784
+ if (typeof data === "string") {
10785
+ imageData = data;
10786
+ if (!mimeType) {
10787
+ throw new Error("MIME type is required when providing base64 string data");
10788
+ }
10789
+ imageMime = mimeType;
10790
+ } else {
10791
+ imageData = toBase64(data);
10792
+ const detected = mimeType ?? detectImageMimeType(data);
10793
+ if (!detected) {
10794
+ throw new Error(
10795
+ "Could not detect image MIME type. Please provide the mimeType parameter explicitly."
10796
+ );
10797
+ }
10798
+ imageMime = detected;
10799
+ }
10800
+ if (!this.response.images) {
10801
+ this.response.images = [];
10802
+ }
10803
+ this.response.images.push({ data: imageData, mimeType: imageMime });
10804
+ return this;
10805
+ }
10806
+ /**
10807
+ * Return multiple images in the response.
10808
+ *
10809
+ * @example
10810
+ * mockLLM()
10811
+ * .forModel('dall-e-3')
10812
+ * .returnsImages([
10813
+ * { data: pngBuffer1 },
10814
+ * { data: pngBuffer2 },
10815
+ * ])
10816
+ * .register();
10817
+ */
10818
+ returnsImages(images) {
10819
+ for (const img of images) {
10820
+ this.returnsImage(img.data, img.mimeType);
10821
+ if (img.revisedPrompt && this.response && typeof this.response !== "function") {
10822
+ const lastImage = this.response.images?.[this.response.images.length - 1];
10823
+ if (lastImage) {
10824
+ lastImage.revisedPrompt = img.revisedPrompt;
10825
+ }
10826
+ }
10827
+ }
10828
+ return this;
10829
+ }
10830
+ /**
10831
+ * Return audio data in the response.
10832
+ * Useful for mocking speech synthesis endpoints.
10833
+ *
10834
+ * @param data - Audio data (base64 string or Buffer)
10835
+ * @param mimeType - MIME type (auto-detected if Buffer provided without type)
10836
+ *
10837
+ * @example
10838
+ * mockLLM()
10839
+ * .forModel('tts-1')
10840
+ * .returnsAudio(mp3Buffer)
10841
+ * .register();
10842
+ */
10843
+ returnsAudio(data, mimeType) {
10844
+ if (typeof this.response === "function") {
10845
+ throw new Error("Cannot use returnsAudio() after withResponse() with a function");
10846
+ }
10847
+ let audioData;
10848
+ let audioMime;
10849
+ if (typeof data === "string") {
10850
+ audioData = data;
10851
+ if (!mimeType) {
10852
+ throw new Error("MIME type is required when providing base64 string data");
10853
+ }
10854
+ audioMime = mimeType;
10855
+ } else {
10856
+ audioData = toBase64(data);
10857
+ const detected = mimeType ?? detectAudioMimeType(data);
10858
+ if (!detected) {
10859
+ throw new Error(
10860
+ "Could not detect audio MIME type. Please provide the mimeType parameter explicitly."
10861
+ );
10862
+ }
10863
+ audioMime = detected;
10864
+ }
10865
+ this.response.audio = { data: audioData, mimeType: audioMime };
10866
+ return this;
10867
+ }
9615
10868
  /**
9616
10869
  * Set the complete mock response object.
9617
10870
  * This allows full control over all response properties.
@@ -9825,6 +11078,8 @@ var import_node_stream = require("stream");
9825
11078
  StreamParser,
9826
11079
  StreamProcessor,
9827
11080
  SummarizationStrategy,
11081
+ audioFromBase64,
11082
+ audioFromBuffer,
9828
11083
  collectEvents,
9829
11084
  collectText,
9830
11085
  complete,
@@ -9840,20 +11095,34 @@ var import_node_stream = require("stream");
9840
11095
  createOpenAIProviderFromEnv,
9841
11096
  createTextMockStream,
9842
11097
  defaultLogger,
11098
+ detectAudioMimeType,
11099
+ detectImageMimeType,
9843
11100
  discoverProviderAdapters,
11101
+ extractText,
9844
11102
  getMockManager,
9845
11103
  getModelId,
9846
11104
  getProvider,
9847
11105
  hasProviderPrefix,
11106
+ imageFromBase64,
11107
+ imageFromBuffer,
11108
+ imageFromUrl,
11109
+ isAudioPart,
11110
+ isDataUrl,
11111
+ isImagePart,
11112
+ isTextPart,
9848
11113
  iterationProgressHint,
9849
11114
  mockLLM,
11115
+ normalizeContent,
9850
11116
  parallelGadgetHint,
11117
+ parseDataUrl,
9851
11118
  resolveHintTemplate,
9852
11119
  resolveModel,
9853
11120
  resolvePromptTemplate,
9854
11121
  resolveRulesTemplate,
9855
11122
  runWithHandlers,
9856
11123
  stream,
11124
+ text,
11125
+ toBase64,
9857
11126
  validateAndApplyDefaults,
9858
11127
  validateGadgetParams,
9859
11128
  z