llmist 2.3.0 → 2.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -45,6 +45,158 @@ var init_constants = __esm({
45
45
  }
46
46
  });
47
47
 
48
+ // src/core/input-content.ts
49
+ function isTextPart(part) {
50
+ return part.type === "text";
51
+ }
52
+ function isImagePart(part) {
53
+ return part.type === "image";
54
+ }
55
+ function isAudioPart(part) {
56
+ return part.type === "audio";
57
+ }
58
+ function text(content) {
59
+ return { type: "text", text: content };
60
+ }
61
+ function imageFromBase64(data, mediaType) {
62
+ return {
63
+ type: "image",
64
+ source: { type: "base64", mediaType, data }
65
+ };
66
+ }
67
+ function imageFromUrl(url) {
68
+ return {
69
+ type: "image",
70
+ source: { type: "url", url }
71
+ };
72
+ }
73
+ function detectImageMimeType(data) {
74
+ const bytes = data instanceof Buffer ? data : Buffer.from(data);
75
+ for (const { bytes: magic, mimeType } of IMAGE_MAGIC_BYTES) {
76
+ if (bytes.length >= magic.length) {
77
+ let matches = true;
78
+ for (let i = 0; i < magic.length; i++) {
79
+ if (bytes[i] !== magic[i]) {
80
+ matches = false;
81
+ break;
82
+ }
83
+ }
84
+ if (matches) {
85
+ if (mimeType === "image/webp") {
86
+ if (bytes.length >= 12) {
87
+ const webpMarker = bytes[8] === 87 && bytes[9] === 69 && bytes[10] === 66 && bytes[11] === 80;
88
+ if (!webpMarker) continue;
89
+ }
90
+ }
91
+ return mimeType;
92
+ }
93
+ }
94
+ }
95
+ return null;
96
+ }
97
+ function detectAudioMimeType(data) {
98
+ const bytes = data instanceof Buffer ? data : Buffer.from(data);
99
+ for (const { bytes: magic, mimeType } of AUDIO_MAGIC_BYTES) {
100
+ if (bytes.length >= magic.length) {
101
+ let matches = true;
102
+ for (let i = 0; i < magic.length; i++) {
103
+ if (bytes[i] !== magic[i]) {
104
+ matches = false;
105
+ break;
106
+ }
107
+ }
108
+ if (matches) {
109
+ if (mimeType === "audio/wav") {
110
+ if (bytes.length >= 12) {
111
+ const waveMarker = bytes[8] === 87 && bytes[9] === 65 && bytes[10] === 86 && bytes[11] === 69;
112
+ if (!waveMarker) continue;
113
+ }
114
+ }
115
+ return mimeType;
116
+ }
117
+ }
118
+ }
119
+ return null;
120
+ }
121
+ function toBase64(data) {
122
+ if (typeof data === "string") {
123
+ return data;
124
+ }
125
+ return Buffer.from(data).toString("base64");
126
+ }
127
+ function imageFromBuffer(buffer, mediaType) {
128
+ const detectedType = mediaType ?? detectImageMimeType(buffer);
129
+ if (!detectedType) {
130
+ throw new Error(
131
+ "Could not detect image MIME type. Please provide the mediaType parameter explicitly."
132
+ );
133
+ }
134
+ return {
135
+ type: "image",
136
+ source: {
137
+ type: "base64",
138
+ mediaType: detectedType,
139
+ data: toBase64(buffer)
140
+ }
141
+ };
142
+ }
143
+ function audioFromBase64(data, mediaType) {
144
+ return {
145
+ type: "audio",
146
+ source: { type: "base64", mediaType, data }
147
+ };
148
+ }
149
+ function audioFromBuffer(buffer, mediaType) {
150
+ const detectedType = mediaType ?? detectAudioMimeType(buffer);
151
+ if (!detectedType) {
152
+ throw new Error(
153
+ "Could not detect audio MIME type. Please provide the mediaType parameter explicitly."
154
+ );
155
+ }
156
+ return {
157
+ type: "audio",
158
+ source: {
159
+ type: "base64",
160
+ mediaType: detectedType,
161
+ data: toBase64(buffer)
162
+ }
163
+ };
164
+ }
165
+ function isDataUrl(input) {
166
+ return input.startsWith("data:");
167
+ }
168
+ function parseDataUrl(url) {
169
+ const match = url.match(/^data:([^;]+);base64,(.+)$/);
170
+ if (!match) return null;
171
+ return { mimeType: match[1], data: match[2] };
172
+ }
173
+ var IMAGE_MAGIC_BYTES, AUDIO_MAGIC_BYTES;
174
+ var init_input_content = __esm({
175
+ "src/core/input-content.ts"() {
176
+ "use strict";
177
+ IMAGE_MAGIC_BYTES = [
178
+ { bytes: [255, 216, 255], mimeType: "image/jpeg" },
179
+ { bytes: [137, 80, 78, 71], mimeType: "image/png" },
180
+ { bytes: [71, 73, 70, 56], mimeType: "image/gif" },
181
+ // WebP starts with RIFF....WEBP
182
+ { bytes: [82, 73, 70, 70], mimeType: "image/webp" }
183
+ ];
184
+ AUDIO_MAGIC_BYTES = [
185
+ // MP3 frame sync
186
+ { bytes: [255, 251], mimeType: "audio/mp3" },
187
+ { bytes: [255, 250], mimeType: "audio/mp3" },
188
+ // ID3 tag (MP3)
189
+ { bytes: [73, 68, 51], mimeType: "audio/mp3" },
190
+ // OGG
191
+ { bytes: [79, 103, 103, 83], mimeType: "audio/ogg" },
192
+ // WAV (RIFF)
193
+ { bytes: [82, 73, 70, 70], mimeType: "audio/wav" },
194
+ // WebM
195
+ { bytes: [26, 69, 223, 163], mimeType: "audio/webm" }
196
+ ];
197
+ }
198
+ });
199
+
48
200
  // src/core/model-shortcuts.ts
49
201
  function isKnownModelPattern(model) {
50
202
  const normalized = model.toLowerCase();
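The new src/core/input-content.ts module above adds content-part constructors plus magic-byte MIME sniffing for images (JPEG, PNG, GIF, WebP) and audio (MP3, OGG, WAV, WebM). A minimal usage sketch follows; it assumes these helpers are re-exported from the package root, which this diff alone does not show.

```typescript
import { readFile } from "node:fs/promises";
// Assumption: these helpers are re-exported from the "llmist" package root;
// in dist/index.cjs they live in src/core/input-content.ts.
import { text, imageFromBuffer, imageFromUrl, audioFromBuffer, detectImageMimeType } from "llmist";

const photo = await readFile("photo.png");

// Magic-byte sniffing: returns "image/png" here, or null for unknown formats.
console.log(detectImageMimeType(photo));

// imageFromBuffer/audioFromBuffer throw if no MIME type is given and none
// can be detected, so pass mediaType explicitly for exotic formats.
const parts = [
  text("Describe the image and transcribe the clip:"),
  imageFromBuffer(photo),                        // mediaType auto-detected
  imageFromUrl("https://example.com/chart.png"), // URL source (per the JSDoc, OpenAI only)
  audioFromBuffer(await readFile("clip.mp3")),   // sniffs MP3/OGG/WAV/WebM
];
```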
@@ -402,7 +554,9 @@ var init_prompt_config = __esm({
  rules: () => [
  "Output ONLY plain text with the exact markers - never use function/tool calling",
  "You can invoke multiple gadgets in a single response",
- "For dependent gadgets, invoke the first one and wait for the result"
+ "Gadgets without dependencies execute immediately (in parallel if multiple)",
+ "Use :invocation_id:dep1,dep2 syntax when a gadget needs results from prior gadgets",
+ "If any dependency fails, dependent gadgets are automatically skipped"
  ],
  customExamples: null
  };
@@ -410,11 +564,24 @@ var init_prompt_config = __esm({
410
564
  });
411
565
 
412
566
  // src/core/messages.ts
567
+ function normalizeContent(content) {
568
+ if (typeof content === "string") {
569
+ return [{ type: "text", text: content }];
570
+ }
571
+ return content;
572
+ }
573
+ function extractText(content) {
574
+ if (typeof content === "string") {
575
+ return content;
576
+ }
577
+ return content.filter((part) => part.type === "text").map((part) => part.text).join("");
578
+ }
413
579
  var LLMMessageBuilder;
414
580
  var init_messages = __esm({
415
581
  "src/core/messages.ts"() {
416
582
  "use strict";
417
583
  init_constants();
584
+ init_input_content();
418
585
  init_prompt_config();
419
586
  LLMMessageBuilder = class {
420
587
  messages = [];
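normalizeContent and extractText give downstream code one shape for both plain-string and multi-part messages. A sketch of the expected behavior (in the bundle these are module-internal helpers, so the import path is an assumption):

```typescript
// Assumption: exported from the package root; internally they belong to src/core/messages.ts.
import { normalizeContent, extractText } from "llmist";

// Strings are wrapped in a single text part; part arrays pass through unchanged.
normalizeContent("hi");
// -> [{ type: "text", text: "hi" }]

// extractText keeps only text parts and concatenates them.
extractText([
  { type: "text", text: "What is " },
  { type: "image", source: { type: "url", url: "https://example.com/a.png" } },
  { type: "text", text: "this?" },
]);
// -> "What is this?"
```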
@@ -516,6 +683,10 @@ CRITICAL: ${criticalUsage}
  parts.push(`
  1. Start marker: ${this.startPrefix}gadget_name`);
  parts.push(`
+ With ID: ${this.startPrefix}gadget_name:my_id`);
+ parts.push(`
+ With dependencies: ${this.startPrefix}gadget_name:my_id:dep1,dep2`);
+ parts.push(`
  2. ${formatDescription}`);
  parts.push(`
  3. End marker: ${this.endPrefix}`);
@@ -565,6 +736,25 @@ ${this.endPrefix}`;
565
736
  EXAMPLE (Multiple Gadgets):
566
737
 
567
738
  ${multipleExample}`);
739
+ const dependencyExample = `${this.startPrefix}fetch_data:fetch_1
740
+ ${this.argPrefix}url
741
+ https://api.example.com/users
742
+ ${this.endPrefix}
743
+ ${this.startPrefix}fetch_data:fetch_2
744
+ ${this.argPrefix}url
745
+ https://api.example.com/orders
746
+ ${this.endPrefix}
747
+ ${this.startPrefix}merge_data:merge_1:fetch_1,fetch_2
748
+ ${this.argPrefix}format
749
+ json
750
+ ${this.endPrefix}`;
751
+ parts.push(`
752
+
753
+ EXAMPLE (With Dependencies):
754
+ merge_1 waits for fetch_1 AND fetch_2 to complete.
755
+ If either fails, merge_1 is automatically skipped.
756
+
757
+ ${dependencyExample}`);
568
758
  parts.push(`
569
759
 
570
760
  BLOCK FORMAT SYNTAX:
@@ -615,6 +805,25 @@ Produces: { "items": ["first", "second"] }`);
615
805
  }
616
806
  return parts.join("");
617
807
  }
808
+ /**
809
+ * Add a user message.
810
+ * Content can be a string (text only) or an array of content parts (multimodal).
811
+ *
812
+ * @param content - Message content
813
+ * @param metadata - Optional metadata
814
+ *
815
+ * @example
816
+ * ```typescript
817
+ * // Text only
818
+ * builder.addUser("Hello!");
819
+ *
820
+ * // Multimodal
821
+ * builder.addUser([
822
+ * text("What's in this image?"),
823
+ * imageFromBuffer(imageData),
824
+ * ]);
825
+ * ```
826
+ */
618
827
  addUser(content, metadata) {
619
828
  this.messages.push({ role: "user", content, metadata });
620
829
  return this;
@@ -623,6 +832,104 @@ Produces: { "items": ["first", "second"] }`);
623
832
  this.messages.push({ role: "assistant", content, metadata });
624
833
  return this;
625
834
  }
835
+ /**
836
+ * Add a user message with an image attachment.
837
+ *
838
+ * @param textContent - Text prompt
839
+ * @param imageData - Image data (Buffer, Uint8Array, or base64 string)
840
+ * @param mimeType - Optional MIME type (auto-detected if not provided)
841
+ *
842
+ * @example
843
+ * ```typescript
844
+ * builder.addUserWithImage(
845
+ * "What's in this image?",
846
+ * await fs.readFile("photo.jpg"),
847
+ * "image/jpeg" // Optional - auto-detected
848
+ * );
849
+ * ```
850
+ */
851
+ addUserWithImage(textContent, imageData, mimeType) {
852
+ const imageBuffer = typeof imageData === "string" ? Buffer.from(imageData, "base64") : imageData;
853
+ const detectedMime = mimeType ?? detectImageMimeType(imageBuffer);
854
+ if (!detectedMime) {
855
+ throw new Error(
856
+ "Could not detect image MIME type. Please provide the mimeType parameter explicitly."
857
+ );
858
+ }
859
+ const content = [
860
+ text(textContent),
861
+ {
862
+ type: "image",
863
+ source: {
864
+ type: "base64",
865
+ mediaType: detectedMime,
866
+ data: toBase64(imageBuffer)
867
+ }
868
+ }
869
+ ];
870
+ this.messages.push({ role: "user", content });
871
+ return this;
872
+ }
873
+ /**
874
+ * Add a user message with an image URL (OpenAI only).
875
+ *
876
+ * @param textContent - Text prompt
877
+ * @param imageUrl - URL to the image
878
+ *
879
+ * @example
880
+ * ```typescript
881
+ * builder.addUserWithImageUrl(
882
+ * "What's in this image?",
883
+ * "https://example.com/image.jpg"
884
+ * );
885
+ * ```
886
+ */
887
+ addUserWithImageUrl(textContent, imageUrl) {
888
+ const content = [text(textContent), imageFromUrl(imageUrl)];
889
+ this.messages.push({ role: "user", content });
890
+ return this;
891
+ }
892
+ /**
893
+ * Add a user message with an audio attachment (Gemini only).
894
+ *
895
+ * @param textContent - Text prompt
896
+ * @param audioData - Audio data (Buffer, Uint8Array, or base64 string)
897
+ * @param mimeType - Optional MIME type (auto-detected if not provided)
898
+ *
899
+ * @example
900
+ * ```typescript
901
+ * builder.addUserWithAudio(
902
+ * "Transcribe this audio",
903
+ * await fs.readFile("recording.mp3"),
904
+ * "audio/mp3" // Optional - auto-detected
905
+ * );
906
+ * ```
907
+ */
908
+ addUserWithAudio(textContent, audioData, mimeType) {
909
+ const audioBuffer = typeof audioData === "string" ? Buffer.from(audioData, "base64") : audioData;
910
+ const content = [text(textContent), audioFromBuffer(audioBuffer, mimeType)];
911
+ this.messages.push({ role: "user", content });
912
+ return this;
913
+ }
914
+ /**
915
+ * Add a user message with multiple content parts.
916
+ * Provides full flexibility for complex multimodal messages.
917
+ *
918
+ * @param parts - Array of content parts
919
+ *
920
+ * @example
921
+ * ```typescript
922
+ * builder.addUserMultimodal([
923
+ * text("Compare these images:"),
924
+ * imageFromBuffer(image1),
925
+ * imageFromBuffer(image2),
926
+ * ]);
927
+ * ```
928
+ */
929
+ addUserMultimodal(parts) {
930
+ this.messages.push({ role: "user", content: parts });
931
+ return this;
932
+ }
626
933
  addGadgetCall(gadget, parameters, result) {
627
934
  const paramStr = this.formatBlockParameters(parameters, "");
628
935
  this.messages.push({
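Together, the new builder methods cover the common multimodal cases without hand-building part arrays. A hedged usage sketch (assuming LLMMessageBuilder and the content helpers are exported from the package root):

```typescript
import { readFile } from "node:fs/promises";
// Assumption: these names are exported from the package root.
import { LLMMessageBuilder, text, imageFromBuffer } from "llmist";

const builder = new LLMMessageBuilder();

// Image attachment; MIME type is sniffed from magic bytes when omitted.
builder.addUserWithImage("What's in this photo?", await readFile("photo.jpg"));

// Image by URL (per the JSDoc above, OpenAI only).
builder.addUserWithImageUrl("And this one?", "https://example.com/image.jpg");

// Audio attachment (per the JSDoc above, Gemini only).
builder.addUserWithAudio("Transcribe this", await readFile("recording.mp3"));

// Full control over the part list for more complex messages.
builder.addUserMultimodal([
  text("Compare these images:"),
  imageFromBuffer(await readFile("a.png")),
  imageFromBuffer(await readFile("b.png")),
]);
```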
@@ -1941,7 +2248,7 @@ var init_conversation_manager = __esm({
  if (msg.role === "user") {
  this.historyBuilder.addUser(msg.content);
  } else if (msg.role === "assistant") {
- this.historyBuilder.addAssistant(msg.content);
+ this.historyBuilder.addAssistant(extractText(msg.content));
  }
  }
  }
@@ -1962,8 +2269,10 @@ async function runWithHandlers(agentGenerator, handlers) {
  if (handlers.onGadgetCall) {
  await handlers.onGadgetCall({
  gadgetName: event.call.gadgetName,
+ invocationId: event.call.invocationId,
  parameters: event.call.parameters,
- parametersRaw: event.call.parametersRaw
+ parametersRaw: event.call.parametersRaw,
+ dependencies: event.call.dependencies
  });
  }
  break;
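The onGadgetCall handler payload now includes the invocation ID and dependency list alongside the parsed parameters. A sketch of a handler that surfaces them (only the added fields come from this diff; the generator value is assumed):

```typescript
// Assumption: runWithHandlers is exported from the package root and
// agentGenerator is an agent run obtained elsewhere.
import { runWithHandlers } from "llmist";

declare const agentGenerator: AsyncGenerator<unknown>;

await runWithHandlers(agentGenerator, {
  onGadgetCall: async ({ gadgetName, invocationId, parameters, dependencies }) => {
    // dependencies is [] for independent gadgets, otherwise the invocation
    // IDs this call waits on before it is allowed to execute.
    console.log(`${gadgetName} (${invocationId}) waits on`, dependencies);
    console.log("parameters:", parameters);
  },
});
```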
@@ -2555,7 +2864,27 @@ var init_cost_reporting_client = __esm({
2555
2864
  constructor(client, reportCost) {
2556
2865
  this.client = client;
2557
2866
  this.reportCost = reportCost;
2867
+ this.image = {
2868
+ generate: async (options) => {
2869
+ const result = await this.client.image.generate(options);
2870
+ if (result.cost !== void 0 && result.cost > 0) {
2871
+ this.reportCost(result.cost);
2872
+ }
2873
+ return result;
2874
+ }
2875
+ };
2876
+ this.speech = {
2877
+ generate: async (options) => {
2878
+ const result = await this.client.speech.generate(options);
2879
+ if (result.cost !== void 0 && result.cost > 0) {
2880
+ this.reportCost(result.cost);
2881
+ }
2882
+ return result;
2883
+ }
2884
+ };
2558
2885
  }
2886
+ image;
2887
+ speech;
2559
2888
  /**
2560
2889
  * Access to model registry for cost estimation.
2561
2890
  */
@@ -2820,15 +3149,37 @@ var init_parser = __esm({
2820
3149
  return segment.trim().length > 0 ? segment : void 0;
2821
3150
  }
2822
3151
  /**
2823
- * Parse gadget name, handling both old format (name:invocationId) and new format (just name).
2824
- * For new format, generates a unique invocation ID.
3152
+ * Parse gadget name with optional invocation ID and dependencies.
3153
+ *
3154
+ * Supported formats:
3155
+ * - `GadgetName` - Auto-generate ID, no dependencies
3156
+ * - `GadgetName:my_id` - Explicit ID, no dependencies
3157
+ * - `GadgetName:my_id:dep1,dep2` - Explicit ID with dependencies
3158
+ *
3159
+ * Dependencies must be comma-separated invocation IDs.
2825
3160
  */
2826
3161
  parseGadgetName(gadgetName) {
2827
- if (gadgetName.includes(":")) {
2828
- const parts = gadgetName.split(":");
2829
- return { actualName: parts[0], invocationId: parts[1] };
3162
+ const parts = gadgetName.split(":");
3163
+ if (parts.length === 1) {
3164
+ return {
3165
+ actualName: parts[0],
3166
+ invocationId: `gadget_${++globalInvocationCounter}`,
3167
+ dependencies: []
3168
+ };
3169
+ } else if (parts.length === 2) {
3170
+ return {
3171
+ actualName: parts[0],
3172
+ invocationId: parts[1].trim(),
3173
+ dependencies: []
3174
+ };
3175
+ } else {
3176
+ const deps = parts[2].split(",").map((d) => d.trim()).filter((d) => d.length > 0);
3177
+ return {
3178
+ actualName: parts[0],
3179
+ invocationId: parts[1].trim(),
3180
+ dependencies: deps
3181
+ };
2830
3182
  }
2831
- return { actualName: gadgetName, invocationId: `gadget_${++globalInvocationCounter}` };
2832
3183
  }
2833
3184
  /**
2834
3185
  * Extract the error message from a parse error.
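parseGadgetName is private to the streaming parser, so the snippet below only restates the header grammar it accepts as a standalone sketch; the real method also auto-generates gadget_N IDs when none is given.

```typescript
// Standalone sketch of the three supported gadget headers (not the library API).
function parseGadgetHeader(header: string) {
  const [actualName, invocationId, deps] = header.split(":");
  return {
    actualName,
    invocationId: invocationId?.trim(), // real parser falls back to `gadget_${counter}`
    dependencies: deps?.split(",").map((d) => d.trim()).filter(Boolean) ?? [],
  };
}

parseGadgetHeader("FetchData");
// { actualName: "FetchData", invocationId: undefined, dependencies: [] }
parseGadgetHeader("FetchData:fetch_1");
// { actualName: "FetchData", invocationId: "fetch_1", dependencies: [] }
parseGadgetHeader("MergeData:merge_1:fetch_1,fetch_2");
// { actualName: "MergeData", invocationId: "merge_1", dependencies: ["fetch_1", "fetch_2"] }
```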
@@ -2864,39 +3215,20 @@ var init_parser = __esm({
2864
3215
  const metadataEndIndex = this.buffer.indexOf("\n", metadataStartIndex);
2865
3216
  if (metadataEndIndex === -1) break;
2866
3217
  const gadgetName = this.buffer.substring(metadataStartIndex, metadataEndIndex).trim();
2867
- const { actualName: actualGadgetName, invocationId } = this.parseGadgetName(gadgetName);
3218
+ const { actualName: actualGadgetName, invocationId, dependencies } = this.parseGadgetName(gadgetName);
2868
3219
  const contentStartIndex = metadataEndIndex + 1;
2869
3220
  let partEndIndex;
2870
3221
  let endMarkerLength = 0;
2871
- if (gadgetName.includes(":")) {
2872
- const oldEndMarker = `${this.endPrefix + actualGadgetName}:${invocationId}`;
2873
- partEndIndex = this.buffer.indexOf(oldEndMarker, contentStartIndex);
2874
- if (partEndIndex === -1) break;
2875
- endMarkerLength = oldEndMarker.length;
3222
+ const nextStartPos = this.buffer.indexOf(this.startPrefix, contentStartIndex);
3223
+ const endPos = this.buffer.indexOf(this.endPrefix, contentStartIndex);
3224
+ if (nextStartPos !== -1 && (endPos === -1 || nextStartPos < endPos)) {
3225
+ partEndIndex = nextStartPos;
3226
+ endMarkerLength = 0;
3227
+ } else if (endPos !== -1) {
3228
+ partEndIndex = endPos;
3229
+ endMarkerLength = this.endPrefix.length;
2876
3230
  } else {
2877
- const nextStartPos = this.buffer.indexOf(this.startPrefix, contentStartIndex);
2878
- let validEndPos = -1;
2879
- let searchPos = contentStartIndex;
2880
- while (true) {
2881
- const endPos = this.buffer.indexOf(this.endPrefix, searchPos);
2882
- if (endPos === -1) break;
2883
- const afterEnd = this.buffer.substring(endPos + this.endPrefix.length);
2884
- if (afterEnd.startsWith("\n") || afterEnd.startsWith("\r") || afterEnd.startsWith(this.startPrefix) || afterEnd.length === 0) {
2885
- validEndPos = endPos;
2886
- break;
2887
- } else {
2888
- searchPos = endPos + this.endPrefix.length;
2889
- }
2890
- }
2891
- if (nextStartPos !== -1 && (validEndPos === -1 || nextStartPos < validEndPos)) {
2892
- partEndIndex = nextStartPos;
2893
- endMarkerLength = 0;
2894
- } else if (validEndPos !== -1) {
2895
- partEndIndex = validEndPos;
2896
- endMarkerLength = this.endPrefix.length;
2897
- } else {
2898
- break;
2899
- }
3231
+ break;
2900
3232
  }
2901
3233
  const parametersRaw = this.buffer.substring(contentStartIndex, partEndIndex).trim();
2902
3234
  const { parameters, parseError } = this.parseParameters(parametersRaw);
@@ -2907,7 +3239,8 @@ var init_parser = __esm({
  invocationId,
  parametersRaw,
  parameters,
- parseError
+ parseError,
+ dependencies
  }
  };
  startIndex = partEndIndex + endMarkerLength;
@@ -2930,7 +3263,7 @@ var init_parser = __esm({
  const metadataEndIndex = this.buffer.indexOf("\n", metadataStartIndex);
  if (metadataEndIndex !== -1) {
  const gadgetName = this.buffer.substring(metadataStartIndex, metadataEndIndex).trim();
- const { actualName: actualGadgetName, invocationId } = this.parseGadgetName(gadgetName);
+ const { actualName: actualGadgetName, invocationId, dependencies } = this.parseGadgetName(gadgetName);
  const contentStartIndex = metadataEndIndex + 1;
  const parametersRaw = this.buffer.substring(contentStartIndex).trim();
  const { parameters, parseError } = this.parseParameters(parametersRaw);
@@ -2941,7 +3274,8 @@ var init_parser = __esm({
  invocationId,
  parametersRaw,
  parameters,
- parseError
+ parseError,
+ dependencies
  }
  };
  return;
@@ -3311,6 +3645,13 @@ var init_stream_processor = __esm({
3311
3645
  accumulatedText = "";
3312
3646
  shouldStopExecution = false;
3313
3647
  observerFailureCount = 0;
3648
+ // Dependency tracking for gadget execution DAG
3649
+ /** Gadgets waiting for their dependencies to complete */
3650
+ pendingGadgets = /* @__PURE__ */ new Map();
3651
+ /** Completed gadget results, keyed by invocation ID */
3652
+ completedResults = /* @__PURE__ */ new Map();
3653
+ /** Invocation IDs of gadgets that have failed (error or skipped due to dependency) */
3654
+ failedInvocations = /* @__PURE__ */ new Set();
3314
3655
  constructor(options) {
3315
3656
  this.iteration = options.iteration;
3316
3657
  this.registry = options.registry;
@@ -3411,6 +3752,16 @@ var init_stream_processor = __esm({
3411
3752
  }
3412
3753
  }
3413
3754
  }
3755
+ const finalPendingEvents = await this.processPendingGadgets();
3756
+ outputs.push(...finalPendingEvents);
3757
+ if (finalPendingEvents.some((e) => e.type === "gadget_result")) {
3758
+ didExecuteGadgets = true;
3759
+ }
3760
+ for (const evt of finalPendingEvents) {
3761
+ if (evt.type === "gadget_result" && evt.result.breaksLoop) {
3762
+ shouldBreakLoop = true;
3763
+ }
3764
+ }
3414
3765
  }
3415
3766
  let finalMessage = this.accumulatedText;
3416
3767
  if (this.hooks.interceptors?.interceptAssistantMessage) {
@@ -3462,7 +3813,11 @@ var init_stream_processor = __esm({
3462
3813
  return [{ type: "text", content }];
3463
3814
  }
3464
3815
  /**
3465
- * Process a gadget call through the full lifecycle.
3816
+ * Process a gadget call through the full lifecycle, handling dependencies.
3817
+ *
3818
+ * Gadgets without dependencies (or with all dependencies satisfied) execute immediately.
3819
+ * Gadgets with unsatisfied dependencies are queued for later execution.
3820
+ * After each execution, pending gadgets are checked to see if they can now run.
3466
3821
  */
3467
3822
  async processGadgetCall(call) {
3468
3823
  if (this.shouldStopExecution) {
@@ -3473,6 +3828,53 @@ var init_stream_processor = __esm({
3473
3828
  }
3474
3829
  const events = [];
3475
3830
  events.push({ type: "gadget_call", call });
3831
+ if (call.dependencies.length > 0) {
3832
+ if (call.dependencies.includes(call.invocationId)) {
3833
+ this.logger.warn("Gadget has self-referential dependency (depends on itself)", {
3834
+ gadgetName: call.gadgetName,
3835
+ invocationId: call.invocationId
3836
+ });
3837
+ this.failedInvocations.add(call.invocationId);
3838
+ const skipEvent = {
3839
+ type: "gadget_skipped",
3840
+ gadgetName: call.gadgetName,
3841
+ invocationId: call.invocationId,
3842
+ parameters: call.parameters ?? {},
3843
+ failedDependency: call.invocationId,
3844
+ failedDependencyError: `Gadget "${call.invocationId}" cannot depend on itself (self-referential dependency)`
3845
+ };
3846
+ events.push(skipEvent);
3847
+ return events;
3848
+ }
3849
+ const failedDep = call.dependencies.find((dep) => this.failedInvocations.has(dep));
3850
+ if (failedDep) {
3851
+ const skipEvents = await this.handleFailedDependency(call, failedDep);
3852
+ events.push(...skipEvents);
3853
+ return events;
3854
+ }
3855
+ const unsatisfied = call.dependencies.filter((dep) => !this.completedResults.has(dep));
3856
+ if (unsatisfied.length > 0) {
3857
+ this.logger.debug("Queueing gadget for later - waiting on dependencies", {
3858
+ gadgetName: call.gadgetName,
3859
+ invocationId: call.invocationId,
3860
+ waitingOn: unsatisfied
3861
+ });
3862
+ this.pendingGadgets.set(call.invocationId, call);
3863
+ return events;
3864
+ }
3865
+ }
3866
+ const executeEvents = await this.executeGadgetWithHooks(call);
3867
+ events.push(...executeEvents);
3868
+ const triggeredEvents = await this.processPendingGadgets();
3869
+ events.push(...triggeredEvents);
3870
+ return events;
3871
+ }
3872
+ /**
3873
+ * Execute a gadget through the full hook lifecycle.
3874
+ * This is the core execution logic, extracted from processGadgetCall.
3875
+ */
3876
+ async executeGadgetWithHooks(call) {
3877
+ const events = [];
3476
3878
  if (call.parseError) {
3477
3879
  this.logger.warn("Gadget has parse error", {
3478
3880
  gadgetName: call.gadgetName,
@@ -3603,6 +4005,10 @@ var init_stream_processor = __esm({
3603
4005
  });
3604
4006
  }
3605
4007
  await this.runObserversInParallel(completeObservers);
4008
+ this.completedResults.set(result.invocationId, result);
4009
+ if (result.error) {
4010
+ this.failedInvocations.add(result.invocationId);
4011
+ }
3606
4012
  events.push({ type: "gadget_result", result });
3607
4013
  if (result.error) {
3608
4014
  const errorType = this.determineErrorType(call, result);
@@ -3618,6 +4024,162 @@ var init_stream_processor = __esm({
3618
4024
  }
3619
4025
  return events;
3620
4026
  }
4027
+ /**
4028
+ * Handle a gadget that cannot execute because a dependency failed.
4029
+ * Calls the onDependencySkipped controller to allow customization.
4030
+ */
4031
+ async handleFailedDependency(call, failedDep) {
4032
+ const events = [];
4033
+ const depResult = this.completedResults.get(failedDep);
4034
+ const depError = depResult?.error ?? "Dependency failed";
4035
+ let action = { action: "skip" };
4036
+ if (this.hooks.controllers?.onDependencySkipped) {
4037
+ const context = {
4038
+ iteration: this.iteration,
4039
+ gadgetName: call.gadgetName,
4040
+ invocationId: call.invocationId,
4041
+ parameters: call.parameters ?? {},
4042
+ failedDependency: failedDep,
4043
+ failedDependencyError: depError,
4044
+ logger: this.logger
4045
+ };
4046
+ action = await this.hooks.controllers.onDependencySkipped(context);
4047
+ }
4048
+ if (action.action === "skip") {
4049
+ this.failedInvocations.add(call.invocationId);
4050
+ const skipEvent = {
4051
+ type: "gadget_skipped",
4052
+ gadgetName: call.gadgetName,
4053
+ invocationId: call.invocationId,
4054
+ parameters: call.parameters ?? {},
4055
+ failedDependency: failedDep,
4056
+ failedDependencyError: depError
4057
+ };
4058
+ events.push(skipEvent);
4059
+ if (this.hooks.observers?.onGadgetSkipped) {
4060
+ const observeContext = {
4061
+ iteration: this.iteration,
4062
+ gadgetName: call.gadgetName,
4063
+ invocationId: call.invocationId,
4064
+ parameters: call.parameters ?? {},
4065
+ failedDependency: failedDep,
4066
+ failedDependencyError: depError,
4067
+ logger: this.logger
4068
+ };
4069
+ await this.safeObserve(() => this.hooks.observers.onGadgetSkipped(observeContext));
4070
+ }
4071
+ this.logger.info("Gadget skipped due to failed dependency", {
4072
+ gadgetName: call.gadgetName,
4073
+ invocationId: call.invocationId,
4074
+ failedDependency: failedDep
4075
+ });
4076
+ } else if (action.action === "execute_anyway") {
4077
+ this.logger.info("Executing gadget despite failed dependency (controller override)", {
4078
+ gadgetName: call.gadgetName,
4079
+ invocationId: call.invocationId,
4080
+ failedDependency: failedDep
4081
+ });
4082
+ const executeEvents = await this.executeGadgetWithHooks(call);
4083
+ events.push(...executeEvents);
4084
+ } else if (action.action === "use_fallback") {
4085
+ const fallbackResult = {
4086
+ gadgetName: call.gadgetName,
4087
+ invocationId: call.invocationId,
4088
+ parameters: call.parameters ?? {},
4089
+ result: action.fallbackResult,
4090
+ executionTimeMs: 0
4091
+ };
4092
+ this.completedResults.set(call.invocationId, fallbackResult);
4093
+ events.push({ type: "gadget_result", result: fallbackResult });
4094
+ this.logger.info("Using fallback result for gadget with failed dependency", {
4095
+ gadgetName: call.gadgetName,
4096
+ invocationId: call.invocationId,
4097
+ failedDependency: failedDep
4098
+ });
4099
+ }
4100
+ return events;
4101
+ }
4102
+ /**
4103
+ * Process pending gadgets whose dependencies are now satisfied.
4104
+ * Executes ready gadgets in parallel and continues until no more can be triggered.
4105
+ */
4106
+ async processPendingGadgets() {
4107
+ const events = [];
4108
+ let progress = true;
4109
+ while (progress && this.pendingGadgets.size > 0) {
4110
+ progress = false;
4111
+ const readyToExecute = [];
4112
+ const readyToSkip = [];
4113
+ for (const [invocationId, call] of this.pendingGadgets) {
4114
+ const failedDep = call.dependencies.find((dep) => this.failedInvocations.has(dep));
4115
+ if (failedDep) {
4116
+ readyToSkip.push({ call, failedDep });
4117
+ continue;
4118
+ }
4119
+ const allSatisfied = call.dependencies.every((dep) => this.completedResults.has(dep));
4120
+ if (allSatisfied) {
4121
+ readyToExecute.push(call);
4122
+ }
4123
+ }
4124
+ for (const { call, failedDep } of readyToSkip) {
4125
+ this.pendingGadgets.delete(call.invocationId);
4126
+ const skipEvents = await this.handleFailedDependency(call, failedDep);
4127
+ events.push(...skipEvents);
4128
+ progress = true;
4129
+ }
4130
+ if (readyToExecute.length > 0) {
4131
+ this.logger.debug("Executing ready gadgets in parallel", {
4132
+ count: readyToExecute.length,
4133
+ invocationIds: readyToExecute.map((c) => c.invocationId)
4134
+ });
4135
+ for (const call of readyToExecute) {
4136
+ this.pendingGadgets.delete(call.invocationId);
4137
+ }
4138
+ const executePromises = readyToExecute.map((call) => this.executeGadgetWithHooks(call));
4139
+ const results = await Promise.all(executePromises);
4140
+ for (const executeEvents of results) {
4141
+ events.push(...executeEvents);
4142
+ }
4143
+ progress = true;
4144
+ }
4145
+ }
4146
+ if (this.pendingGadgets.size > 0) {
4147
+ const pendingIds = new Set(this.pendingGadgets.keys());
4148
+ for (const [invocationId, call] of this.pendingGadgets) {
4149
+ const missingDeps = call.dependencies.filter((dep) => !this.completedResults.has(dep));
4150
+ const circularDeps = missingDeps.filter((dep) => pendingIds.has(dep));
4151
+ const trulyMissingDeps = missingDeps.filter((dep) => !pendingIds.has(dep));
4152
+ let errorMessage;
4153
+ let logLevel = "warn";
4154
+ if (circularDeps.length > 0 && trulyMissingDeps.length > 0) {
4155
+ errorMessage = `Dependencies unresolvable: circular=[${circularDeps.join(", ")}], missing=[${trulyMissingDeps.join(", ")}]`;
4156
+ logLevel = "error";
4157
+ } else if (circularDeps.length > 0) {
4158
+ errorMessage = `Circular dependency detected: "${invocationId}" depends on "${circularDeps[0]}" which also depends on "${invocationId}" (directly or indirectly)`;
4159
+ } else {
4160
+ errorMessage = `Dependency "${missingDeps[0]}" was never executed - check that the invocation ID exists and is spelled correctly`;
4161
+ }
4162
+ this.logger[logLevel]("Gadget has unresolvable dependencies", {
4163
+ gadgetName: call.gadgetName,
4164
+ invocationId,
4165
+ circularDependencies: circularDeps,
4166
+ missingDependencies: trulyMissingDeps
4167
+ });
4168
+ this.failedInvocations.add(invocationId);
4169
+ const skipEvent = {
4170
+ type: "gadget_skipped",
4171
+ gadgetName: call.gadgetName,
4172
+ invocationId,
4173
+ parameters: call.parameters ?? {},
4174
+ failedDependency: missingDeps[0],
4175
+ failedDependencyError: errorMessage
4176
+ };
4177
+ events.push(skipEvent);
4178
+ }
4179
+ this.pendingGadgets.clear();
4180
+ }
4181
+ return events;
4182
+ }
3621
4183
  /**
3622
4184
  * Safely execute an observer, catching and logging any errors.
3623
4185
  * Observers are non-critical, so errors are logged but don't crash the system.
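A failed dependency normally skips the dependent gadget, but the onDependencySkipped controller can override that with execute_anyway or a fallback result, and onGadgetSkipped observers are notified of skips. A hook sketch (the hooks registration shape is assumed; the action objects match the branches in handleFailedDependency above):

```typescript
// Assumption: hooks are passed to the agent as { controllers, observers };
// only the action shapes are taken from handleFailedDependency above.
const hooks = {
  controllers: {
    onDependencySkipped: async (ctx: {
      gadgetName: string;
      invocationId: string;
      failedDependency: string;
      failedDependencyError: string;
    }) => {
      if (ctx.gadgetName === "merge_data") {
        // Substitute a synthetic result instead of skipping.
        return {
          action: "use_fallback" as const,
          fallbackResult: `Upstream ${ctx.failedDependency} failed: ${ctx.failedDependencyError}`,
        };
      }
      // Default behavior: mark this gadget as skipped/failed too.
      // A third option is { action: "execute_anyway" } to run it regardless.
      return { action: "skip" as const };
    },
  },
  observers: {
    onGadgetSkipped: async (ctx: { gadgetName: string; failedDependency: string }) => {
      console.warn(`${ctx.gadgetName} skipped, dependency ${ctx.failedDependency} failed`);
    },
  },
};
```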
@@ -4055,9 +4617,9 @@ var init_agent = __esm({
  if (msg.role === "user") {
  this.conversation.addUserMessage(msg.content);
  } else if (msg.role === "assistant") {
- this.conversation.addAssistantMessage(msg.content);
+ this.conversation.addAssistantMessage(extractText(msg.content));
  } else if (msg.role === "system") {
- this.conversation.addUserMessage(`[System] ${msg.content}`);
+ this.conversation.addUserMessage(`[System] ${extractText(msg.content)}`);
  }
  }
  }
@@ -4636,6 +5198,7 @@ var init_anthropic = __esm({
  "src/providers/anthropic.ts"() {
  "use strict";
  import_sdk = __toESM(require("@anthropic-ai/sdk"), 1);
+ init_messages();
  init_anthropic_models();
  init_base_provider();
  init_constants2();
@@ -4648,11 +5211,33 @@ var init_anthropic = __esm({
4648
5211
  getModelSpecs() {
4649
5212
  return ANTHROPIC_MODELS;
4650
5213
  }
5214
+ // =========================================================================
5215
+ // Image Generation (Not Supported)
5216
+ // =========================================================================
5217
+ supportsImageGeneration(_modelId) {
5218
+ return false;
5219
+ }
5220
+ async generateImage() {
5221
+ throw new Error(
5222
+ "Anthropic does not support image generation. Use OpenAI (DALL-E, GPT Image) or Google Gemini (Imagen) instead."
5223
+ );
5224
+ }
5225
+ // =========================================================================
5226
+ // Speech Generation (Not Supported)
5227
+ // =========================================================================
5228
+ supportsSpeechGeneration(_modelId) {
5229
+ return false;
5230
+ }
5231
+ async generateSpeech() {
5232
+ throw new Error(
5233
+ "Anthropic does not support speech generation. Use OpenAI (TTS) or Google Gemini (TTS) instead."
5234
+ );
5235
+ }
4651
5236
  buildRequestPayload(options, descriptor, spec, messages) {
4652
5237
  const systemMessages = messages.filter((message) => message.role === "system");
4653
5238
  const system = systemMessages.length > 0 ? systemMessages.map((m, index) => ({
4654
5239
  type: "text",
4655
- text: m.content,
5240
+ text: extractText(m.content),
4656
5241
  // Add cache_control to the LAST system message block
4657
5242
  ...index === systemMessages.length - 1 ? { cache_control: { type: "ephemeral" } } : {}
4658
5243
  })) : void 0;
@@ -4665,14 +5250,10 @@ var init_anthropic = __esm({
4665
5250
  );
4666
5251
  const conversation = nonSystemMessages.map((message, index) => ({
4667
5252
  role: message.role,
4668
- content: [
4669
- {
4670
- type: "text",
4671
- text: message.content,
4672
- // Add cache_control to the LAST user message
4673
- ...message.role === "user" && index === lastUserIndex ? { cache_control: { type: "ephemeral" } } : {}
4674
- }
4675
- ]
5253
+ content: this.convertToAnthropicContent(
5254
+ message.content,
5255
+ message.role === "user" && index === lastUserIndex
5256
+ )
4676
5257
  }));
4677
5258
  const defaultMaxTokens = spec?.maxOutputTokens ?? ANTHROPIC_DEFAULT_MAX_OUTPUT_TOKENS;
4678
5259
  const payload = {
@@ -4688,16 +5269,62 @@ var init_anthropic = __esm({
4688
5269
  };
4689
5270
  return payload;
4690
5271
  }
4691
- async executeStreamRequest(payload, signal) {
4692
- const client = this.client;
4693
- const stream2 = await client.messages.create(payload, signal ? { signal } : void 0);
4694
- return stream2;
4695
- }
4696
- async *wrapStream(iterable) {
4697
- const stream2 = iterable;
4698
- let inputTokens = 0;
4699
- let cachedInputTokens = 0;
4700
- let cacheCreationInputTokens = 0;
5272
+ /**
5273
+ * Convert llmist content to Anthropic's content block format.
5274
+ * Handles text, images (base64 only), and applies cache_control.
5275
+ */
5276
+ convertToAnthropicContent(content, addCacheControl) {
5277
+ const parts = normalizeContent(content);
5278
+ return parts.map((part, index) => {
5279
+ const isLastPart = index === parts.length - 1;
5280
+ const cacheControl = addCacheControl && isLastPart ? { cache_control: { type: "ephemeral" } } : {};
5281
+ if (part.type === "text") {
5282
+ return {
5283
+ type: "text",
5284
+ text: part.text,
5285
+ ...cacheControl
5286
+ };
5287
+ }
5288
+ if (part.type === "image") {
5289
+ return this.convertImagePart(part, cacheControl);
5290
+ }
5291
+ if (part.type === "audio") {
5292
+ throw new Error(
5293
+ "Anthropic does not support audio input. Use Google Gemini for audio processing."
5294
+ );
5295
+ }
5296
+ throw new Error(`Unsupported content type: ${part.type}`);
5297
+ });
5298
+ }
5299
+ /**
5300
+ * Convert an image content part to Anthropic's image block format.
5301
+ */
5302
+ convertImagePart(part, cacheControl) {
5303
+ if (part.source.type === "url") {
5304
+ throw new Error(
5305
+ "Anthropic does not support image URLs. Please provide base64-encoded image data instead."
5306
+ );
5307
+ }
5308
+ return {
5309
+ type: "image",
5310
+ source: {
5311
+ type: "base64",
5312
+ media_type: part.source.mediaType,
5313
+ data: part.source.data
5314
+ },
5315
+ ...cacheControl
5316
+ };
5317
+ }
5318
+ async executeStreamRequest(payload, signal) {
5319
+ const client = this.client;
5320
+ const stream2 = await client.messages.create(payload, signal ? { signal } : void 0);
5321
+ return stream2;
5322
+ }
5323
+ async *wrapStream(iterable) {
5324
+ const stream2 = iterable;
5325
+ let inputTokens = 0;
5326
+ let cachedInputTokens = 0;
5327
+ let cacheCreationInputTokens = 0;
4701
5328
  for await (const event of stream2) {
4702
5329
  if (event.type === "message_start") {
4703
5330
  const usage = event.message.usage;
@@ -4770,17 +5397,12 @@ var init_anthropic = __esm({
4770
5397
  async countTokens(messages, descriptor, _spec) {
4771
5398
  const client = this.client;
4772
5399
  const systemMessages = messages.filter((message) => message.role === "system");
4773
- const system = systemMessages.length > 0 ? systemMessages.map((m) => m.content).join("\n\n") : void 0;
5400
+ const system = systemMessages.length > 0 ? systemMessages.map((m) => extractText(m.content)).join("\n\n") : void 0;
4774
5401
  const conversation = messages.filter(
4775
5402
  (message) => message.role !== "system"
4776
5403
  ).map((message) => ({
4777
5404
  role: message.role,
4778
- content: [
4779
- {
4780
- type: "text",
4781
- text: message.content
4782
- }
4783
- ]
5405
+ content: this.convertToAnthropicContent(message.content, false)
4784
5406
  }));
4785
5407
  try {
4786
5408
  const response = await client.messages.countTokens({
@@ -4794,14 +5416,201 @@ var init_anthropic = __esm({
4794
5416
  `Token counting failed for ${descriptor.name}, using fallback estimation:`,
4795
5417
  error
4796
5418
  );
4797
- const totalChars = messages.reduce((sum, msg) => sum + (msg.content?.length ?? 0), 0);
4798
- return Math.ceil(totalChars / FALLBACK_CHARS_PER_TOKEN);
5419
+ let totalChars = 0;
5420
+ let imageCount = 0;
5421
+ for (const msg of messages) {
5422
+ const parts = normalizeContent(msg.content);
5423
+ for (const part of parts) {
5424
+ if (part.type === "text") {
5425
+ totalChars += part.text.length;
5426
+ } else if (part.type === "image") {
5427
+ imageCount++;
5428
+ }
5429
+ }
5430
+ }
5431
+ return Math.ceil(totalChars / FALLBACK_CHARS_PER_TOKEN) + imageCount * 1e3;
4799
5432
  }
4800
5433
  }
4801
5434
  };
4802
5435
  }
4803
5436
  });
4804
5437
 
5438
+ // src/providers/gemini-image-models.ts
5439
+ function getGeminiImageModelSpec(modelId) {
5440
+ return geminiImageModels.find((m) => m.modelId === modelId);
5441
+ }
5442
+ function isGeminiImageModel(modelId) {
5443
+ return geminiImageModels.some((m) => m.modelId === modelId);
5444
+ }
5445
+ function calculateGeminiImageCost(modelId, size = "1:1", n = 1) {
5446
+ const spec = getGeminiImageModelSpec(modelId);
5447
+ if (!spec) return void 0;
5448
+ if (spec.pricing.perImage !== void 0) {
5449
+ return spec.pricing.perImage * n;
5450
+ }
5451
+ if (spec.pricing.bySize) {
5452
+ const sizePrice = spec.pricing.bySize[size];
5453
+ if (typeof sizePrice === "number") {
5454
+ return sizePrice * n;
5455
+ }
5456
+ }
5457
+ return void 0;
5458
+ }
5459
+ var IMAGEN4_ASPECT_RATIOS, GEMINI_IMAGE_ASPECT_RATIOS, geminiImageModels;
5460
+ var init_gemini_image_models = __esm({
5461
+ "src/providers/gemini-image-models.ts"() {
5462
+ "use strict";
5463
+ IMAGEN4_ASPECT_RATIOS = ["1:1", "3:4", "4:3", "9:16", "16:9"];
5464
+ GEMINI_IMAGE_ASPECT_RATIOS = ["1:1", "3:4", "4:3", "9:16", "16:9"];
5465
+ geminiImageModels = [
5466
+ // Imagen 4 Family (standalone image generation)
5467
+ {
5468
+ provider: "gemini",
5469
+ modelId: "imagen-4.0-fast-generate-001",
5470
+ displayName: "Imagen 4 Fast",
5471
+ pricing: {
5472
+ perImage: 0.02
5473
+ },
5474
+ supportedSizes: [...IMAGEN4_ASPECT_RATIOS],
5475
+ maxImages: 4,
5476
+ defaultSize: "1:1",
5477
+ features: {
5478
+ textRendering: true
5479
+ }
5480
+ },
5481
+ {
5482
+ provider: "gemini",
5483
+ modelId: "imagen-4.0-generate-001",
5484
+ displayName: "Imagen 4",
5485
+ pricing: {
5486
+ perImage: 0.04
5487
+ },
5488
+ supportedSizes: [...IMAGEN4_ASPECT_RATIOS],
5489
+ maxImages: 4,
5490
+ defaultSize: "1:1",
5491
+ features: {
5492
+ textRendering: true
5493
+ }
5494
+ },
5495
+ {
5496
+ provider: "gemini",
5497
+ modelId: "imagen-4.0-ultra-generate-001",
5498
+ displayName: "Imagen 4 Ultra",
5499
+ pricing: {
5500
+ perImage: 0.06
5501
+ },
5502
+ supportedSizes: [...IMAGEN4_ASPECT_RATIOS],
5503
+ maxImages: 4,
5504
+ defaultSize: "1:1",
5505
+ features: {
5506
+ textRendering: true
5507
+ }
5508
+ },
5509
+ // Preview versions
5510
+ {
5511
+ provider: "gemini",
5512
+ modelId: "imagen-4.0-generate-preview-06-06",
5513
+ displayName: "Imagen 4 (Preview)",
5514
+ pricing: {
5515
+ perImage: 0.04
5516
+ },
5517
+ supportedSizes: [...IMAGEN4_ASPECT_RATIOS],
5518
+ maxImages: 4,
5519
+ defaultSize: "1:1",
5520
+ features: {
5521
+ textRendering: true
5522
+ }
5523
+ },
5524
+ {
5525
+ provider: "gemini",
5526
+ modelId: "imagen-4.0-ultra-generate-preview-06-06",
5527
+ displayName: "Imagen 4 Ultra (Preview)",
5528
+ pricing: {
5529
+ perImage: 0.06
5530
+ },
5531
+ supportedSizes: [...IMAGEN4_ASPECT_RATIOS],
5532
+ maxImages: 4,
5533
+ defaultSize: "1:1",
5534
+ features: {
5535
+ textRendering: true
5536
+ }
5537
+ },
5538
+ // Gemini Native Image Generation (multimodal models)
5539
+ {
5540
+ provider: "gemini",
5541
+ modelId: "gemini-2.5-flash-image",
5542
+ displayName: "Gemini 2.5 Flash Image",
5543
+ pricing: {
5544
+ perImage: 0.039
5545
+ },
5546
+ supportedSizes: [...GEMINI_IMAGE_ASPECT_RATIOS],
5547
+ maxImages: 1,
5548
+ defaultSize: "1:1",
5549
+ features: {
5550
+ conversational: true,
5551
+ textRendering: true
5552
+ }
5553
+ },
5554
+ {
5555
+ provider: "gemini",
5556
+ modelId: "gemini-2.5-flash-image-preview",
5557
+ displayName: "Gemini 2.5 Flash Image (Preview)",
5558
+ pricing: {
5559
+ perImage: 0.039
5560
+ },
5561
+ supportedSizes: [...GEMINI_IMAGE_ASPECT_RATIOS],
5562
+ maxImages: 1,
5563
+ defaultSize: "1:1",
5564
+ features: {
5565
+ conversational: true,
5566
+ textRendering: true
5567
+ }
5568
+ },
5569
+ {
5570
+ provider: "gemini",
5571
+ modelId: "gemini-3-pro-image-preview",
5572
+ displayName: "Gemini 3 Pro Image (Preview)",
5573
+ pricing: {
5574
+ // Token-based: ~$0.134 per 1K/2K image, $0.24 per 4K
5575
+ // Using 2K as default
5576
+ bySize: {
5577
+ "1K": 0.134,
5578
+ "2K": 0.134,
5579
+ "4K": 0.24
5580
+ }
5581
+ },
5582
+ supportedSizes: ["1K", "2K", "4K"],
5583
+ maxImages: 1,
5584
+ defaultSize: "2K",
5585
+ features: {
5586
+ conversational: true,
5587
+ textRendering: true
5588
+ }
5589
+ },
5590
+ // Alias: nano-banana-pro-preview is gemini-3-pro-image-preview
5591
+ {
5592
+ provider: "gemini",
5593
+ modelId: "nano-banana-pro-preview",
5594
+ displayName: "Nano Banana Pro (Gemini 3 Pro Image)",
5595
+ pricing: {
5596
+ bySize: {
5597
+ "1K": 0.134,
5598
+ "2K": 0.134,
5599
+ "4K": 0.24
5600
+ }
5601
+ },
5602
+ supportedSizes: ["1K", "2K", "4K"],
5603
+ maxImages: 1,
5604
+ defaultSize: "2K",
5605
+ features: {
5606
+ conversational: true,
5607
+ textRendering: true
5608
+ }
5609
+ }
5610
+ ];
5611
+ }
5612
+ });
5613
+
4805
5614
  // src/providers/gemini-models.ts
4806
5615
  var GEMINI_MODELS;
4807
5616
  var init_gemini_models = __esm({
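calculateGeminiImageCost resolves either a flat per-image price or a size-keyed price from the table above and multiplies by the image count. A quick sketch of the expected numbers (model IDs and prices are from this diff; whether the function is exported from the package root is an assumption):

```typescript
// Assumption: reachable from the package root; internally it belongs to
// src/providers/gemini-image-models.ts.
import { calculateGeminiImageCost } from "llmist";

// Flat per-image pricing: three Imagen 4 Fast images at $0.02 each.
calculateGeminiImageCost("imagen-4.0-fast-generate-001", "1:1", 3); // 0.06

// Size-keyed pricing for Gemini 3 Pro Image: 4K is priced above 1K/2K.
calculateGeminiImageCost("gemini-3-pro-image-preview", "2K"); // 0.134
calculateGeminiImageCost("gemini-3-pro-image-preview", "4K"); // 0.24

// Unknown models (or sizes with no listed price) return undefined.
calculateGeminiImageCost("not-a-model", "1:1"); // undefined
```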
@@ -4975,7 +5784,171 @@ var init_gemini_models = __esm({
4975
5784
  }
4976
5785
  });
4977
5786
 
5787
+ // src/providers/gemini-speech-models.ts
5788
+ function getGeminiSpeechModelSpec(modelId) {
5789
+ return geminiSpeechModels.find((m) => m.modelId === modelId);
5790
+ }
5791
+ function isGeminiSpeechModel(modelId) {
5792
+ return geminiSpeechModels.some((m) => m.modelId === modelId);
5793
+ }
5794
+ function calculateGeminiSpeechCost(modelId, characterCount, estimatedMinutes) {
5795
+ const spec = getGeminiSpeechModelSpec(modelId);
5796
+ if (!spec) return void 0;
5797
+ if (spec.pricing.perMinute !== void 0) {
5798
+ if (estimatedMinutes !== void 0) {
5799
+ return estimatedMinutes * spec.pricing.perMinute;
5800
+ }
5801
+ const approxMinutes = characterCount / 750;
5802
+ return approxMinutes * spec.pricing.perMinute;
5803
+ }
5804
+ return void 0;
5805
+ }
5806
+ var GEMINI_TTS_VOICES, GEMINI_TTS_FORMATS, geminiSpeechModels;
5807
+ var init_gemini_speech_models = __esm({
5808
+ "src/providers/gemini-speech-models.ts"() {
5809
+ "use strict";
5810
+ GEMINI_TTS_VOICES = [
5811
+ "Zephyr",
5812
+ // Bright
5813
+ "Puck",
5814
+ // Upbeat
5815
+ "Charon",
5816
+ // Informative
5817
+ "Kore",
5818
+ // Firm
5819
+ "Fenrir",
5820
+ // Excitable
5821
+ "Leda",
5822
+ // Youthful
5823
+ "Orus",
5824
+ // Firm
5825
+ "Aoede",
5826
+ // Breezy
5827
+ "Callirrhoe",
5828
+ // Easy-going
5829
+ "Autonoe",
5830
+ // Bright
5831
+ "Enceladus",
5832
+ // Breathy
5833
+ "Iapetus",
5834
+ // Clear
5835
+ "Umbriel",
5836
+ // Easy-going
5837
+ "Algieba",
5838
+ // Smooth
5839
+ "Despina",
5840
+ // Smooth
5841
+ "Erinome",
5842
+ // Clear
5843
+ "Algenib",
5844
+ // Gravelly
5845
+ "Rasalgethi",
5846
+ // Informative
5847
+ "Laomedeia",
5848
+ // Upbeat
5849
+ "Achernar",
5850
+ // Soft
5851
+ "Alnilam",
5852
+ // Firm
5853
+ "Schedar",
5854
+ // Even
5855
+ "Gacrux",
5856
+ // Mature
5857
+ "Pulcherrima",
5858
+ // Forward
5859
+ "Achird",
5860
+ // Friendly
5861
+ "Zubenelgenubi",
5862
+ // Casual
5863
+ "Vindemiatrix",
5864
+ // Gentle
5865
+ "Sadachbia",
5866
+ // Lively
5867
+ "Sadaltager",
5868
+ // Knowledgeable
5869
+ "Sulafat"
5870
+ // Warm
5871
+ ];
5872
+ GEMINI_TTS_FORMATS = ["pcm", "wav"];
5873
+ geminiSpeechModels = [
5874
+ {
5875
+ provider: "gemini",
5876
+ modelId: "gemini-2.5-flash-preview-tts",
5877
+ displayName: "Gemini 2.5 Flash TTS (Preview)",
5878
+ pricing: {
5879
+ // $0.50 per 1M input tokens = $0.0000005 per token
5880
+ perInputToken: 5e-7,
5881
+ // $10.00 per 1M audio output tokens = $0.00001 per token
5882
+ perAudioOutputToken: 1e-5,
5883
+ // Rough estimate: ~$0.01 per minute of audio
5884
+ perMinute: 0.01
5885
+ },
5886
+ voices: [...GEMINI_TTS_VOICES],
5887
+ formats: GEMINI_TTS_FORMATS,
5888
+ maxInputLength: 8e3,
5889
+ // bytes (text + prompt combined)
5890
+ defaultVoice: "Zephyr",
5891
+ defaultFormat: "wav",
5892
+ features: {
5893
+ multiSpeaker: true,
5894
+ languages: 24,
5895
+ voiceInstructions: true
5896
+ }
5897
+ },
5898
+ {
5899
+ provider: "gemini",
5900
+ modelId: "gemini-2.5-pro-preview-tts",
5901
+ displayName: "Gemini 2.5 Pro TTS (Preview)",
5902
+ pricing: {
5903
+ // $1.00 per 1M input tokens = $0.000001 per token
5904
+ perInputToken: 1e-6,
5905
+ // $20.00 per 1M audio output tokens = $0.00002 per token
5906
+ perAudioOutputToken: 2e-5,
5907
+ // Rough estimate: ~$0.02 per minute of audio
5908
+ perMinute: 0.02
5909
+ },
5910
+ voices: [...GEMINI_TTS_VOICES],
5911
+ formats: GEMINI_TTS_FORMATS,
5912
+ maxInputLength: 8e3,
5913
+ // bytes
5914
+ defaultVoice: "Zephyr",
5915
+ defaultFormat: "wav",
5916
+ features: {
5917
+ multiSpeaker: true,
5918
+ languages: 24,
5919
+ voiceInstructions: true
5920
+ }
5921
+ }
5922
+ ];
5923
+ }
5924
+ });
5925
+
4978
5926
  // src/providers/gemini.ts
5927
+ function wrapPcmInWav(pcmData, sampleRate, bitsPerSample, numChannels) {
5928
+ const byteRate = sampleRate * numChannels * bitsPerSample / 8;
5929
+ const blockAlign = numChannels * bitsPerSample / 8;
5930
+ const dataSize = pcmData.length;
5931
+ const headerSize = 44;
5932
+ const fileSize = headerSize + dataSize - 8;
5933
+ const buffer = new ArrayBuffer(headerSize + dataSize);
5934
+ const view = new DataView(buffer);
5935
+ const uint8 = new Uint8Array(buffer);
5936
+ view.setUint32(0, 1380533830, false);
5937
+ view.setUint32(4, fileSize, true);
5938
+ view.setUint32(8, 1463899717, false);
5939
+ view.setUint32(12, 1718449184, false);
5940
+ view.setUint32(16, 16, true);
5941
+ view.setUint16(20, 1, true);
5942
+ view.setUint16(22, numChannels, true);
5943
+ view.setUint32(24, sampleRate, true);
5944
+ view.setUint32(28, byteRate, true);
5945
+ view.setUint16(32, blockAlign, true);
5946
+ view.setUint16(34, bitsPerSample, true);
5947
+ view.setUint32(36, 1684108385, false);
5948
+ view.setUint32(40, dataSize, true);
5949
+ uint8.set(pcmData, headerSize);
5950
+ return buffer;
5951
+ }
4979
5952
  function createGeminiProviderFromEnv() {
4980
5953
  return createProviderFromEnv("GEMINI_API_KEY", import_genai.GoogleGenAI, GeminiGenerativeProvider);
4981
5954
  }
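Gemini TTS responses carry raw 24 kHz, 16-bit, mono PCM, which wrapPcmInWav wraps in a 44-byte RIFF/WAVE header; speech cost falls back to a characters-per-minute estimate (roughly 750 characters per minute) when no duration is supplied. A sketch of the cost math (export path assumed):

```typescript
// Assumption: calculateGeminiSpeechCost is reachable from the package root.
import { calculateGeminiSpeechCost } from "llmist";

// No duration given: cost ~= (characters / 750) * perMinute.
// 1500 chars on gemini-2.5-flash-preview-tts -> 2 minutes * $0.01 = $0.02.
calculateGeminiSpeechCost("gemini-2.5-flash-preview-tts", 1500); // 0.02

// An explicit estimatedMinutes argument takes precedence over the estimate.
calculateGeminiSpeechCost("gemini-2.5-pro-preview-tts", 1500, 3); // 0.06

// The generated audio itself is returned WAV-wrapped, matching
// wrapPcmInWav(pcm, 24e3, 16, 1) above.
```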
@@ -4984,9 +5957,12 @@ var init_gemini = __esm({
  "src/providers/gemini.ts"() {
  "use strict";
  import_genai = require("@google/genai");
+ init_messages();
  init_base_provider();
  init_constants2();
+ init_gemini_image_models();
  init_gemini_models();
+ init_gemini_speech_models();
  init_utils();
  GEMINI_ROLE_MAP = {
  system: "user",
@@ -5001,6 +5977,139 @@ var init_gemini = __esm({
5001
5977
  getModelSpecs() {
5002
5978
  return GEMINI_MODELS;
5003
5979
  }
5980
+ // =========================================================================
5981
+ // Image Generation
5982
+ // =========================================================================
5983
+ getImageModelSpecs() {
5984
+ return geminiImageModels;
5985
+ }
5986
+ supportsImageGeneration(modelId) {
5987
+ return isGeminiImageModel(modelId);
5988
+ }
5989
+ async generateImage(options) {
5990
+ const client = this.client;
5991
+ const spec = getGeminiImageModelSpec(options.model);
5992
+ const isImagenModel = options.model.startsWith("imagen");
5993
+ const aspectRatio = options.size ?? spec?.defaultSize ?? "1:1";
5994
+ const n = options.n ?? 1;
5995
+ if (isImagenModel) {
5996
+ const response2 = await client.models.generateImages({
5997
+ model: options.model,
5998
+ prompt: options.prompt,
5999
+ config: {
6000
+ numberOfImages: n,
6001
+ aspectRatio,
6002
+ outputMimeType: options.responseFormat === "b64_json" ? "image/png" : "image/jpeg"
6003
+ }
6004
+ });
6005
+ const images2 = response2.generatedImages ?? [];
6006
+ const cost2 = calculateGeminiImageCost(options.model, aspectRatio, images2.length);
6007
+ return {
6008
+ // Gemini's imageBytes is already base64 encoded, so use it directly
6009
+ images: images2.map((img) => ({
6010
+ b64Json: img.image?.imageBytes ?? void 0
6011
+ })),
6012
+ model: options.model,
6013
+ usage: {
6014
+ imagesGenerated: images2.length,
6015
+ size: aspectRatio,
6016
+ quality: "standard"
6017
+ },
6018
+ cost: cost2
6019
+ };
6020
+ }
6021
+ const response = await client.models.generateContent({
6022
+ model: options.model,
6023
+ contents: [{ role: "user", parts: [{ text: options.prompt }] }],
6024
+ config: {
6025
+ responseModalities: [import_genai.Modality.IMAGE, import_genai.Modality.TEXT]
6026
+ }
6027
+ });
6028
+ const images = [];
6029
+ const candidate = response.candidates?.[0];
6030
+ if (candidate?.content?.parts) {
6031
+ for (const part of candidate.content.parts) {
6032
+ if ("inlineData" in part && part.inlineData) {
6033
+ images.push({
6034
+ b64Json: part.inlineData.data
6035
+ });
6036
+ }
6037
+ }
6038
+ }
6039
+ const cost = calculateGeminiImageCost(options.model, aspectRatio, images.length);
6040
+ return {
6041
+ images,
6042
+ model: options.model,
6043
+ usage: {
6044
+ imagesGenerated: images.length,
6045
+ size: aspectRatio,
6046
+ quality: "standard"
6047
+ },
6048
+ cost
6049
+ };
6050
+ }
6051
+ // =========================================================================
6052
+ // Speech Generation
6053
+ // =========================================================================
6054
+ getSpeechModelSpecs() {
6055
+ return geminiSpeechModels;
6056
+ }
6057
+ supportsSpeechGeneration(modelId) {
6058
+ return isGeminiSpeechModel(modelId);
6059
+ }
6060
+ async generateSpeech(options) {
6061
+ const client = this.client;
6062
+ const spec = getGeminiSpeechModelSpec(options.model);
6063
+ const voice = options.voice ?? spec?.defaultVoice ?? "Zephyr";
6064
+ const response = await client.models.generateContent({
6065
+ model: options.model,
6066
+ contents: [
6067
+ {
6068
+ role: "user",
6069
+ parts: [{ text: options.input }]
6070
+ }
6071
+ ],
6072
+ config: {
6073
+ responseModalities: [import_genai.Modality.AUDIO],
6074
+ speechConfig: {
6075
+ voiceConfig: {
6076
+ prebuiltVoiceConfig: {
6077
+ voiceName: voice
6078
+ }
6079
+ }
6080
+ }
6081
+ }
6082
+ });
6083
+ let pcmData;
6084
+ const candidate = response.candidates?.[0];
6085
+ if (candidate?.content?.parts) {
6086
+ for (const part of candidate.content.parts) {
6087
+ if ("inlineData" in part && part.inlineData?.data) {
6088
+ const base64 = part.inlineData.data;
6089
+ const binary = atob(base64);
6090
+ pcmData = new Uint8Array(binary.length);
6091
+ for (let i = 0; i < binary.length; i++) {
6092
+ pcmData[i] = binary.charCodeAt(i);
6093
+ }
6094
+ break;
6095
+ }
6096
+ }
6097
+ }
6098
+ if (!pcmData) {
6099
+ throw new Error("No audio data in Gemini TTS response");
6100
+ }
6101
+ const audioData = wrapPcmInWav(pcmData, 24e3, 16, 1);
6102
+ const cost = calculateGeminiSpeechCost(options.model, options.input.length);
6103
+ return {
6104
+ audio: audioData,
6105
+ model: options.model,
6106
+ usage: {
6107
+ characterCount: options.input.length
6108
+ },
6109
+ cost,
6110
+ format: spec?.defaultFormat ?? "wav"
6111
+ };
6112
+ }
5004
6113
  buildRequestPayload(options, descriptor, _spec, messages) {
5005
6114
  const contents = this.convertMessagesToContents(messages);
5006
6115
  const generationConfig = this.buildGenerationConfig(options);
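End to end, image and speech generation are exposed as image.generate and speech.generate on the client (mirroring the cost-reporting wrapper earlier in this diff). How the client is constructed is not shown here, so the sketch below declares it; option and result fields follow generateImage/generateSpeech above.

```typescript
import { writeFile } from "node:fs/promises";

// Assumption: a configured llmist client exposing the image/speech namespaces
// seen in the cost-reporting wrapper; its construction is outside this diff.
declare const client: {
  image: {
    generate(o: { model: string; prompt: string; size?: string; n?: number }): Promise<{
      images: { b64Json?: string }[];
      cost?: number;
    }>;
  };
  speech: {
    generate(o: { model: string; input: string; voice?: string }): Promise<{
      audio: ArrayBuffer;
      format: string;
      cost?: number;
    }>;
  };
};

const img = await client.image.generate({
  model: "imagen-4.0-generate-001", // Imagen models route through models.generateImages
  prompt: "A lighthouse at dusk, watercolor",
  size: "16:9",
  n: 2,
});
if (img.images[0]?.b64Json) {
  await writeFile("lighthouse.png", Buffer.from(img.images[0].b64Json, "base64"));
}

const tts = await client.speech.generate({
  model: "gemini-2.5-flash-preview-tts",
  input: "Hello from llmist!",
  voice: "Kore", // one of the prebuilt Gemini voices listed above
});
await writeFile(`hello.${tts.format}`, Buffer.from(tts.audio)); // WAV-wrapped PCM
```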
@@ -5018,7 +6127,7 @@ var init_gemini = __esm({
  };
  return {
  model: descriptor.name,
- contents: this.convertContentsForNewSDK(contents),
+ contents,
  config
  };
  }
@@ -5053,18 +6162,25 @@ var init_gemini = __esm({
5053
6162
  if (message.role === "system") {
5054
6163
  expandedMessages.push({
5055
6164
  role: "user",
5056
- content: message.content
6165
+ content: extractText(message.content)
5057
6166
  });
5058
6167
  expandedMessages.push({
5059
6168
  role: "assistant",
5060
6169
  content: "Understood."
5061
6170
  });
5062
6171
  } else {
5063
- expandedMessages.push(message);
6172
+ expandedMessages.push({
6173
+ role: message.role,
6174
+ content: message.content
6175
+ });
5064
6176
  }
5065
6177
  }
5066
6178
  return this.mergeConsecutiveMessages(expandedMessages);
5067
6179
  }
6180
+ /**
6181
+ * Merge consecutive messages with the same role (required by Gemini).
6182
+ * Handles multimodal content by converting to Gemini's part format.
6183
+ */
5068
6184
  mergeConsecutiveMessages(messages) {
5069
6185
  if (messages.length === 0) {
5070
6186
  return [];
@@ -5073,15 +6189,16 @@ var init_gemini = __esm({
5073
6189
  let currentGroup = null;
5074
6190
  for (const message of messages) {
5075
6191
  const geminiRole = GEMINI_ROLE_MAP[message.role];
6192
+ const geminiParts = this.convertToGeminiParts(message.content);
5076
6193
  if (currentGroup && currentGroup.role === geminiRole) {
5077
- currentGroup.parts.push({ text: message.content });
6194
+ currentGroup.parts.push(...geminiParts);
5078
6195
  } else {
5079
6196
  if (currentGroup) {
5080
6197
  result.push(currentGroup);
5081
6198
  }
5082
6199
  currentGroup = {
5083
6200
  role: geminiRole,
5084
- parts: [{ text: message.content }]
6201
+ parts: geminiParts
5085
6202
  };
5086
6203
  }
5087
6204
  }
@@ -5090,11 +6207,39 @@ var init_gemini = __esm({
5090
6207
  }
5091
6208
  return result;
5092
6209
  }
5093
- convertContentsForNewSDK(contents) {
5094
- return contents.map((content) => ({
5095
- role: content.role,
5096
- parts: content.parts.map((part) => ({ text: part.text }))
5097
- }));
6210
+ /**
6211
+ * Convert llmist content to Gemini's part format.
6212
+ * Handles text, images, and audio (Gemini supports all three).
6213
+ */
6214
+ convertToGeminiParts(content) {
6215
+ const parts = normalizeContent(content);
6216
+ return parts.map((part) => {
6217
+ if (part.type === "text") {
6218
+ return { text: part.text };
6219
+ }
6220
+ if (part.type === "image") {
6221
+ if (part.source.type === "url") {
6222
+ throw new Error(
6223
+ "Gemini does not support image URLs directly. Please provide base64-encoded image data."
6224
+ );
6225
+ }
6226
+ return {
6227
+ inlineData: {
6228
+ mimeType: part.source.mediaType,
6229
+ data: part.source.data
6230
+ }
6231
+ };
6232
+ }
6233
+ if (part.type === "audio") {
6234
+ return {
6235
+ inlineData: {
6236
+ mimeType: part.source.mediaType,
6237
+ data: part.source.data
6238
+ }
6239
+ };
6240
+ }
6241
+ throw new Error(`Unsupported content type: ${part.type}`);
6242
+ });
5098
6243
  }
5099
6244
  buildGenerationConfig(options) {
5100
6245
  const config = {};
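Provider-side content conversion is strict about unsupported inputs: Anthropic rejects audio and URL images, Gemini rejects URL images, and both expect base64 inline data. The sketch below only restates the mapping performed by convertImagePart and convertToGeminiParts above.

```typescript
// A base64 image part built with the helpers from input-content.ts...
const part = {
  type: "image",
  source: { type: "base64", mediaType: "image/png", data: "iVBORw0KGgo..." },
} as const;

// ...maps to Anthropic's block format (convertImagePart):
//   { type: "image", source: { type: "base64", media_type: "image/png", data: "iVBORw0KGgo..." } }
//
// ...and to Gemini's part format (convertToGeminiParts):
//   { inlineData: { mimeType: "image/png", data: "iVBORw0KGgo..." } }
//
// URL-sourced images throw on both providers; audio parts throw on Anthropic
// ("Use Google Gemini for audio processing") but become inlineData on Gemini.
void part;
```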
@@ -5115,9 +6260,9 @@ var init_gemini = __esm({
  async *wrapStream(iterable) {
  const stream2 = iterable;
  for await (const chunk of stream2) {
- const text = this.extractText(chunk);
- if (text) {
- yield { text, rawEvent: chunk };
+ const text3 = this.extractText(chunk);
+ if (text3) {
+ yield { text: text3, rawEvent: chunk };
  }
  const finishReason = this.extractFinishReason(chunk);
  const usage = this.extractUsage(chunk);
@@ -5178,7 +6323,7 @@ var init_gemini = __esm({
  try {
  const response = await client.models.countTokens({
  model: descriptor.name,
- contents: this.convertContentsForNewSDK(contents)
+ contents
  // Note: systemInstruction not used - it's not supported by countTokens()
  // and would cause a 2100% token counting error
  });
@@ -5188,14 +6333,140 @@ var init_gemini = __esm({
5188
6333
  `Token counting failed for ${descriptor.name}, using fallback estimation:`,
5189
6334
  error
5190
6335
  );
5191
- const totalChars = messages.reduce((sum, msg) => sum + (msg.content?.length ?? 0), 0);
5192
- return Math.ceil(totalChars / FALLBACK_CHARS_PER_TOKEN);
6336
+ let totalChars = 0;
6337
+ let mediaCount = 0;
6338
+ for (const msg of messages) {
6339
+ const parts = normalizeContent(msg.content);
6340
+ for (const part of parts) {
6341
+ if (part.type === "text") {
6342
+ totalChars += part.text.length;
6343
+ } else if (part.type === "image" || part.type === "audio") {
6344
+ mediaCount++;
6345
+ }
6346
+ }
6347
+ }
6348
+ return Math.ceil(totalChars / FALLBACK_CHARS_PER_TOKEN) + mediaCount * 258;
5193
6349
  }
5194
6350
  }
5195
6351
  };
5196
6352
  }
5197
6353
  });
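The Gemini conversion above now accepts full llmist content parts rather than plain strings. A minimal sketch of the mapping, using the content helpers exported by the package (the base64 placeholders are stand-ins):

```typescript
import { text, imageFromBase64, audioFromBase64 } from "llmist";

// Multimodal user content built with the exported helpers.
const content = [
  text("Describe this image and transcribe the audio:"),
  imageFromBase64("<base64-png>", "image/png"),
  audioFromBase64("<base64-mp3>", "audio/mp3"),
];

// convertToGeminiParts(content) yields Gemini-native parts:
// [
//   { text: "Describe this image and transcribe the audio:" },
//   { inlineData: { mimeType: "image/png", data: "<base64-png>" } },
//   { inlineData: { mimeType: "audio/mp3", data: "<base64-mp3>" } },
// ]
// URL images (imageFromUrl) are rejected with an error: only inline base64 data is sent.
```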
5198
6354
 
6355
+ // src/providers/openai-image-models.ts
6356
+ function getOpenAIImageModelSpec(modelId) {
6357
+ return openaiImageModels.find((m) => m.modelId === modelId);
6358
+ }
6359
+ function isOpenAIImageModel(modelId) {
6360
+ return openaiImageModels.some((m) => m.modelId === modelId);
6361
+ }
6362
+ function calculateOpenAIImageCost(modelId, size, quality = "standard", n = 1) {
6363
+ const spec = getOpenAIImageModelSpec(modelId);
6364
+ if (!spec) return void 0;
6365
+ const sizePrice = spec.pricing.bySize?.[size];
6366
+ if (sizePrice === void 0) return void 0;
6367
+ let pricePerImage;
6368
+ if (typeof sizePrice === "number") {
6369
+ pricePerImage = sizePrice;
6370
+ } else {
6371
+ pricePerImage = sizePrice[quality];
6372
+ if (pricePerImage === void 0) return void 0;
6373
+ }
6374
+ return pricePerImage * n;
6375
+ }
6376
+ var GPT_IMAGE_SIZES, GPT_IMAGE_QUALITIES, DALLE3_SIZES, DALLE3_QUALITIES, DALLE2_SIZES, openaiImageModels;
6377
+ var init_openai_image_models = __esm({
6378
+ "src/providers/openai-image-models.ts"() {
6379
+ "use strict";
6380
+ GPT_IMAGE_SIZES = ["1024x1024", "1024x1536", "1536x1024"];
6381
+ GPT_IMAGE_QUALITIES = ["low", "medium", "high"];
6382
+ DALLE3_SIZES = ["1024x1024", "1024x1792", "1792x1024"];
6383
+ DALLE3_QUALITIES = ["standard", "hd"];
6384
+ DALLE2_SIZES = ["256x256", "512x512", "1024x1024"];
6385
+ openaiImageModels = [
6386
+ // GPT Image 1 Family (flagship)
6387
+ {
6388
+ provider: "openai",
6389
+ modelId: "gpt-image-1",
6390
+ displayName: "GPT Image 1",
6391
+ pricing: {
6392
+ bySize: {
6393
+ "1024x1024": { low: 0.011, medium: 0.04, high: 0.17 },
6394
+ "1024x1536": { low: 0.016, medium: 0.06, high: 0.25 },
6395
+ "1536x1024": { low: 0.016, medium: 0.06, high: 0.25 }
6396
+ }
6397
+ },
6398
+ supportedSizes: [...GPT_IMAGE_SIZES],
6399
+ supportedQualities: [...GPT_IMAGE_QUALITIES],
6400
+ maxImages: 1,
6401
+ defaultSize: "1024x1024",
6402
+ defaultQuality: "medium",
6403
+ features: {
6404
+ textRendering: true,
6405
+ transparency: true
6406
+ }
6407
+ },
6408
+ {
6409
+ provider: "openai",
6410
+ modelId: "gpt-image-1-mini",
6411
+ displayName: "GPT Image 1 Mini",
6412
+ pricing: {
6413
+ bySize: {
6414
+ "1024x1024": { low: 5e-3, medium: 0.02, high: 0.052 },
6415
+ "1024x1536": { low: 75e-4, medium: 0.03, high: 0.078 },
6416
+ "1536x1024": { low: 75e-4, medium: 0.03, high: 0.078 }
6417
+ }
6418
+ },
6419
+ supportedSizes: [...GPT_IMAGE_SIZES],
6420
+ supportedQualities: [...GPT_IMAGE_QUALITIES],
6421
+ maxImages: 1,
6422
+ defaultSize: "1024x1024",
6423
+ defaultQuality: "medium",
6424
+ features: {
6425
+ textRendering: true,
6426
+ transparency: true
6427
+ }
6428
+ },
6429
+ // DALL-E Family
6430
+ {
6431
+ provider: "openai",
6432
+ modelId: "dall-e-3",
6433
+ displayName: "DALL-E 3",
6434
+ pricing: {
6435
+ bySize: {
6436
+ "1024x1024": { standard: 0.04, hd: 0.08 },
6437
+ "1024x1792": { standard: 0.08, hd: 0.12 },
6438
+ "1792x1024": { standard: 0.08, hd: 0.12 }
6439
+ }
6440
+ },
6441
+ supportedSizes: [...DALLE3_SIZES],
6442
+ supportedQualities: [...DALLE3_QUALITIES],
6443
+ maxImages: 1,
6444
+ // DALL-E 3 only supports n=1
6445
+ defaultSize: "1024x1024",
6446
+ defaultQuality: "standard",
6447
+ features: {
6448
+ textRendering: true
6449
+ }
6450
+ },
6451
+ {
6452
+ provider: "openai",
6453
+ modelId: "dall-e-2",
6454
+ displayName: "DALL-E 2 (Legacy)",
6455
+ pricing: {
6456
+ bySize: {
6457
+ "256x256": 0.016,
6458
+ "512x512": 0.018,
6459
+ "1024x1024": 0.02
6460
+ }
6461
+ },
6462
+ supportedSizes: [...DALLE2_SIZES],
6463
+ maxImages: 10,
6464
+ defaultSize: "1024x1024"
6465
+ }
6466
+ ];
6467
+ }
6468
+ });
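For reference, the pricing table above resolves to per-request costs as follows; `calculateOpenAIImageCost` is internal to the bundle, so these calls illustrate the lookup rather than public API:

```typescript
// Cost = bySize[size] (optionally by quality tier) * n images.
calculateOpenAIImageCost("gpt-image-1", "1024x1024", "high");      // 0.17
calculateOpenAIImageCost("gpt-image-1-mini", "1536x1024", "low");  // 0.0075
calculateOpenAIImageCost("dall-e-3", "1024x1792", "hd", 2);        // 0.12 * 2 = 0.24
calculateOpenAIImageCost("dall-e-2", "512x512");                   // 0.018 (flat price, no quality tiers)
calculateOpenAIImageCost("dall-e-3", "512x512");                   // undefined (size not in the table)
```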
6469
+
5199
6470
  // src/providers/openai-models.ts
5200
6471
  var OPENAI_MODELS;
5201
6472
  var init_openai_models = __esm({
@@ -5560,15 +6831,153 @@ var init_openai_models = __esm({
5560
6831
  }
5561
6832
  });
5562
6833
 
5563
- // src/providers/openai.ts
5564
- function sanitizeExtra(extra, allowTemperature) {
5565
- if (!extra) {
5566
- return void 0;
6834
+ // src/providers/openai-speech-models.ts
6835
+ function getOpenAISpeechModelSpec(modelId) {
6836
+ return openaiSpeechModels.find((m) => m.modelId === modelId);
6837
+ }
6838
+ function isOpenAISpeechModel(modelId) {
6839
+ return openaiSpeechModels.some((m) => m.modelId === modelId);
6840
+ }
6841
+ function calculateOpenAISpeechCost(modelId, characterCount, estimatedMinutes) {
6842
+ const spec = getOpenAISpeechModelSpec(modelId);
6843
+ if (!spec) return void 0;
6844
+ if (spec.pricing.perCharacter !== void 0) {
6845
+ return characterCount * spec.pricing.perCharacter;
5567
6846
  }
5568
- if (allowTemperature || !Object.hasOwn(extra, "temperature")) {
5569
- return extra;
6847
+ if (spec.pricing.perMinute !== void 0 && estimatedMinutes !== void 0) {
6848
+ return estimatedMinutes * spec.pricing.perMinute;
5570
6849
  }
5571
- return Object.fromEntries(Object.entries(extra).filter(([key]) => key !== "temperature"));
6850
+ if (spec.pricing.perMinute !== void 0) {
6851
+ const approxMinutes = characterCount / 750;
6852
+ return approxMinutes * spec.pricing.perMinute;
6853
+ }
6854
+ return void 0;
6855
+ }
6856
+ var OPENAI_TTS_VOICES, OPENAI_TTS_EXTENDED_VOICES, OPENAI_TTS_FORMATS, openaiSpeechModels;
6857
+ var init_openai_speech_models = __esm({
6858
+ "src/providers/openai-speech-models.ts"() {
6859
+ "use strict";
6860
+ OPENAI_TTS_VOICES = [
6861
+ "alloy",
6862
+ "echo",
6863
+ "fable",
6864
+ "onyx",
6865
+ "nova",
6866
+ "shimmer"
6867
+ ];
6868
+ OPENAI_TTS_EXTENDED_VOICES = [
6869
+ ...OPENAI_TTS_VOICES,
6870
+ "ash",
6871
+ "ballad",
6872
+ "coral",
6873
+ "sage",
6874
+ "verse"
6875
+ ];
6876
+ OPENAI_TTS_FORMATS = ["mp3", "opus", "aac", "flac", "wav", "pcm"];
6877
+ openaiSpeechModels = [
6878
+ // Standard TTS models (character-based pricing)
6879
+ {
6880
+ provider: "openai",
6881
+ modelId: "tts-1",
6882
+ displayName: "TTS-1",
6883
+ pricing: {
6884
+ // $15 per 1M characters = $0.000015 per character
6885
+ perCharacter: 15e-6
6886
+ },
6887
+ voices: [...OPENAI_TTS_VOICES],
6888
+ formats: OPENAI_TTS_FORMATS,
6889
+ maxInputLength: 4096,
6890
+ defaultVoice: "alloy",
6891
+ defaultFormat: "mp3",
6892
+ features: {
6893
+ voiceInstructions: false
6894
+ }
6895
+ },
6896
+ {
6897
+ provider: "openai",
6898
+ modelId: "tts-1-1106",
6899
+ displayName: "TTS-1 (Nov 2023)",
6900
+ pricing: {
6901
+ perCharacter: 15e-6
6902
+ },
6903
+ voices: [...OPENAI_TTS_VOICES],
6904
+ formats: OPENAI_TTS_FORMATS,
6905
+ maxInputLength: 4096,
6906
+ defaultVoice: "alloy",
6907
+ defaultFormat: "mp3",
6908
+ features: {
6909
+ voiceInstructions: false
6910
+ }
6911
+ },
6912
+ {
6913
+ provider: "openai",
6914
+ modelId: "tts-1-hd",
6915
+ displayName: "TTS-1 HD",
6916
+ pricing: {
6917
+ // $30 per 1M characters = $0.00003 per character
6918
+ perCharacter: 3e-5
6919
+ },
6920
+ voices: [...OPENAI_TTS_VOICES],
6921
+ formats: OPENAI_TTS_FORMATS,
6922
+ maxInputLength: 4096,
6923
+ defaultVoice: "alloy",
6924
+ defaultFormat: "mp3",
6925
+ features: {
6926
+ voiceInstructions: false
6927
+ }
6928
+ },
6929
+ {
6930
+ provider: "openai",
6931
+ modelId: "tts-1-hd-1106",
6932
+ displayName: "TTS-1 HD (Nov 2023)",
6933
+ pricing: {
6934
+ perCharacter: 3e-5
6935
+ },
6936
+ voices: [...OPENAI_TTS_VOICES],
6937
+ formats: OPENAI_TTS_FORMATS,
6938
+ maxInputLength: 4096,
6939
+ defaultVoice: "alloy",
6940
+ defaultFormat: "mp3",
6941
+ features: {
6942
+ voiceInstructions: false
6943
+ }
6944
+ },
6945
+ // Token-based TTS model with voice instructions support
6946
+ {
6947
+ provider: "openai",
6948
+ modelId: "gpt-4o-mini-tts",
6949
+ displayName: "GPT-4o Mini TTS",
6950
+ pricing: {
6951
+ // $0.60 per 1M input tokens = $0.0000006 per token
6952
+ perInputToken: 6e-7,
6953
+ // $12 per 1M audio output tokens = $0.000012 per token
6954
+ perAudioOutputToken: 12e-6,
6955
+ // ~$0.015 per minute of audio
6956
+ perMinute: 0.015
6957
+ },
6958
+ voices: [...OPENAI_TTS_EXTENDED_VOICES],
6959
+ formats: OPENAI_TTS_FORMATS,
6960
+ maxInputLength: 2e3,
6961
+ // tokens, not characters
6962
+ defaultVoice: "alloy",
6963
+ defaultFormat: "mp3",
6964
+ features: {
6965
+ voiceInstructions: true
6966
+ }
6967
+ }
6968
+ ];
6969
+ }
6970
+ });
6971
+
6972
+ // src/providers/openai.ts
6973
+ function sanitizeExtra(extra, allowTemperature) {
6974
+ if (!extra) {
6975
+ return void 0;
6976
+ }
6977
+ if (allowTemperature || !Object.hasOwn(extra, "temperature")) {
6978
+ return extra;
6979
+ }
6980
+ return Object.fromEntries(Object.entries(extra).filter(([key]) => key !== "temperature"));
5572
6981
  }
5573
6982
  function createOpenAIProviderFromEnv() {
5574
6983
  return createProviderFromEnv("OPENAI_API_KEY", import_openai.default, OpenAIChatProvider);
@@ -5579,9 +6988,12 @@ var init_openai = __esm({
5579
6988
  "use strict";
5580
6989
  import_openai = __toESM(require("openai"), 1);
5581
6990
  import_tiktoken = require("tiktoken");
6991
+ init_messages();
5582
6992
  init_base_provider();
5583
6993
  init_constants2();
6994
+ init_openai_image_models();
5584
6995
  init_openai_models();
6996
+ init_openai_speech_models();
5585
6997
  init_utils();
5586
6998
  ROLE_MAP = {
5587
6999
  system: "system",
@@ -5596,6 +7008,87 @@ var init_openai = __esm({
5596
7008
  getModelSpecs() {
5597
7009
  return OPENAI_MODELS;
5598
7010
  }
7011
+ // =========================================================================
7012
+ // Image Generation
7013
+ // =========================================================================
7014
+ getImageModelSpecs() {
7015
+ return openaiImageModels;
7016
+ }
7017
+ supportsImageGeneration(modelId) {
7018
+ return isOpenAIImageModel(modelId);
7019
+ }
7020
+ async generateImage(options) {
7021
+ const client = this.client;
7022
+ const spec = getOpenAIImageModelSpec(options.model);
7023
+ const size = options.size ?? spec?.defaultSize ?? "1024x1024";
7024
+ const quality = options.quality ?? spec?.defaultQuality ?? "standard";
7025
+ const n = options.n ?? 1;
7026
+ const isDallE2 = options.model === "dall-e-2";
7027
+ const isGptImage = options.model.startsWith("gpt-image");
7028
+ const requestParams = {
7029
+ model: options.model,
7030
+ prompt: options.prompt,
7031
+ size,
7032
+ n
7033
+ };
7034
+ if (!isDallE2 && !isGptImage) {
7035
+ requestParams.quality = quality;
7036
+ }
7037
+ if (isGptImage) {
7038
+ } else if (!isDallE2) {
7039
+ requestParams.response_format = options.responseFormat ?? "url";
7040
+ }
7041
+ const response = await client.images.generate(requestParams);
7042
+ const cost = calculateOpenAIImageCost(options.model, size, quality, n);
7043
+ const images = response.data ?? [];
7044
+ return {
7045
+ images: images.map((img) => ({
7046
+ url: img.url,
7047
+ b64Json: img.b64_json,
7048
+ revisedPrompt: img.revised_prompt
7049
+ })),
7050
+ model: options.model,
7051
+ usage: {
7052
+ imagesGenerated: images.length,
7053
+ size,
7054
+ quality
7055
+ },
7056
+ cost
7057
+ };
7058
+ }
7059
+ // =========================================================================
7060
+ // Speech Generation
7061
+ // =========================================================================
7062
+ getSpeechModelSpecs() {
7063
+ return openaiSpeechModels;
7064
+ }
7065
+ supportsSpeechGeneration(modelId) {
7066
+ return isOpenAISpeechModel(modelId);
7067
+ }
7068
+ async generateSpeech(options) {
7069
+ const client = this.client;
7070
+ const spec = getOpenAISpeechModelSpec(options.model);
7071
+ const format = options.responseFormat ?? spec?.defaultFormat ?? "mp3";
7072
+ const voice = options.voice ?? spec?.defaultVoice ?? "alloy";
7073
+ const response = await client.audio.speech.create({
7074
+ model: options.model,
7075
+ input: options.input,
7076
+ voice,
7077
+ response_format: format,
7078
+ speed: options.speed ?? 1
7079
+ });
7080
+ const audioBuffer = await response.arrayBuffer();
7081
+ const cost = calculateOpenAISpeechCost(options.model, options.input.length);
7082
+ return {
7083
+ audio: audioBuffer,
7084
+ model: options.model,
7085
+ usage: {
7086
+ characterCount: options.input.length
7087
+ },
7088
+ cost,
7089
+ format
7090
+ };
7091
+ }
5599
7092
  buildRequestPayload(options, descriptor, spec, messages) {
5600
7093
  const { maxTokens, temperature, topP, stopSequences, extra } = options;
5601
7094
  const supportsTemperature = spec?.metadata?.supportsTemperature !== false;
@@ -5603,11 +7096,7 @@ var init_openai = __esm({
5603
7096
  const sanitizedExtra = sanitizeExtra(extra, shouldIncludeTemperature);
5604
7097
  return {
5605
7098
  model: descriptor.name,
5606
- messages: messages.map((message) => ({
5607
- role: ROLE_MAP[message.role],
5608
- content: message.content,
5609
- name: message.name
5610
- })),
7099
+ messages: messages.map((message) => this.convertToOpenAIMessage(message)),
5611
7100
  // Only set max_completion_tokens if explicitly provided
5612
7101
  // Otherwise let the API use "as much as fits" in the context window
5613
7102
  ...maxTokens !== void 0 ? { max_completion_tokens: maxTokens } : {},
@@ -5619,6 +7108,77 @@ var init_openai = __esm({
5619
7108
  ...shouldIncludeTemperature ? { temperature } : {}
5620
7109
  };
5621
7110
  }
7111
+ /**
7112
+ * Convert an LLMMessage to OpenAI's ChatCompletionMessageParam.
7113
+ * Handles role-specific content type requirements:
7114
+ * - system/assistant: string content only
7115
+ * - user: string or multimodal array content
7116
+ */
7117
+ convertToOpenAIMessage(message) {
7118
+ const role = ROLE_MAP[message.role];
7119
+ if (role === "user") {
7120
+ const content = this.convertToOpenAIContent(message.content);
7121
+ return {
7122
+ role: "user",
7123
+ content,
7124
+ ...message.name ? { name: message.name } : {}
7125
+ };
7126
+ }
7127
+ const textContent = typeof message.content === "string" ? message.content : extractText(message.content);
7128
+ if (role === "system") {
7129
+ return {
7130
+ role: "system",
7131
+ content: textContent,
7132
+ ...message.name ? { name: message.name } : {}
7133
+ };
7134
+ }
7135
+ return {
7136
+ role: "assistant",
7137
+ content: textContent,
7138
+ ...message.name ? { name: message.name } : {}
7139
+ };
7140
+ }
7141
+ /**
7142
+ * Convert llmist content to OpenAI's content format.
7143
+ * Optimizes by returning string for text-only content, array for multimodal.
7144
+ */
7145
+ convertToOpenAIContent(content) {
7146
+ if (typeof content === "string") {
7147
+ return content;
7148
+ }
7149
+ return content.map((part) => {
7150
+ if (part.type === "text") {
7151
+ return { type: "text", text: part.text };
7152
+ }
7153
+ if (part.type === "image") {
7154
+ return this.convertImagePart(part);
7155
+ }
7156
+ if (part.type === "audio") {
7157
+ throw new Error(
7158
+ "OpenAI chat completions do not support audio input. Use Whisper for transcription or Gemini for audio understanding."
7159
+ );
7160
+ }
7161
+ throw new Error(`Unsupported content type: ${part.type}`);
7162
+ });
7163
+ }
7164
+ /**
7165
+ * Convert an image content part to OpenAI's image_url format.
7166
+ * Supports both URLs and base64 data URLs.
7167
+ */
7168
+ convertImagePart(part) {
7169
+ if (part.source.type === "url") {
7170
+ return {
7171
+ type: "image_url",
7172
+ image_url: { url: part.source.url }
7173
+ };
7174
+ }
7175
+ return {
7176
+ type: "image_url",
7177
+ image_url: {
7178
+ url: `data:${part.source.mediaType};base64,${part.source.data}`
7179
+ }
7180
+ };
7181
+ }
5622
7182
  async executeStreamRequest(payload, signal) {
5623
7183
  const client = this.client;
5624
7184
  const stream2 = await client.chat.completions.create(payload, signal ? { signal } : void 0);
@@ -5627,9 +7187,9 @@ var init_openai = __esm({
5627
7187
  async *wrapStream(iterable) {
5628
7188
  const stream2 = iterable;
5629
7189
  for await (const chunk of stream2) {
5630
- const text = chunk.choices.map((choice) => choice.delta?.content ?? "").join("");
5631
- if (text) {
5632
- yield { text, rawEvent: chunk };
7190
+ const text3 = chunk.choices.map((choice) => choice.delta?.content ?? "").join("");
7191
+ if (text3) {
7192
+ yield { text: text3, rawEvent: chunk };
5633
7193
  }
5634
7194
  const finishReason = chunk.choices.find((choice) => choice.finish_reason)?.finish_reason;
5635
7195
  const usage = chunk.usage ? {
@@ -5677,17 +7237,26 @@ var init_openai = __esm({
5677
7237
  }
5678
7238
  try {
5679
7239
  let tokenCount = 0;
7240
+ let imageCount = 0;
5680
7241
  for (const message of messages) {
5681
7242
  tokenCount += OPENAI_MESSAGE_OVERHEAD_TOKENS;
5682
7243
  const roleText = ROLE_MAP[message.role];
5683
7244
  tokenCount += encoding.encode(roleText).length;
5684
- tokenCount += encoding.encode(message.content ?? "").length;
7245
+ const textContent = extractText(message.content);
7246
+ tokenCount += encoding.encode(textContent).length;
7247
+ const parts = normalizeContent(message.content);
7248
+ for (const part of parts) {
7249
+ if (part.type === "image") {
7250
+ imageCount++;
7251
+ }
7252
+ }
5685
7253
  if (message.name) {
5686
7254
  tokenCount += encoding.encode(message.name).length;
5687
7255
  tokenCount += OPENAI_NAME_FIELD_OVERHEAD_TOKENS;
5688
7256
  }
5689
7257
  }
5690
7258
  tokenCount += OPENAI_REPLY_PRIMING_TOKENS;
7259
+ tokenCount += imageCount * 765;
5691
7260
  return tokenCount;
5692
7261
  } finally {
5693
7262
  encoding.free();
@@ -5697,8 +7266,19 @@ var init_openai = __esm({
5697
7266
  `Token counting failed for ${descriptor.name}, using fallback estimation:`,
5698
7267
  error
5699
7268
  );
5700
- const totalChars = messages.reduce((sum, msg) => sum + (msg.content?.length ?? 0), 0);
5701
- return Math.ceil(totalChars / FALLBACK_CHARS_PER_TOKEN);
7269
+ let totalChars = 0;
7270
+ let imageCount = 0;
7271
+ for (const msg of messages) {
7272
+ const parts = normalizeContent(msg.content);
7273
+ for (const part of parts) {
7274
+ if (part.type === "text") {
7275
+ totalChars += part.text.length;
7276
+ } else if (part.type === "image") {
7277
+ imageCount++;
7278
+ }
7279
+ }
7280
+ }
7281
+ return Math.ceil(totalChars / FALLBACK_CHARS_PER_TOKEN) + imageCount * 765;
5702
7282
  }
5703
7283
  }
5704
7284
  };
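The OpenAI provider changes above route multimodal user messages through the new converters and budget a flat 765 tokens per image when counting. A sketch of the wire format an image message ends up in (field names follow OpenAI's Chat Completions API):

```typescript
import { text, imageFromBase64 } from "llmist";

const message = {
  role: "user" as const,
  content: [
    text("What's in this picture?"),
    imageFromBase64("<base64-png>", "image/png"),
  ],
};

// convertToOpenAIMessage(message) produces:
// {
//   role: "user",
//   content: [
//     { type: "text", text: "What's in this picture?" },
//     { type: "image_url", image_url: { url: "data:image/png;base64,<base64-png>" } },
//   ],
// }
// Audio parts throw here; only the Gemini path accepts inline audio in chat.
// For countTokens(), text parts go through tiktoken and each image adds 765 tokens.
```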
@@ -5886,51 +7466,368 @@ var init_model_registry = __esm({
5886
7466
  * @param requestedTokens - Total tokens requested (input + output)
5887
7467
  * @returns true if valid, false if model not found or exceeds limits
5888
7468
  */
5889
- validateModelConfig(modelId, requestedTokens) {
5890
- const limits = this.getModelLimits(modelId);
5891
- if (!limits) return false;
5892
- return requestedTokens <= limits.contextWindow;
7469
+ validateModelConfig(modelId, requestedTokens) {
7470
+ const limits = this.getModelLimits(modelId);
7471
+ if (!limits) return false;
7472
+ return requestedTokens <= limits.contextWindow;
7473
+ }
7474
+ /**
7475
+ * Check if a model supports a specific feature
7476
+ * @param modelId - Full model identifier
7477
+ * @param feature - Feature to check ('streaming', 'functionCalling', 'vision', etc.)
7478
+ * @returns true if model supports feature, false otherwise
7479
+ */
7480
+ supportsFeature(modelId, feature) {
7481
+ const spec = this.getModelSpec(modelId);
7482
+ if (!spec) return false;
7483
+ return spec.features[feature] === true;
7484
+ }
7485
+ /**
7486
+ * Get all models that support a specific feature
7487
+ * @param feature - Feature to filter by
7488
+ * @param providerId - Optional provider ID to filter by
7489
+ * @returns Array of ModelSpec objects that support the feature
7490
+ */
7491
+ getModelsByFeature(feature, providerId) {
7492
+ const models = this.listModels(providerId);
7493
+ return models.filter((model) => model.features[feature] === true);
7494
+ }
7495
+ /**
7496
+ * Get the most cost-effective model for a given provider and token budget
7497
+ * @param inputTokens - Expected input tokens
7498
+ * @param outputTokens - Expected output tokens
7499
+ * @param providerId - Optional provider ID to filter by
7500
+ * @returns ModelSpec with lowest total cost, or undefined if no models found
7501
+ */
7502
+ getCheapestModel(inputTokens, outputTokens, providerId) {
7503
+ const models = this.listModels(providerId);
7504
+ if (models.length === 0) return void 0;
7505
+ let cheapest;
7506
+ for (const model of models) {
7507
+ const estimate = this.estimateCost(model.modelId, inputTokens, outputTokens);
7508
+ if (!estimate) continue;
7509
+ if (!cheapest || estimate.totalCost < cheapest.cost) {
7510
+ cheapest = { model, cost: estimate.totalCost };
7511
+ }
7512
+ }
7513
+ return cheapest?.model;
7514
+ }
7515
+ };
7516
+ }
7517
+ });
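Usage sketch for the new registry helpers; it assumes the `LLMist` client export and bare model IDs as used elsewhere in the examples:

```typescript
import { LLMist } from "llmist";

const llmist = new LLMist();

// Feature checks against the registered model specs.
llmist.modelRegistry.supportsFeature("gpt-4o", "vision");
llmist.modelRegistry.getModelsByFeature("functionCalling", "openai");

// Lowest-cost model for an expected 2000-in / 500-out token exchange.
const cheapest = llmist.modelRegistry.getCheapestModel(2000, 500);
console.log(cheapest?.modelId);
```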
7518
+
7519
+ // src/core/namespaces/image.ts
7520
+ var ImageNamespace;
7521
+ var init_image = __esm({
7522
+ "src/core/namespaces/image.ts"() {
7523
+ "use strict";
7524
+ ImageNamespace = class {
7525
+ constructor(adapters, defaultProvider) {
7526
+ this.adapters = adapters;
7527
+ this.defaultProvider = defaultProvider;
7528
+ }
7529
+ /**
7530
+ * Generate images from a text prompt.
7531
+ *
7532
+ * @param options - Image generation options
7533
+ * @returns Promise resolving to the generation result with images and cost
7534
+ * @throws Error if the provider doesn't support image generation
7535
+ */
7536
+ async generate(options) {
7537
+ const modelId = options.model;
7538
+ const adapter = this.findImageAdapter(modelId);
7539
+ if (!adapter || !adapter.generateImage) {
7540
+ throw new Error(
7541
+ `No provider supports image generation for model "${modelId}". Available image models: ${this.listModels().map((m) => m.modelId).join(", ")}`
7542
+ );
7543
+ }
7544
+ return adapter.generateImage(options);
7545
+ }
7546
+ /**
7547
+ * List all available image generation models.
7548
+ */
7549
+ listModels() {
7550
+ const models = [];
7551
+ for (const adapter of this.adapters) {
7552
+ if (adapter.getImageModelSpecs) {
7553
+ models.push(...adapter.getImageModelSpecs());
7554
+ }
7555
+ }
7556
+ return models;
7557
+ }
7558
+ /**
7559
+ * Check if a model is supported for image generation.
7560
+ */
7561
+ supportsModel(modelId) {
7562
+ return this.findImageAdapter(modelId) !== void 0;
7563
+ }
7564
+ findImageAdapter(modelId) {
7565
+ return this.adapters.find(
7566
+ (adapter) => adapter.supportsImageGeneration?.(modelId) ?? false
7567
+ );
7568
+ }
7569
+ };
7570
+ }
7571
+ });
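Usage sketch for the image namespace, assuming an `LLMist` client instance (`llmist`) with the OpenAI adapter configured; model, size, and quality values come from the specs registered above:

```typescript
const result = await llmist.image.generate({
  model: "gpt-image-1",
  prompt: "A watercolor fox in a misty forest",
  size: "1024x1024",
  quality: "high",
});

console.log(result.usage.imagesGenerated); // 1
console.log(result.cost);                  // 0.17 for this size/quality
console.log(llmist.image.listModels().map((m) => m.modelId));
console.log(llmist.image.supportsModel("dall-e-3"));
```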
7572
+
7573
+ // src/core/namespaces/speech.ts
7574
+ var SpeechNamespace;
7575
+ var init_speech = __esm({
7576
+ "src/core/namespaces/speech.ts"() {
7577
+ "use strict";
7578
+ SpeechNamespace = class {
7579
+ constructor(adapters, defaultProvider) {
7580
+ this.adapters = adapters;
7581
+ this.defaultProvider = defaultProvider;
7582
+ }
7583
+ /**
7584
+ * Generate speech audio from text.
7585
+ *
7586
+ * @param options - Speech generation options
7587
+ * @returns Promise resolving to the generation result with audio and cost
7588
+ * @throws Error if the provider doesn't support speech generation
7589
+ */
7590
+ async generate(options) {
7591
+ const modelId = options.model;
7592
+ const adapter = this.findSpeechAdapter(modelId);
7593
+ if (!adapter || !adapter.generateSpeech) {
7594
+ throw new Error(
7595
+ `No provider supports speech generation for model "${modelId}". Available speech models: ${this.listModels().map((m) => m.modelId).join(", ")}`
7596
+ );
7597
+ }
7598
+ return adapter.generateSpeech(options);
7599
+ }
7600
+ /**
7601
+ * List all available speech generation models.
7602
+ */
7603
+ listModels() {
7604
+ const models = [];
7605
+ for (const adapter of this.adapters) {
7606
+ if (adapter.getSpeechModelSpecs) {
7607
+ models.push(...adapter.getSpeechModelSpecs());
7608
+ }
7609
+ }
7610
+ return models;
7611
+ }
7612
+ /**
7613
+ * Check if a model is supported for speech generation.
7614
+ */
7615
+ supportsModel(modelId) {
7616
+ return this.findSpeechAdapter(modelId) !== void 0;
7617
+ }
7618
+ findSpeechAdapter(modelId) {
7619
+ return this.adapters.find(
7620
+ (adapter) => adapter.supportsSpeechGeneration?.(modelId) ?? false
7621
+ );
7622
+ }
7623
+ };
7624
+ }
7625
+ });
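Usage sketch for the speech namespace, under the same client assumption:

```typescript
import { writeFile } from "node:fs/promises";

const speech = await llmist.speech.generate({
  model: "tts-1",
  input: "Hello from llmist!",
  voice: "nova",
  responseFormat: "mp3",
});

// speech.audio is an ArrayBuffer; persist it with Node's fs promises API.
await writeFile("hello.mp3", Buffer.from(speech.audio));
console.log(speech.usage.characterCount, speech.format, speech.cost);
```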
7626
+
7627
+ // src/core/quick-methods.ts
7628
+ async function complete(client, prompt, options = {}) {
7629
+ const model = resolveModel(options.model ?? "gpt-5-nano");
7630
+ const builder = new LLMMessageBuilder();
7631
+ if (options.systemPrompt) {
7632
+ builder.addSystem(options.systemPrompt);
7633
+ }
7634
+ builder.addUser(prompt);
7635
+ let fullResponse = "";
7636
+ for await (const chunk of client.stream({
7637
+ model,
7638
+ messages: builder.build(),
7639
+ temperature: options.temperature,
7640
+ maxTokens: options.maxTokens
7641
+ })) {
7642
+ fullResponse += chunk.text;
7643
+ }
7644
+ return fullResponse.trim();
7645
+ }
7646
+ async function* stream(client, prompt, options = {}) {
7647
+ const model = resolveModel(options.model ?? "gpt-5-nano");
7648
+ const builder = new LLMMessageBuilder();
7649
+ if (options.systemPrompt) {
7650
+ builder.addSystem(options.systemPrompt);
7651
+ }
7652
+ builder.addUser(prompt);
7653
+ for await (const chunk of client.stream({
7654
+ model,
7655
+ messages: builder.build(),
7656
+ temperature: options.temperature,
7657
+ maxTokens: options.maxTokens
7658
+ })) {
7659
+ yield chunk.text;
7660
+ }
7661
+ }
7662
+ var init_quick_methods = __esm({
7663
+ "src/core/quick-methods.ts"() {
7664
+ "use strict";
7665
+ init_messages();
7666
+ init_model_shortcuts();
7667
+ }
7668
+ });
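The quick helpers are also exported directly; a usage sketch (the zero-argument `LLMist` constructor call is assumed):

```typescript
import { LLMist, complete, stream } from "llmist";

const client = new LLMist();

// One-shot completion; the model defaults to the "gpt-5-nano" shortcut.
const answer = await complete(client, "Summarize llmist in one sentence.", {
  systemPrompt: "Be concise.",
});

// Streaming variant yields raw text chunks.
for await (const chunk of stream(client, "Write a haiku about streaming.")) {
  process.stdout.write(chunk);
}
```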
7669
+
7670
+ // src/core/namespaces/text.ts
7671
+ var TextNamespace;
7672
+ var init_text = __esm({
7673
+ "src/core/namespaces/text.ts"() {
7674
+ "use strict";
7675
+ init_quick_methods();
7676
+ TextNamespace = class {
7677
+ constructor(client) {
7678
+ this.client = client;
7679
+ }
7680
+ /**
7681
+ * Generate a complete text response.
7682
+ *
7683
+ * @param prompt - User prompt
7684
+ * @param options - Optional configuration
7685
+ * @returns Complete text response
7686
+ */
7687
+ async complete(prompt, options) {
7688
+ return complete(this.client, prompt, options);
7689
+ }
7690
+ /**
7691
+ * Stream text chunks.
7692
+ *
7693
+ * @param prompt - User prompt
7694
+ * @param options - Optional configuration
7695
+ * @returns Async generator yielding text chunks
7696
+ */
7697
+ stream(prompt, options) {
7698
+ return stream(this.client, prompt, options);
7699
+ }
7700
+ };
7701
+ }
7702
+ });
7703
+
7704
+ // src/core/namespaces/vision.ts
7705
+ var VisionNamespace;
7706
+ var init_vision = __esm({
7707
+ "src/core/namespaces/vision.ts"() {
7708
+ "use strict";
7709
+ init_input_content();
7710
+ init_messages();
7711
+ VisionNamespace = class {
7712
+ constructor(client) {
7713
+ this.client = client;
7714
+ }
7715
+ /**
7716
+ * Build a message builder with the image content attached.
7717
+ * Handles URLs, data URLs, base64 strings, and binary buffers.
7718
+ */
7719
+ buildImageMessage(options) {
7720
+ const builder = new LLMMessageBuilder();
7721
+ if (options.systemPrompt) {
7722
+ builder.addSystem(options.systemPrompt);
7723
+ }
7724
+ if (typeof options.image === "string") {
7725
+ if (options.image.startsWith("http://") || options.image.startsWith("https://")) {
7726
+ builder.addUserWithImageUrl(options.prompt, options.image);
7727
+ } else if (isDataUrl(options.image)) {
7728
+ const parsed = parseDataUrl(options.image);
7729
+ if (!parsed) {
7730
+ throw new Error("Invalid data URL format");
7731
+ }
7732
+ builder.addUserWithImage(
7733
+ options.prompt,
7734
+ parsed.data,
7735
+ parsed.mimeType
7736
+ );
7737
+ } else {
7738
+ const buffer = Buffer.from(options.image, "base64");
7739
+ builder.addUserWithImage(options.prompt, buffer, options.mimeType);
7740
+ }
7741
+ } else {
7742
+ builder.addUserWithImage(options.prompt, options.image, options.mimeType);
7743
+ }
7744
+ return builder;
7745
+ }
7746
+ /**
7747
+ * Stream the response and collect text and usage information.
7748
+ */
7749
+ async streamAndCollect(options, builder) {
7750
+ let response = "";
7751
+ let finalUsage;
7752
+ for await (const chunk of this.client.stream({
7753
+ model: options.model,
7754
+ messages: builder.build(),
7755
+ maxTokens: options.maxTokens,
7756
+ temperature: options.temperature
7757
+ })) {
7758
+ response += chunk.text;
7759
+ if (chunk.usage) {
7760
+ finalUsage = {
7761
+ inputTokens: chunk.usage.inputTokens,
7762
+ outputTokens: chunk.usage.outputTokens,
7763
+ totalTokens: chunk.usage.totalTokens
7764
+ };
7765
+ }
7766
+ }
7767
+ return { text: response.trim(), usage: finalUsage };
7768
+ }
7769
+ /**
7770
+ * Analyze an image with a vision-capable model.
7771
+ * Returns the analysis as a string.
7772
+ *
7773
+ * @param options - Vision analysis options
7774
+ * @returns Promise resolving to the analysis text
7775
+ * @throws Error if the image format is unsupported or model doesn't support vision
7776
+ *
7777
+ * @example
7778
+ * ```typescript
7779
+ * // From file
7780
+ * const result = await llmist.vision.analyze({
7781
+ * model: "gpt-4o",
7782
+ * image: await fs.readFile("photo.jpg"),
7783
+ * prompt: "What's in this image?",
7784
+ * });
7785
+ *
7786
+ * // From URL (OpenAI only)
7787
+ * const result = await llmist.vision.analyze({
7788
+ * model: "gpt-4o",
7789
+ * image: "https://example.com/image.jpg",
7790
+ * prompt: "Describe this image",
7791
+ * });
7792
+ * ```
7793
+ */
7794
+ async analyze(options) {
7795
+ const builder = this.buildImageMessage(options);
7796
+ const { text: text3 } = await this.streamAndCollect(options, builder);
7797
+ return text3;
5893
7798
  }
5894
7799
  /**
5895
- * Check if a model supports a specific feature
5896
- * @param modelId - Full model identifier
5897
- * @param feature - Feature to check ('streaming', 'functionCalling', 'vision', etc.)
5898
- * @returns true if model supports feature, false otherwise
7800
+ * Analyze an image and return detailed result with usage info.
7801
+ *
7802
+ * @param options - Vision analysis options
7803
+ * @returns Promise resolving to the analysis result with usage info
5899
7804
  */
5900
- supportsFeature(modelId, feature) {
5901
- const spec = this.getModelSpec(modelId);
5902
- if (!spec) return false;
5903
- return spec.features[feature] === true;
7805
+ async analyzeWithUsage(options) {
7806
+ const builder = this.buildImageMessage(options);
7807
+ const { text: text3, usage } = await this.streamAndCollect(options, builder);
7808
+ return {
7809
+ text: text3,
7810
+ model: options.model,
7811
+ usage
7812
+ };
5904
7813
  }
5905
7814
  /**
5906
- * Get all models that support a specific feature
5907
- * @param feature - Feature to filter by
5908
- * @param providerId - Optional provider ID to filter by
5909
- * @returns Array of ModelSpec objects that support the feature
7815
+ * Check if a model supports vision/image input.
7816
+ *
7817
+ * @param modelId - Model ID to check
7818
+ * @returns True if the model supports vision
5910
7819
  */
5911
- getModelsByFeature(feature, providerId) {
5912
- const models = this.listModels(providerId);
5913
- return models.filter((model) => model.features[feature] === true);
7820
+ supportsModel(modelId) {
7821
+ const spec = this.client.modelRegistry.getModelSpec(modelId);
7822
+ return spec?.features?.vision === true;
5914
7823
  }
5915
7824
  /**
5916
- * Get the most cost-effective model for a given provider and token budget
5917
- * @param inputTokens - Expected input tokens
5918
- * @param outputTokens - Expected output tokens
5919
- * @param providerId - Optional provider ID to filter by
5920
- * @returns ModelSpec with lowest total cost, or undefined if no models found
7825
+ * List all models that support vision.
7826
+ *
7827
+ * @returns Array of model IDs that support vision
5921
7828
  */
5922
- getCheapestModel(inputTokens, outputTokens, providerId) {
5923
- const models = this.listModels(providerId);
5924
- if (models.length === 0) return void 0;
5925
- let cheapest;
5926
- for (const model of models) {
5927
- const estimate = this.estimateCost(model.modelId, inputTokens, outputTokens);
5928
- if (!estimate) continue;
5929
- if (!cheapest || estimate.totalCost < cheapest.cost) {
5930
- cheapest = { model, cost: estimate.totalCost };
5931
- }
5932
- }
5933
- return cheapest?.model;
7829
+ listModels() {
7830
+ return this.client.modelRegistry.listModels().filter((spec) => spec.features?.vision === true).map((spec) => spec.modelId);
5934
7831
  }
5935
7832
  };
5936
7833
  }
@@ -5965,49 +7862,6 @@ var init_options = __esm({
5965
7862
  }
5966
7863
  });
5967
7864
 
5968
- // src/core/quick-methods.ts
5969
- async function complete(client, prompt, options = {}) {
5970
- const model = resolveModel(options.model ?? "gpt-5-nano");
5971
- const builder = new LLMMessageBuilder();
5972
- if (options.systemPrompt) {
5973
- builder.addSystem(options.systemPrompt);
5974
- }
5975
- builder.addUser(prompt);
5976
- let fullResponse = "";
5977
- for await (const chunk of client.stream({
5978
- model,
5979
- messages: builder.build(),
5980
- temperature: options.temperature,
5981
- maxTokens: options.maxTokens
5982
- })) {
5983
- fullResponse += chunk.text;
5984
- }
5985
- return fullResponse.trim();
5986
- }
5987
- async function* stream(client, prompt, options = {}) {
5988
- const model = resolveModel(options.model ?? "gpt-5-nano");
5989
- const builder = new LLMMessageBuilder();
5990
- if (options.systemPrompt) {
5991
- builder.addSystem(options.systemPrompt);
5992
- }
5993
- builder.addUser(prompt);
5994
- for await (const chunk of client.stream({
5995
- model,
5996
- messages: builder.build(),
5997
- temperature: options.temperature,
5998
- maxTokens: options.maxTokens
5999
- })) {
6000
- yield chunk.text;
6001
- }
6002
- }
6003
- var init_quick_methods = __esm({
6004
- "src/core/quick-methods.ts"() {
6005
- "use strict";
6006
- init_messages();
6007
- init_model_shortcuts();
6008
- }
6009
- });
6010
-
6011
7865
  // src/core/client.ts
6012
7866
  var client_exports = {};
6013
7867
  __export(client_exports, {
@@ -6020,12 +7874,22 @@ var init_client = __esm({
6020
7874
  init_builder();
6021
7875
  init_discovery();
6022
7876
  init_model_registry();
7877
+ init_image();
7878
+ init_speech();
7879
+ init_text();
7880
+ init_vision();
6023
7881
  init_options();
6024
7882
  init_quick_methods();
6025
7883
  LLMist = class _LLMist {
6026
7884
  parser;
7885
+ defaultProvider;
6027
7886
  modelRegistry;
6028
7887
  adapters;
7888
+ // Namespaces for different generation types
7889
+ text;
7890
+ image;
7891
+ speech;
7892
+ vision;
6029
7893
  constructor(...args) {
6030
7894
  let adapters = [];
6031
7895
  let defaultProvider;
@@ -6064,6 +7928,7 @@ var init_client = __esm({
6064
7928
  const priorityB = b.priority ?? 0;
6065
7929
  return priorityB - priorityA;
6066
7930
  });
7931
+ this.defaultProvider = resolvedDefaultProvider;
6067
7932
  this.parser = new ModelIdentifierParser(resolvedDefaultProvider);
6068
7933
  this.modelRegistry = new ModelRegistry();
6069
7934
  for (const adapter of this.adapters) {
@@ -6072,6 +7937,10 @@ var init_client = __esm({
6072
7937
  if (customModels.length > 0) {
6073
7938
  this.modelRegistry.registerModels(customModels);
6074
7939
  }
7940
+ this.text = new TextNamespace(this);
7941
+ this.image = new ImageNamespace(this.adapters, this.defaultProvider);
7942
+ this.speech = new SpeechNamespace(this.adapters, this.defaultProvider);
7943
+ this.vision = new VisionNamespace(this);
6075
7944
  }
6076
7945
  stream(options) {
6077
7946
  const descriptor = this.parser.parse(options.model);
@@ -6256,6 +8125,7 @@ var init_builder = __esm({
6256
8125
  "src/agent/builder.ts"() {
6257
8126
  "use strict";
6258
8127
  init_constants();
8128
+ init_input_content();
6259
8129
  init_model_shortcuts();
6260
8130
  init_registry();
6261
8131
  init_agent();
@@ -6903,13 +8773,17 @@ ${endPrefix}`
6903
8773
  * }
6904
8774
  * ```
6905
8775
  */
6906
- ask(userPrompt) {
8776
+ /**
8777
+ * Build AgentOptions with the given user prompt.
8778
+ * Centralizes options construction for ask(), askWithImage(), and askWithContent().
8779
+ */
8780
+ buildAgentOptions(userPrompt) {
6907
8781
  if (!this.client) {
6908
8782
  const { LLMist: LLMistClass } = (init_client(), __toCommonJS(client_exports));
6909
8783
  this.client = new LLMistClass();
6910
8784
  }
6911
8785
  const registry = GadgetRegistry.from(this.gadgets);
6912
- const options = {
8786
+ return {
6913
8787
  client: this.client,
6914
8788
  model: this.model ?? "openai:gpt-5-nano",
6915
8789
  systemPrompt: this.systemPrompt,
@@ -6935,6 +8809,83 @@ ${endPrefix}`
6935
8809
  compactionConfig: this.compactionConfig,
6936
8810
  signal: this.signal
6937
8811
  };
8812
+ }
8813
+ ask(userPrompt) {
8814
+ const options = this.buildAgentOptions(userPrompt);
8815
+ return new Agent(AGENT_INTERNAL_KEY, options);
8816
+ }
8817
+ /**
8818
+ * Build and create the agent with a multimodal user prompt (text + image).
8819
+ * Returns the Agent instance ready to run.
8820
+ *
8821
+ * @param textPrompt - Text prompt describing what to do with the image
8822
+ * @param imageData - Image data (Buffer, Uint8Array, or base64 string)
8823
+ * @param mimeType - Optional MIME type (auto-detected if not provided)
8824
+ * @returns Configured Agent instance
8825
+ *
8826
+ * @example
8827
+ * ```typescript
8828
+ * const agent = LLMist.createAgent()
8829
+ * .withModel("gpt-4o")
8830
+ * .withSystem("You analyze images")
8831
+ * .askWithImage(
8832
+ * "What's in this image?",
8833
+ * await fs.readFile("photo.jpg")
8834
+ * );
8835
+ *
8836
+ * for await (const event of agent.run()) {
8837
+ * // handle events
8838
+ * }
8839
+ * ```
8840
+ */
8841
+ askWithImage(textPrompt, imageData, mimeType) {
8842
+ const imageBuffer = typeof imageData === "string" ? Buffer.from(imageData, "base64") : imageData;
8843
+ const detectedMime = mimeType ?? detectImageMimeType(imageBuffer);
8844
+ if (!detectedMime) {
8845
+ throw new Error(
8846
+ "Could not detect image MIME type. Please provide the mimeType parameter explicitly."
8847
+ );
8848
+ }
8849
+ const userContent = [
8850
+ text(textPrompt),
8851
+ {
8852
+ type: "image",
8853
+ source: {
8854
+ type: "base64",
8855
+ mediaType: detectedMime,
8856
+ data: toBase64(imageBuffer)
8857
+ }
8858
+ }
8859
+ ];
8860
+ const options = this.buildAgentOptions(userContent);
8861
+ return new Agent(AGENT_INTERNAL_KEY, options);
8862
+ }
8863
+ /**
8864
+ * Build and return an Agent configured with multimodal content.
8865
+ * More flexible than askWithImage - accepts any combination of content parts.
8866
+ *
8867
+ * @param content - Array of content parts (text, images, audio)
8868
+ * @returns A configured Agent ready for execution
8869
+ *
8870
+ * @example
8871
+ * ```typescript
8872
+ * import { text, imageFromBuffer, audioFromBuffer } from "llmist";
8873
+ *
8874
+ * const agent = LLMist.createAgent()
8875
+ * .withModel("gemini:gemini-2.5-flash")
8876
+ * .askWithContent([
8877
+ * text("Describe this image and transcribe the audio:"),
8878
+ * imageFromBuffer(imageData),
8879
+ * audioFromBuffer(audioData),
8880
+ * ]);
8881
+ *
8882
+ * for await (const event of agent.run()) {
8883
+ * // handle events
8884
+ * }
8885
+ * ```
8886
+ */
8887
+ askWithContent(content) {
8888
+ const options = this.buildAgentOptions(content);
6938
8889
  return new Agent(AGENT_INTERNAL_KEY, options);
6939
8890
  }
6940
8891
  /**
@@ -7080,6 +9031,8 @@ __export(index_exports, {
7080
9031
  StreamParser: () => StreamParser,
7081
9032
  StreamProcessor: () => StreamProcessor,
7082
9033
  SummarizationStrategy: () => SummarizationStrategy,
9034
+ audioFromBase64: () => audioFromBase64,
9035
+ audioFromBuffer: () => audioFromBuffer,
7083
9036
  collectEvents: () => collectEvents,
7084
9037
  collectText: () => collectText,
7085
9038
  complete: () => complete,
@@ -7095,20 +9048,34 @@ __export(index_exports, {
7095
9048
  createOpenAIProviderFromEnv: () => createOpenAIProviderFromEnv,
7096
9049
  createTextMockStream: () => createTextMockStream,
7097
9050
  defaultLogger: () => defaultLogger,
9051
+ detectAudioMimeType: () => detectAudioMimeType,
9052
+ detectImageMimeType: () => detectImageMimeType,
7098
9053
  discoverProviderAdapters: () => discoverProviderAdapters,
9054
+ extractText: () => extractText,
7099
9055
  getMockManager: () => getMockManager,
7100
9056
  getModelId: () => getModelId,
7101
9057
  getProvider: () => getProvider,
7102
9058
  hasProviderPrefix: () => hasProviderPrefix,
9059
+ imageFromBase64: () => imageFromBase64,
9060
+ imageFromBuffer: () => imageFromBuffer,
9061
+ imageFromUrl: () => imageFromUrl,
9062
+ isAudioPart: () => isAudioPart,
9063
+ isDataUrl: () => isDataUrl,
9064
+ isImagePart: () => isImagePart,
9065
+ isTextPart: () => isTextPart,
7103
9066
  iterationProgressHint: () => iterationProgressHint,
7104
9067
  mockLLM: () => mockLLM,
9068
+ normalizeContent: () => normalizeContent,
7105
9069
  parallelGadgetHint: () => parallelGadgetHint,
9070
+ parseDataUrl: () => parseDataUrl,
7106
9071
  resolveHintTemplate: () => resolveHintTemplate,
7107
9072
  resolveModel: () => resolveModel,
7108
9073
  resolvePromptTemplate: () => resolvePromptTemplate,
7109
9074
  resolveRulesTemplate: () => resolveRulesTemplate,
7110
9075
  runWithHandlers: () => runWithHandlers,
7111
9076
  stream: () => stream,
9077
+ text: () => text,
9078
+ toBase64: () => toBase64,
7112
9079
  validateAndApplyDefaults: () => validateAndApplyDefaults,
7113
9080
  validateGadgetParams: () => validateGadgetParams,
7114
9081
  z: () => import_zod2.z
@@ -8009,6 +9976,7 @@ function createHints(config) {
8009
9976
 
8010
9977
  // src/index.ts
8011
9978
  init_client();
9979
+ init_input_content();
8012
9980
  init_messages();
8013
9981
  init_model_registry();
8014
9982
  init_model_shortcuts();
@@ -8256,9 +10224,9 @@ function sleep(ms) {
8256
10224
  function generateInvocationId() {
8257
10225
  return `inv-${Date.now()}-${Math.random().toString(36).substring(2, 9)}`;
8258
10226
  }
8259
- function splitIntoChunks(text, minChunkSize = 5, maxChunkSize = 30) {
10227
+ function splitIntoChunks(text3, minChunkSize = 5, maxChunkSize = 30) {
8260
10228
  const chunks = [];
8261
- let remaining = text;
10229
+ let remaining = text3;
8262
10230
  while (remaining.length > 0) {
8263
10231
  const chunkSize = Math.min(
8264
10232
  Math.floor(Math.random() * (maxChunkSize - minChunkSize + 1)) + minChunkSize,
@@ -8317,17 +10285,17 @@ ${String(value)}
8317
10285
  return result;
8318
10286
  }
8319
10287
  function formatGadgetCalls(gadgetCalls) {
8320
- let text = "";
10288
+ let text3 = "";
8321
10289
  const calls = [];
8322
10290
  for (const call of gadgetCalls) {
8323
10291
  const invocationId = call.invocationId ?? generateInvocationId();
8324
10292
  calls.push({ name: call.gadgetName, invocationId });
8325
10293
  const blockParams = serializeToBlockFormat(call.parameters);
8326
- text += `
10294
+ text3 += `
8327
10295
  ${GADGET_START_PREFIX}${call.gadgetName}
8328
10296
  ${blockParams}${GADGET_END_PREFIX}`;
8329
10297
  }
8330
- return { text, calls };
10298
+ return { text: text3, calls };
8331
10299
  }
8332
10300
  async function* createMockStream(response) {
8333
10301
  if (response.delayMs) {
@@ -8367,9 +10335,9 @@ async function* createMockStream(response) {
8367
10335
  };
8368
10336
  }
8369
10337
  }
8370
- function createTextMockStream(text, options) {
10338
+ function createTextMockStream(text3, options) {
8371
10339
  return createMockStream({
8372
- text,
10340
+ text: text3,
8373
10341
  delayMs: options?.delayMs,
8374
10342
  streamDelayMs: options?.streamDelayMs,
8375
10343
  usage: options?.usage,
@@ -8386,10 +10354,10 @@ var MockProviderAdapter = class {
8386
10354
  constructor(options) {
8387
10355
  this.mockManager = getMockManager(options);
8388
10356
  }
8389
- supports(descriptor) {
10357
+ supports(_descriptor) {
8390
10358
  return true;
8391
10359
  }
8392
- stream(options, descriptor, spec) {
10360
+ stream(options, descriptor, _spec) {
8393
10361
  const context = {
8394
10362
  model: options.model,
8395
10363
  provider: descriptor.provider,
@@ -8400,20 +10368,154 @@ var MockProviderAdapter = class {
8400
10368
  return this.createMockStreamFromContext(context);
8401
10369
  }
8402
10370
  async *createMockStreamFromContext(context) {
8403
- try {
8404
- const mockResponse = await this.mockManager.findMatch(context);
8405
- if (!mockResponse) {
8406
- yield {
8407
- text: "",
8408
- finishReason: "stop",
8409
- usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
8410
- };
8411
- return;
8412
- }
8413
- yield* createMockStream(mockResponse);
8414
- } catch (error) {
8415
- throw error;
10371
+ const mockResponse = await this.mockManager.findMatch(context);
10372
+ if (!mockResponse) {
10373
+ yield {
10374
+ text: "",
10375
+ finishReason: "stop",
10376
+ usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
10377
+ };
10378
+ return;
10379
+ }
10380
+ yield* createMockStream(mockResponse);
10381
+ }
10382
+ // ==========================================================================
10383
+ // Image Generation Support
10384
+ // ==========================================================================
10385
+ /**
10386
+ * Check if this adapter supports image generation for a given model.
10387
+ * Returns true if there's a registered mock with images for this model.
10388
+ */
10389
+ supportsImageGeneration(_modelId) {
10390
+ return true;
10391
+ }
10392
+ /**
10393
+ * Generate mock images based on registered mocks.
10394
+ *
10395
+ * @param options - Image generation options
10396
+ * @returns Mock image generation result
10397
+ */
10398
+ async generateImage(options) {
10399
+ const context = {
10400
+ model: options.model,
10401
+ provider: "mock",
10402
+ modelName: options.model,
10403
+ options: {
10404
+ model: options.model,
10405
+ messages: [{ role: "user", content: options.prompt }]
10406
+ },
10407
+ messages: [{ role: "user", content: options.prompt }]
10408
+ };
10409
+ const mockResponse = await this.mockManager.findMatch(context);
10410
+ if (!mockResponse?.images || mockResponse.images.length === 0) {
10411
+ throw new Error(
10412
+ `No mock registered for image generation with model "${options.model}". Use mockLLM().forModel("${options.model}").returnsImage(...).register() to add one.`
10413
+ );
10414
+ }
10415
+ return this.createImageResult(options, mockResponse);
10416
+ }
10417
+ /**
10418
+ * Transform mock response into ImageGenerationResult format.
10419
+ *
10420
+ * @param options - Original image generation options
10421
+ * @param mockResponse - Mock response containing image data
10422
+ * @returns ImageGenerationResult with mock data and zero cost
10423
+ */
10424
+ createImageResult(options, mockResponse) {
10425
+ const images = mockResponse.images ?? [];
10426
+ return {
10427
+ images: images.map((img) => ({
10428
+ b64Json: img.data,
10429
+ revisedPrompt: img.revisedPrompt
10430
+ })),
10431
+ model: options.model,
10432
+ usage: {
10433
+ imagesGenerated: images.length,
10434
+ size: options.size ?? "1024x1024",
10435
+ quality: options.quality ?? "standard"
10436
+ },
10437
+ cost: 0
10438
+ // Mock cost is always 0
10439
+ };
10440
+ }
10441
+ // ==========================================================================
10442
+ // Speech Generation Support
10443
+ // ==========================================================================
10444
+ /**
10445
+ * Check if this adapter supports speech generation for a given model.
10446
+ * Returns true if there's a registered mock with audio for this model.
10447
+ */
10448
+ supportsSpeechGeneration(_modelId) {
10449
+ return true;
10450
+ }
10451
+ /**
10452
+ * Generate mock speech based on registered mocks.
10453
+ *
10454
+ * @param options - Speech generation options
10455
+ * @returns Mock speech generation result
10456
+ */
10457
+ async generateSpeech(options) {
10458
+ const context = {
10459
+ model: options.model,
10460
+ provider: "mock",
10461
+ modelName: options.model,
10462
+ options: {
10463
+ model: options.model,
10464
+ messages: [{ role: "user", content: options.input }]
10465
+ },
10466
+ messages: [{ role: "user", content: options.input }]
10467
+ };
10468
+ const mockResponse = await this.mockManager.findMatch(context);
10469
+ if (!mockResponse?.audio) {
10470
+ throw new Error(
10471
+ `No mock registered for speech generation with model "${options.model}". Use mockLLM().forModel("${options.model}").returnsAudio(...).register() to add one.`
10472
+ );
10473
+ }
10474
+ return this.createSpeechResult(options, mockResponse);
10475
+ }
10476
+ /**
10477
+ * Transform mock response into SpeechGenerationResult format.
10478
+ * Converts base64 audio data to ArrayBuffer.
10479
+ *
10480
+ * @param options - Original speech generation options
10481
+ * @param mockResponse - Mock response containing audio data
10482
+ * @returns SpeechGenerationResult with mock data and zero cost
10483
+ */
10484
+ createSpeechResult(options, mockResponse) {
10485
+ const audio = mockResponse.audio;
10486
+ const binaryString = atob(audio.data);
10487
+ const bytes = new Uint8Array(binaryString.length);
10488
+ for (let i = 0; i < binaryString.length; i++) {
10489
+ bytes[i] = binaryString.charCodeAt(i);
8416
10490
  }
10491
+ const format = this.mimeTypeToAudioFormat(audio.mimeType);
10492
+ return {
10493
+ audio: bytes.buffer,
10494
+ model: options.model,
10495
+ usage: {
10496
+ characterCount: options.input.length
10497
+ },
10498
+ cost: 0,
10499
+ // Mock cost is always 0
10500
+ format
10501
+ };
10502
+ }
10503
+ /**
10504
+ * Map MIME type to audio format for SpeechGenerationResult.
10505
+ * Defaults to "mp3" for unknown MIME types.
10506
+ *
10507
+ * @param mimeType - Audio MIME type string
10508
+ * @returns Audio format identifier
10509
+ */
10510
+ mimeTypeToAudioFormat(mimeType) {
10511
+ const mapping = {
10512
+ "audio/mp3": "mp3",
10513
+ "audio/mpeg": "mp3",
10514
+ "audio/wav": "wav",
10515
+ "audio/webm": "opus",
10516
+ "audio/ogg": "opus"
10517
+ };
10518
+ return mapping[mimeType] ?? "mp3";
8417
10519
  }
8418
10520
  };
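Test sketch for the mock adapter's new image and speech paths, following the registration pattern suggested by the error messages above; the fixture paths are hypothetical and the `forModel()`/`register()` calls are taken from those hints rather than shown in this diff:

```typescript
import { mockLLM } from "llmist";
import { readFile } from "node:fs/promises";

mockLLM()
  .forModel("dall-e-3")
  .returnsImage(await readFile("fixtures/sunset.png")) // MIME type auto-detected from magic bytes
  .register();

mockLLM()
  .forModel("tts-1")
  .returnsAudio(await readFile("fixtures/hello.mp3"), "audio/mpeg")
  .register();

// With these mocks registered, generateImage()/generateSpeech() on the mock adapter
// resolve with the fixture data at zero cost; "audio/mpeg" maps to format "mp3".
```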
8419
10521
  function createMockAdapter(options) {
@@ -8421,6 +10523,20 @@ function createMockAdapter(options) {
8421
10523
  }
8422
10524
 
8423
10525
  // src/testing/mock-builder.ts
10526
+ init_input_content();
10527
+ init_messages();
10528
+ function hasImageContent(content) {
10529
+ if (typeof content === "string") return false;
10530
+ return content.some((part) => isImagePart(part));
10531
+ }
10532
+ function hasAudioContent(content) {
10533
+ if (typeof content === "string") return false;
10534
+ return content.some((part) => isAudioPart(part));
10535
+ }
10536
+ function countImages(content) {
10537
+ if (typeof content === "string") return 0;
10538
+ return content.filter((part) => isImagePart(part)).length;
10539
+ }
8424
10540
  var MockBuilder = class {
8425
10541
  matchers = [];
8426
10542
  response = {};
@@ -8483,9 +10599,9 @@ var MockBuilder = class {
8483
10599
  * @example
8484
10600
  * mockLLM().whenMessageContains('hello')
8485
10601
  */
8486
- whenMessageContains(text) {
10602
+ whenMessageContains(text3) {
8487
10603
  this.matchers.push(
8488
- (ctx) => ctx.messages.some((msg) => msg.content?.toLowerCase().includes(text.toLowerCase()))
10604
+ (ctx) => ctx.messages.some((msg) => extractText(msg.content).toLowerCase().includes(text3.toLowerCase()))
8489
10605
  );
8490
10606
  return this;
8491
10607
  }
@@ -8495,10 +10611,11 @@ var MockBuilder = class {
8495
10611
  * @example
8496
10612
  * mockLLM().whenLastMessageContains('goodbye')
8497
10613
  */
8498
- whenLastMessageContains(text) {
10614
+ whenLastMessageContains(text3) {
8499
10615
  this.matchers.push((ctx) => {
8500
10616
  const lastMsg = ctx.messages[ctx.messages.length - 1];
8501
- return lastMsg?.content?.toLowerCase().includes(text.toLowerCase()) ?? false;
10617
+ if (!lastMsg) return false;
10618
+ return extractText(lastMsg.content).toLowerCase().includes(text3.toLowerCase());
8502
10619
  });
8503
10620
  return this;
8504
10621
  }
@@ -8509,7 +10626,7 @@ var MockBuilder = class {
8509
10626
  * mockLLM().whenMessageMatches(/calculate \d+/)
8510
10627
  */
8511
10628
  whenMessageMatches(regex) {
8512
- this.matchers.push((ctx) => ctx.messages.some((msg) => regex.test(msg.content ?? "")));
10629
+ this.matchers.push((ctx) => ctx.messages.some((msg) => regex.test(extractText(msg.content))));
8513
10630
  return this;
8514
10631
  }
8515
10632
  /**
@@ -8518,10 +10635,10 @@ var MockBuilder = class {
8518
10635
  * @example
8519
10636
  * mockLLM().whenRoleContains('system', 'You are a helpful assistant')
8520
10637
  */
8521
- whenRoleContains(role, text) {
10638
+ whenRoleContains(role, text3) {
8522
10639
  this.matchers.push(
8523
10640
  (ctx) => ctx.messages.some(
8524
- (msg) => msg.role === role && msg.content?.toLowerCase().includes(text.toLowerCase())
10641
+ (msg) => msg.role === role && extractText(msg.content).toLowerCase().includes(text3.toLowerCase())
8525
10642
  )
8526
10643
  );
8527
10644
  return this;
@@ -8549,6 +10666,43 @@ var MockBuilder = class {
8549
10666
  this.matchers.push(matcher);
8550
10667
  return this;
8551
10668
  }
10669
+ // ==========================================================================
10670
+ // Multimodal Matchers
10671
+ // ==========================================================================
10672
+ /**
10673
+ * Match when any message contains an image.
10674
+ *
10675
+ * @example
10676
+ * mockLLM().whenMessageHasImage().returns("I see an image of a sunset.")
10677
+ */
10678
+ whenMessageHasImage() {
10679
+ this.matchers.push((ctx) => ctx.messages.some((msg) => hasImageContent(msg.content)));
10680
+ return this;
10681
+ }
10682
+ /**
10683
+ * Match when any message contains audio.
10684
+ *
10685
+ * @example
10686
+ * mockLLM().whenMessageHasAudio().returns("I hear music playing.")
10687
+ */
10688
+ whenMessageHasAudio() {
10689
+ this.matchers.push((ctx) => ctx.messages.some((msg) => hasAudioContent(msg.content)));
10690
+ return this;
10691
+ }
10692
+ /**
10693
+ * Match based on the number of images in the last message.
10694
+ *
10695
+ * @example
10696
+ * mockLLM().whenImageCount((n) => n >= 2).returns("Comparing multiple images...")
10697
+ */
10698
+ whenImageCount(predicate) {
10699
+ this.matchers.push((ctx) => {
10700
+ const lastMsg = ctx.messages[ctx.messages.length - 1];
10701
+ if (!lastMsg) return false;
10702
+ return predicate(countImages(lastMsg.content));
10703
+ });
10704
+ return this;
10705
+ }
8552
10706
  /**
8553
10707
  * Set the text response to return.
8554
10708
  * Can be a static string or a function that returns a string dynamically.
@@ -8558,17 +10712,17 @@ var MockBuilder = class {
8558
10712
  * mockLLM().returns(() => `Response at ${Date.now()}`)
8559
10713
  * mockLLM().returns((ctx) => `You said: ${ctx.messages[0]?.content}`)
8560
10714
  */
8561
- returns(text) {
8562
- if (typeof text === "function") {
10715
+ returns(text3) {
10716
+ if (typeof text3 === "function") {
8563
10717
  this.response = async (ctx) => {
8564
- const resolvedText = await Promise.resolve().then(() => text(ctx));
10718
+ const resolvedText = await Promise.resolve().then(() => text3(ctx));
8565
10719
  return { text: resolvedText };
8566
10720
  };
8567
10721
  } else {
8568
10722
  if (typeof this.response === "function") {
8569
10723
  throw new Error("Cannot use returns() after withResponse() with a function");
8570
10724
  }
8571
- this.response.text = text;
10725
+ this.response.text = text3;
8572
10726
  }
8573
10727
  return this;
8574
10728
  }
@@ -8605,6 +10759,112 @@ var MockBuilder = class {
  this.response.gadgetCalls.push({ gadgetName, parameters });
  return this;
  }
+ // ==========================================================================
+ // Multimodal Response Helpers
+ // ==========================================================================
+ /**
+ * Return a single image in the response.
+ * Useful for mocking image generation endpoints.
+ *
+ * @param data - Image data (base64 string or Buffer)
+ * @param mimeType - MIME type (auto-detected if Buffer provided without type)
+ *
+ * @example
+ * mockLLM()
+ * .forModel('dall-e-3')
+ * .returnsImage(pngBuffer)
+ * .register();
+ */
+ returnsImage(data, mimeType) {
+ if (typeof this.response === "function") {
+ throw new Error("Cannot use returnsImage() after withResponse() with a function");
+ }
+ let imageData;
+ let imageMime;
+ if (typeof data === "string") {
+ imageData = data;
+ if (!mimeType) {
+ throw new Error("MIME type is required when providing base64 string data");
+ }
+ imageMime = mimeType;
+ } else {
+ imageData = toBase64(data);
+ const detected = mimeType ?? detectImageMimeType(data);
+ if (!detected) {
+ throw new Error(
+ "Could not detect image MIME type. Please provide the mimeType parameter explicitly."
+ );
+ }
+ imageMime = detected;
+ }
+ if (!this.response.images) {
+ this.response.images = [];
+ }
+ this.response.images.push({ data: imageData, mimeType: imageMime });
+ return this;
+ }
+ /**
+ * Return multiple images in the response.
+ *
+ * @example
+ * mockLLM()
+ * .forModel('dall-e-3')
+ * .returnsImages([
+ * { data: pngBuffer1 },
+ * { data: pngBuffer2 },
+ * ])
+ * .register();
+ */
+ returnsImages(images) {
+ for (const img of images) {
+ this.returnsImage(img.data, img.mimeType);
+ if (img.revisedPrompt && this.response && typeof this.response !== "function") {
+ const lastImage = this.response.images?.[this.response.images.length - 1];
+ if (lastImage) {
+ lastImage.revisedPrompt = img.revisedPrompt;
+ }
+ }
+ }
+ return this;
+ }
+ /**
+ * Return audio data in the response.
+ * Useful for mocking speech synthesis endpoints.
+ *
+ * @param data - Audio data (base64 string or Buffer)
+ * @param mimeType - MIME type (auto-detected if Buffer provided without type)
+ *
+ * @example
+ * mockLLM()
+ * .forModel('tts-1')
+ * .returnsAudio(mp3Buffer)
+ * .register();
+ */
+ returnsAudio(data, mimeType) {
+ if (typeof this.response === "function") {
+ throw new Error("Cannot use returnsAudio() after withResponse() with a function");
+ }
+ let audioData;
+ let audioMime;
+ if (typeof data === "string") {
+ audioData = data;
+ if (!mimeType) {
+ throw new Error("MIME type is required when providing base64 string data");
+ }
+ audioMime = mimeType;
+ } else {
+ audioData = toBase64(data);
+ const detected = mimeType ?? detectAudioMimeType(data);
+ if (!detected) {
+ throw new Error(
+ "Could not detect audio MIME type. Please provide the mimeType parameter explicitly."
+ );
+ }
+ audioMime = detected;
+ }
+ this.response.audio = { data: audioData, mimeType: audioMime };
+ return this;
+ }
  /**
  * Set the complete mock response object.
  * This allows full control over all response properties.
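A hedged end-to-end sketch of the response helpers above: Buffers get their MIME type sniffed from magic bytes, base64 strings need the type spelled out, and revisedPrompt rides along per image in returnsImages. Fixture paths and model IDs are illustrative only:

import { readFileSync } from "node:fs";
import { mockLLM } from "llmist";

const pngBuffer = readFileSync("fixtures/sunset.png"); // PNG type detected from magic bytes
const wavBuffer = readFileSync("fixtures/speech.wav"); // WAV type detected from magic bytes

// Image generation mock returning two images, the first with a revised prompt.
mockLLM()
  .forModel("dall-e-3")
  .returnsImages([
    { data: pngBuffer, revisedPrompt: "A vivid sunset over the ocean" },
    { data: pngBuffer },
  ])
  .register();

// Speech synthesis mock; a base64 string requires an explicit MIME type.
mockLLM()
  .forModel("tts-1")
  .returnsAudio(wavBuffer.toString("base64"), "audio/wav")
  .register();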
@@ -8818,6 +11078,8 @@ var import_node_stream = require("stream");
  StreamParser,
  StreamProcessor,
  SummarizationStrategy,
+ audioFromBase64,
+ audioFromBuffer,
  collectEvents,
  collectText,
  complete,
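The two audio exports added here mirror the image helpers: audioFromBuffer sniffs the MIME type from the buffer's magic bytes (and throws when it cannot), while audioFromBase64 always takes the type explicitly. A small sketch with an illustrative fixture path:

import { readFileSync } from "node:fs";
import { audioFromBuffer, audioFromBase64 } from "llmist";

// MIME type detected from the WAV header; throws if detection fails.
const fromBuffer = audioFromBuffer(readFileSync("fixtures/speech.wav"));

// Base64 string input requires the media type to be passed explicitly.
const fromString = audioFromBase64(readFileSync("fixtures/speech.wav").toString("base64"), "audio/wav");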
@@ -8833,20 +11095,34 @@ var import_node_stream = require("stream");
  createOpenAIProviderFromEnv,
  createTextMockStream,
  defaultLogger,
+ detectAudioMimeType,
+ detectImageMimeType,
  discoverProviderAdapters,
+ extractText,
  getMockManager,
  getModelId,
  getProvider,
  hasProviderPrefix,
+ imageFromBase64,
+ imageFromBuffer,
+ imageFromUrl,
+ isAudioPart,
+ isDataUrl,
+ isImagePart,
+ isTextPart,
  iterationProgressHint,
  mockLLM,
+ normalizeContent,
  parallelGadgetHint,
+ parseDataUrl,
  resolveHintTemplate,
  resolveModel,
  resolvePromptTemplate,
  resolveRulesTemplate,
  runWithHandlers,
  stream,
+ text,
+ toBase64,
  validateAndApplyDefaults,
  validateGadgetParams,
  z
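The remaining additions expose the content-part constructors, type guards, and text utilities from the package root. A sketch of composing a multimodal message with them; the import path, fixture name, and exact extractText join behavior are assumptions:

import { readFileSync } from "node:fs";
import { text, imageFromUrl, imageFromBuffer, isImagePart, extractText } from "llmist";

// Content mixing one text part with two image parts.
const content = [
  text("What differs between these two photos?"),
  imageFromUrl("https://example.com/before.jpg"),
  imageFromBuffer(readFileSync("fixtures/after.png")), // PNG type sniffed from magic bytes
];

// The guards and extractText() operate over the same part array.
const imageCount = content.filter(isImagePart).length; // 2
const prompt = extractText(content); // text parts flattened to a plain string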