llmist 2.3.0 → 2.5.0

package/dist/cli.cjs CHANGED
@@ -46,6 +46,137 @@ var init_constants = __esm({
46
46
  }
47
47
  });
48
48
 
49
+ // src/core/input-content.ts
50
+ function text(content) {
51
+ return { type: "text", text: content };
52
+ }
53
+ function imageFromUrl(url) {
54
+ return {
55
+ type: "image",
56
+ source: { type: "url", url }
57
+ };
58
+ }
59
+ function detectImageMimeType(data) {
60
+ const bytes = data instanceof Buffer ? data : Buffer.from(data);
61
+ for (const { bytes: magic, mimeType } of IMAGE_MAGIC_BYTES) {
62
+ if (bytes.length >= magic.length) {
63
+ let matches = true;
64
+ for (let i = 0; i < magic.length; i++) {
65
+ if (bytes[i] !== magic[i]) {
66
+ matches = false;
67
+ break;
68
+ }
69
+ }
70
+ if (matches) {
71
+ if (mimeType === "image/webp") {
72
+ if (bytes.length >= 12) {
73
+ const webpMarker = bytes[8] === 87 && bytes[9] === 69 && bytes[10] === 66 && bytes[11] === 80;
74
+ if (!webpMarker) continue;
75
+ }
76
+ }
77
+ return mimeType;
78
+ }
79
+ }
80
+ }
81
+ return null;
82
+ }
83
+ function detectAudioMimeType(data) {
84
+ const bytes = data instanceof Buffer ? data : Buffer.from(data);
85
+ for (const { bytes: magic, mimeType } of AUDIO_MAGIC_BYTES) {
86
+ if (bytes.length >= magic.length) {
87
+ let matches = true;
88
+ for (let i = 0; i < magic.length; i++) {
89
+ if (bytes[i] !== magic[i]) {
90
+ matches = false;
91
+ break;
92
+ }
93
+ }
94
+ if (matches) {
95
+ if (mimeType === "audio/wav") {
96
+ if (bytes.length >= 12) {
97
+ const waveMarker = bytes[8] === 87 && bytes[9] === 65 && bytes[10] === 86 && bytes[11] === 69;
98
+ if (!waveMarker) continue;
99
+ }
100
+ }
101
+ return mimeType;
102
+ }
103
+ }
104
+ }
105
+ return null;
106
+ }
107
+ function toBase64(data) {
108
+ if (typeof data === "string") {
109
+ return data;
110
+ }
111
+ return Buffer.from(data).toString("base64");
112
+ }
113
+ function imageFromBuffer(buffer, mediaType) {
114
+ const detectedType = mediaType ?? detectImageMimeType(buffer);
115
+ if (!detectedType) {
116
+ throw new Error(
117
+ "Could not detect image MIME type. Please provide the mediaType parameter explicitly."
118
+ );
119
+ }
120
+ return {
121
+ type: "image",
122
+ source: {
123
+ type: "base64",
124
+ mediaType: detectedType,
125
+ data: toBase64(buffer)
126
+ }
127
+ };
128
+ }
129
+ function audioFromBuffer(buffer, mediaType) {
130
+ const detectedType = mediaType ?? detectAudioMimeType(buffer);
131
+ if (!detectedType) {
132
+ throw new Error(
133
+ "Could not detect audio MIME type. Please provide the mediaType parameter explicitly."
134
+ );
135
+ }
136
+ return {
137
+ type: "audio",
138
+ source: {
139
+ type: "base64",
140
+ mediaType: detectedType,
141
+ data: toBase64(buffer)
142
+ }
143
+ };
144
+ }
145
+ function isDataUrl(input) {
146
+ return input.startsWith("data:");
147
+ }
148
+ function parseDataUrl(url) {
149
+ const match = url.match(/^data:([^;]+);base64,(.+)$/);
150
+ if (!match) return null;
151
+ return { mimeType: match[1], data: match[2] };
152
+ }
153
+ var IMAGE_MAGIC_BYTES, AUDIO_MAGIC_BYTES;
154
+ var init_input_content = __esm({
155
+ "src/core/input-content.ts"() {
156
+ "use strict";
157
+ IMAGE_MAGIC_BYTES = [
158
+ { bytes: [255, 216, 255], mimeType: "image/jpeg" },
159
+ { bytes: [137, 80, 78, 71], mimeType: "image/png" },
160
+ { bytes: [71, 73, 70, 56], mimeType: "image/gif" },
161
+ // WebP starts with RIFF....WEBP
162
+ { bytes: [82, 73, 70, 70], mimeType: "image/webp" }
163
+ ];
164
+ AUDIO_MAGIC_BYTES = [
165
+ // MP3 frame sync
166
+ { bytes: [255, 251], mimeType: "audio/mp3" },
167
+ { bytes: [255, 250], mimeType: "audio/mp3" },
168
+ // ID3 tag (MP3)
169
+ { bytes: [73, 68, 51], mimeType: "audio/mp3" },
170
+ // OGG
171
+ { bytes: [79, 103, 103, 83], mimeType: "audio/ogg" },
172
+ // WAV (RIFF)
173
+ { bytes: [82, 73, 70, 70], mimeType: "audio/wav" },
174
+ // WebM
175
+ { bytes: [26, 69, 223, 163], mimeType: "audio/webm" }
176
+ ];
177
+ }
178
+ });
179
+
49
180
  // src/core/model-shortcuts.ts
50
181
  function isKnownModelPattern(model) {
51
182
  const normalized = model.toLowerCase();
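A minimal sketch of how the input-content helpers added above might be combined into one multimodal content array. The helper names and behavior come from the code in this hunk; importing them from the package root is an assumption.

```typescript
import { readFile } from "node:fs/promises";
// Assumed re-export path; the helpers are defined in src/core/input-content.ts.
import { text, imageFromUrl, imageFromBuffer, audioFromBuffer } from "llmist";

const photo = await readFile("photo.jpg"); // JPEG magic bytes -> "image/jpeg"
const clip = await readFile("note.ogg");   // "OggS" magic bytes -> "audio/ogg"

const parts = [
  text("Describe the photo, then transcribe the clip."),
  imageFromBuffer(photo),                          // MIME type auto-detected from magic bytes
  imageFromUrl("https://example.com/diagram.png"), // URL-sourced image part
  audioFromBuffer(clip),                           // throws if detection fails and no mediaType is given
];
```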
@@ -375,7 +506,9 @@ var init_prompt_config = __esm({
375
506
  rules: () => [
376
507
  "Output ONLY plain text with the exact markers - never use function/tool calling",
377
508
  "You can invoke multiple gadgets in a single response",
378
- "For dependent gadgets, invoke the first one and wait for the result"
509
+ "Gadgets without dependencies execute immediately (in parallel if multiple)",
510
+ "Use :invocation_id:dep1,dep2 syntax when a gadget needs results from prior gadgets",
511
+ "If any dependency fails, dependent gadgets are automatically skipped"
379
512
  ],
380
513
  customExamples: null
381
514
  };
@@ -383,11 +516,24 @@ var init_prompt_config = __esm({
383
516
  });
384
517
 
385
518
  // src/core/messages.ts
519
+ function normalizeContent(content) {
520
+ if (typeof content === "string") {
521
+ return [{ type: "text", text: content }];
522
+ }
523
+ return content;
524
+ }
525
+ function extractText(content) {
526
+ if (typeof content === "string") {
527
+ return content;
528
+ }
529
+ return content.filter((part) => part.type === "text").map((part) => part.text).join("");
530
+ }
386
531
  var LLMMessageBuilder;
387
532
  var init_messages = __esm({
388
533
  "src/core/messages.ts"() {
389
534
  "use strict";
390
535
  init_constants();
536
+ init_input_content();
391
537
  init_prompt_config();
392
538
  LLMMessageBuilder = class {
393
539
  messages = [];
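In short, normalizeContent lifts a plain string into a one-element text-part array, and extractText flattens mixed content back down to its text parts. A small sketch of the behavior defined above:

```typescript
normalizeContent("Hello");
// -> [{ type: "text", text: "Hello" }]

extractText([
  { type: "text", text: "Caption: " },
  { type: "image", source: { type: "url", url: "https://example.com/cat.png" } },
  { type: "text", text: "a cat" },
]);
// -> "Caption: a cat"   (non-text parts are dropped, text parts are concatenated)
```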
@@ -489,6 +635,10 @@ CRITICAL: ${criticalUsage}
489
635
  parts.push(`
490
636
  1. Start marker: ${this.startPrefix}gadget_name`);
491
637
  parts.push(`
638
+ With ID: ${this.startPrefix}gadget_name:my_id`);
639
+ parts.push(`
640
+ With dependencies: ${this.startPrefix}gadget_name:my_id:dep1,dep2`);
641
+ parts.push(`
492
642
  2. ${formatDescription}`);
493
643
  parts.push(`
494
644
  3. End marker: ${this.endPrefix}`);
@@ -538,6 +688,25 @@ ${this.endPrefix}`;
538
688
  EXAMPLE (Multiple Gadgets):
539
689
 
540
690
  ${multipleExample}`);
691
+ const dependencyExample = `${this.startPrefix}fetch_data:fetch_1
692
+ ${this.argPrefix}url
693
+ https://api.example.com/users
694
+ ${this.endPrefix}
695
+ ${this.startPrefix}fetch_data:fetch_2
696
+ ${this.argPrefix}url
697
+ https://api.example.com/orders
698
+ ${this.endPrefix}
699
+ ${this.startPrefix}merge_data:merge_1:fetch_1,fetch_2
700
+ ${this.argPrefix}format
701
+ json
702
+ ${this.endPrefix}`;
703
+ parts.push(`
704
+
705
+ EXAMPLE (With Dependencies):
706
+ merge_1 waits for fetch_1 AND fetch_2 to complete.
707
+ If either fails, merge_1 is automatically skipped.
708
+
709
+ ${dependencyExample}`);
541
710
  parts.push(`
542
711
 
543
712
  BLOCK FORMAT SYNTAX:
@@ -588,6 +757,25 @@ Produces: { "items": ["first", "second"] }`);
588
757
  }
589
758
  return parts.join("");
590
759
  }
760
+ /**
761
+ * Add a user message.
762
+ * Content can be a string (text only) or an array of content parts (multimodal).
763
+ *
764
+ * @param content - Message content
765
+ * @param metadata - Optional metadata
766
+ *
767
+ * @example
768
+ * ```typescript
769
+ * // Text only
770
+ * builder.addUser("Hello!");
771
+ *
772
+ * // Multimodal
773
+ * builder.addUser([
774
+ * text("What's in this image?"),
775
+ * imageFromBuffer(imageData),
776
+ * ]);
777
+ * ```
778
+ */
591
779
  addUser(content, metadata) {
592
780
  this.messages.push({ role: "user", content, metadata });
593
781
  return this;
@@ -596,6 +784,104 @@ Produces: { "items": ["first", "second"] }`);
596
784
  this.messages.push({ role: "assistant", content, metadata });
597
785
  return this;
598
786
  }
787
+ /**
788
+ * Add a user message with an image attachment.
789
+ *
790
+ * @param textContent - Text prompt
791
+ * @param imageData - Image data (Buffer, Uint8Array, or base64 string)
792
+ * @param mimeType - Optional MIME type (auto-detected if not provided)
793
+ *
794
+ * @example
795
+ * ```typescript
796
+ * builder.addUserWithImage(
797
+ * "What's in this image?",
798
+ * await fs.readFile("photo.jpg"),
799
+ * "image/jpeg" // Optional - auto-detected
800
+ * );
801
+ * ```
802
+ */
803
+ addUserWithImage(textContent, imageData, mimeType) {
804
+ const imageBuffer = typeof imageData === "string" ? Buffer.from(imageData, "base64") : imageData;
805
+ const detectedMime = mimeType ?? detectImageMimeType(imageBuffer);
806
+ if (!detectedMime) {
807
+ throw new Error(
808
+ "Could not detect image MIME type. Please provide the mimeType parameter explicitly."
809
+ );
810
+ }
811
+ const content = [
812
+ text(textContent),
813
+ {
814
+ type: "image",
815
+ source: {
816
+ type: "base64",
817
+ mediaType: detectedMime,
818
+ data: toBase64(imageBuffer)
819
+ }
820
+ }
821
+ ];
822
+ this.messages.push({ role: "user", content });
823
+ return this;
824
+ }
825
+ /**
826
+ * Add a user message with an image URL (OpenAI only).
827
+ *
828
+ * @param textContent - Text prompt
829
+ * @param imageUrl - URL to the image
830
+ *
831
+ * @example
832
+ * ```typescript
833
+ * builder.addUserWithImageUrl(
834
+ * "What's in this image?",
835
+ * "https://example.com/image.jpg"
836
+ * );
837
+ * ```
838
+ */
839
+ addUserWithImageUrl(textContent, imageUrl) {
840
+ const content = [text(textContent), imageFromUrl(imageUrl)];
841
+ this.messages.push({ role: "user", content });
842
+ return this;
843
+ }
844
+ /**
845
+ * Add a user message with an audio attachment (Gemini only).
846
+ *
847
+ * @param textContent - Text prompt
848
+ * @param audioData - Audio data (Buffer, Uint8Array, or base64 string)
849
+ * @param mimeType - Optional MIME type (auto-detected if not provided)
850
+ *
851
+ * @example
852
+ * ```typescript
853
+ * builder.addUserWithAudio(
854
+ * "Transcribe this audio",
855
+ * await fs.readFile("recording.mp3"),
856
+ * "audio/mp3" // Optional - auto-detected
857
+ * );
858
+ * ```
859
+ */
860
+ addUserWithAudio(textContent, audioData, mimeType) {
861
+ const audioBuffer = typeof audioData === "string" ? Buffer.from(audioData, "base64") : audioData;
862
+ const content = [text(textContent), audioFromBuffer(audioBuffer, mimeType)];
863
+ this.messages.push({ role: "user", content });
864
+ return this;
865
+ }
866
+ /**
867
+ * Add a user message with multiple content parts.
868
+ * Provides full flexibility for complex multimodal messages.
869
+ *
870
+ * @param parts - Array of content parts
871
+ *
872
+ * @example
873
+ * ```typescript
874
+ * builder.addUserMultimodal([
875
+ * text("Compare these images:"),
876
+ * imageFromBuffer(image1),
877
+ * imageFromBuffer(image2),
878
+ * ]);
879
+ * ```
880
+ */
881
+ addUserMultimodal(parts) {
882
+ this.messages.push({ role: "user", content: parts });
883
+ return this;
884
+ }
599
885
  addGadgetCall(gadget, parameters, result) {
600
886
  const paramStr = this.formatBlockParameters(parameters, "");
601
887
  this.messages.push({
@@ -1914,7 +2200,7 @@ var init_conversation_manager = __esm({
1914
2200
  if (msg.role === "user") {
1915
2201
  this.historyBuilder.addUser(msg.content);
1916
2202
  } else if (msg.role === "assistant") {
1917
- this.historyBuilder.addAssistant(msg.content);
2203
+ this.historyBuilder.addAssistant(extractText(msg.content));
1918
2204
  }
1919
2205
  }
1920
2206
  }
@@ -1935,8 +2221,10 @@ async function runWithHandlers(agentGenerator, handlers) {
1935
2221
  if (handlers.onGadgetCall) {
1936
2222
  await handlers.onGadgetCall({
1937
2223
  gadgetName: event.call.gadgetName,
2224
+ invocationId: event.call.invocationId,
1938
2225
  parameters: event.call.parameters,
1939
- parametersRaw: event.call.parametersRaw
2226
+ parametersRaw: event.call.parametersRaw,
2227
+ dependencies: event.call.dependencies
1940
2228
  });
1941
2229
  }
1942
2230
  break;
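A hedged sketch of a handler consuming the new fields; only the payload shape (gadgetName, invocationId, parameters, parametersRaw, dependencies) comes from the code above, and how the agent generator is obtained is assumed.

```typescript
await runWithHandlers(agentGenerator, {
  onGadgetCall: async ({ gadgetName, invocationId, parameters, dependencies }) => {
    console.log(
      `[${invocationId}] ${gadgetName}`,
      parameters,
      dependencies.length ? `waits on ${dependencies.join(", ")}` : "runs immediately",
    );
  },
});
```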
@@ -2498,7 +2786,27 @@ var init_cost_reporting_client = __esm({
2498
2786
  constructor(client, reportCost) {
2499
2787
  this.client = client;
2500
2788
  this.reportCost = reportCost;
2789
+ this.image = {
2790
+ generate: async (options) => {
2791
+ const result = await this.client.image.generate(options);
2792
+ if (result.cost !== void 0 && result.cost > 0) {
2793
+ this.reportCost(result.cost);
2794
+ }
2795
+ return result;
2796
+ }
2797
+ };
2798
+ this.speech = {
2799
+ generate: async (options) => {
2800
+ const result = await this.client.speech.generate(options);
2801
+ if (result.cost !== void 0 && result.cost > 0) {
2802
+ this.reportCost(result.cost);
2803
+ }
2804
+ return result;
2805
+ }
2806
+ };
2501
2807
  }
2808
+ image;
2809
+ speech;
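The wrapper above forwards image.generate and speech.generate to the underlying client and reports any non-zero cost it returns. A sketch of how it might be wired up; the CostReportingClient name and the base client are assumptions, only the constructor shape and forwarding behavior are shown in this hunk.

```typescript
let spent = 0;
const tracked = new CostReportingClient(baseClient, (cost) => {
  spent += cost; // accumulate per-call spend reported by the wrapper
});

const img = await tracked.image.generate({ model: "gpt-image-1", prompt: "a red cube" });
// img.cost (when defined and > 0) has now been added to `spent`
```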
2502
2810
  /**
2503
2811
  * Access to model registry for cost estimation.
2504
2812
  */
@@ -2763,15 +3071,37 @@ var init_parser = __esm({
2763
3071
  return segment.trim().length > 0 ? segment : void 0;
2764
3072
  }
2765
3073
  /**
2766
- * Parse gadget name, handling both old format (name:invocationId) and new format (just name).
2767
- * For new format, generates a unique invocation ID.
3074
+ * Parse gadget name with optional invocation ID and dependencies.
3075
+ *
3076
+ * Supported formats:
3077
+ * - `GadgetName` - Auto-generate ID, no dependencies
3078
+ * - `GadgetName:my_id` - Explicit ID, no dependencies
3079
+ * - `GadgetName:my_id:dep1,dep2` - Explicit ID with dependencies
3080
+ *
3081
+ * Dependencies must be comma-separated invocation IDs.
2768
3082
  */
2769
3083
  parseGadgetName(gadgetName) {
2770
- if (gadgetName.includes(":")) {
2771
- const parts = gadgetName.split(":");
2772
- return { actualName: parts[0], invocationId: parts[1] };
3084
+ const parts = gadgetName.split(":");
3085
+ if (parts.length === 1) {
3086
+ return {
3087
+ actualName: parts[0],
3088
+ invocationId: `gadget_${++globalInvocationCounter}`,
3089
+ dependencies: []
3090
+ };
3091
+ } else if (parts.length === 2) {
3092
+ return {
3093
+ actualName: parts[0],
3094
+ invocationId: parts[1].trim(),
3095
+ dependencies: []
3096
+ };
3097
+ } else {
3098
+ const deps = parts[2].split(",").map((d) => d.trim()).filter((d) => d.length > 0);
3099
+ return {
3100
+ actualName: parts[0],
3101
+ invocationId: parts[1].trim(),
3102
+ dependencies: deps
3103
+ };
2773
3104
  }
2774
- return { actualName: gadgetName, invocationId: `gadget_${++globalInvocationCounter}` };
2775
3105
  }
2776
3106
  /**
2777
3107
  * Extract the error message from a parse error.
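Worked examples of the three header formats accepted above; the parser instance and the auto-generated counter value are illustrative.

```typescript
parser.parseGadgetName("fetch_data");
// -> { actualName: "fetch_data", invocationId: "gadget_1", dependencies: [] }

parser.parseGadgetName("fetch_data:fetch_1");
// -> { actualName: "fetch_data", invocationId: "fetch_1", dependencies: [] }

parser.parseGadgetName("merge_data:merge_1:fetch_1,fetch_2");
// -> { actualName: "merge_data", invocationId: "merge_1", dependencies: ["fetch_1", "fetch_2"] }
```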
@@ -2807,39 +3137,20 @@ var init_parser = __esm({
2807
3137
  const metadataEndIndex = this.buffer.indexOf("\n", metadataStartIndex);
2808
3138
  if (metadataEndIndex === -1) break;
2809
3139
  const gadgetName = this.buffer.substring(metadataStartIndex, metadataEndIndex).trim();
2810
- const { actualName: actualGadgetName, invocationId } = this.parseGadgetName(gadgetName);
3140
+ const { actualName: actualGadgetName, invocationId, dependencies } = this.parseGadgetName(gadgetName);
2811
3141
  const contentStartIndex = metadataEndIndex + 1;
2812
3142
  let partEndIndex;
2813
3143
  let endMarkerLength = 0;
2814
- if (gadgetName.includes(":")) {
2815
- const oldEndMarker = `${this.endPrefix + actualGadgetName}:${invocationId}`;
2816
- partEndIndex = this.buffer.indexOf(oldEndMarker, contentStartIndex);
2817
- if (partEndIndex === -1) break;
2818
- endMarkerLength = oldEndMarker.length;
3144
+ const nextStartPos = this.buffer.indexOf(this.startPrefix, contentStartIndex);
3145
+ const endPos = this.buffer.indexOf(this.endPrefix, contentStartIndex);
3146
+ if (nextStartPos !== -1 && (endPos === -1 || nextStartPos < endPos)) {
3147
+ partEndIndex = nextStartPos;
3148
+ endMarkerLength = 0;
3149
+ } else if (endPos !== -1) {
3150
+ partEndIndex = endPos;
3151
+ endMarkerLength = this.endPrefix.length;
2819
3152
  } else {
2820
- const nextStartPos = this.buffer.indexOf(this.startPrefix, contentStartIndex);
2821
- let validEndPos = -1;
2822
- let searchPos = contentStartIndex;
2823
- while (true) {
2824
- const endPos = this.buffer.indexOf(this.endPrefix, searchPos);
2825
- if (endPos === -1) break;
2826
- const afterEnd = this.buffer.substring(endPos + this.endPrefix.length);
2827
- if (afterEnd.startsWith("\n") || afterEnd.startsWith("\r") || afterEnd.startsWith(this.startPrefix) || afterEnd.length === 0) {
2828
- validEndPos = endPos;
2829
- break;
2830
- } else {
2831
- searchPos = endPos + this.endPrefix.length;
2832
- }
2833
- }
2834
- if (nextStartPos !== -1 && (validEndPos === -1 || nextStartPos < validEndPos)) {
2835
- partEndIndex = nextStartPos;
2836
- endMarkerLength = 0;
2837
- } else if (validEndPos !== -1) {
2838
- partEndIndex = validEndPos;
2839
- endMarkerLength = this.endPrefix.length;
2840
- } else {
2841
- break;
2842
- }
3153
+ break;
2843
3154
  }
2844
3155
  const parametersRaw = this.buffer.substring(contentStartIndex, partEndIndex).trim();
2845
3156
  const { parameters, parseError } = this.parseParameters(parametersRaw);
@@ -2850,7 +3161,8 @@ var init_parser = __esm({
2850
3161
  invocationId,
2851
3162
  parametersRaw,
2852
3163
  parameters,
2853
- parseError
3164
+ parseError,
3165
+ dependencies
2854
3166
  }
2855
3167
  };
2856
3168
  startIndex = partEndIndex + endMarkerLength;
@@ -2873,7 +3185,7 @@ var init_parser = __esm({
2873
3185
  const metadataEndIndex = this.buffer.indexOf("\n", metadataStartIndex);
2874
3186
  if (metadataEndIndex !== -1) {
2875
3187
  const gadgetName = this.buffer.substring(metadataStartIndex, metadataEndIndex).trim();
2876
- const { actualName: actualGadgetName, invocationId } = this.parseGadgetName(gadgetName);
3188
+ const { actualName: actualGadgetName, invocationId, dependencies } = this.parseGadgetName(gadgetName);
2877
3189
  const contentStartIndex = metadataEndIndex + 1;
2878
3190
  const parametersRaw = this.buffer.substring(contentStartIndex).trim();
2879
3191
  const { parameters, parseError } = this.parseParameters(parametersRaw);
@@ -2884,7 +3196,8 @@ var init_parser = __esm({
2884
3196
  invocationId,
2885
3197
  parametersRaw,
2886
3198
  parameters,
2887
- parseError
3199
+ parseError,
3200
+ dependencies
2888
3201
  }
2889
3202
  };
2890
3203
  return;
@@ -3254,6 +3567,13 @@ var init_stream_processor = __esm({
3254
3567
  accumulatedText = "";
3255
3568
  shouldStopExecution = false;
3256
3569
  observerFailureCount = 0;
3570
+ // Dependency tracking for gadget execution DAG
3571
+ /** Gadgets waiting for their dependencies to complete */
3572
+ pendingGadgets = /* @__PURE__ */ new Map();
3573
+ /** Completed gadget results, keyed by invocation ID */
3574
+ completedResults = /* @__PURE__ */ new Map();
3575
+ /** Invocation IDs of gadgets that have failed (error or skipped due to dependency) */
3576
+ failedInvocations = /* @__PURE__ */ new Set();
3257
3577
  constructor(options) {
3258
3578
  this.iteration = options.iteration;
3259
3579
  this.registry = options.registry;
@@ -3354,6 +3674,16 @@ var init_stream_processor = __esm({
3354
3674
  }
3355
3675
  }
3356
3676
  }
3677
+ const finalPendingEvents = await this.processPendingGadgets();
3678
+ outputs.push(...finalPendingEvents);
3679
+ if (finalPendingEvents.some((e) => e.type === "gadget_result")) {
3680
+ didExecuteGadgets = true;
3681
+ }
3682
+ for (const evt of finalPendingEvents) {
3683
+ if (evt.type === "gadget_result" && evt.result.breaksLoop) {
3684
+ shouldBreakLoop = true;
3685
+ }
3686
+ }
3357
3687
  }
3358
3688
  let finalMessage = this.accumulatedText;
3359
3689
  if (this.hooks.interceptors?.interceptAssistantMessage) {
@@ -3405,7 +3735,11 @@ var init_stream_processor = __esm({
3405
3735
  return [{ type: "text", content }];
3406
3736
  }
3407
3737
  /**
3408
- * Process a gadget call through the full lifecycle.
3738
+ * Process a gadget call through the full lifecycle, handling dependencies.
3739
+ *
3740
+ * Gadgets without dependencies (or with all dependencies satisfied) execute immediately.
3741
+ * Gadgets with unsatisfied dependencies are queued for later execution.
3742
+ * After each execution, pending gadgets are checked to see if they can now run.
3409
3743
  */
3410
3744
  async processGadgetCall(call) {
3411
3745
  if (this.shouldStopExecution) {
@@ -3416,6 +3750,53 @@ var init_stream_processor = __esm({
3416
3750
  }
3417
3751
  const events = [];
3418
3752
  events.push({ type: "gadget_call", call });
3753
+ if (call.dependencies.length > 0) {
3754
+ if (call.dependencies.includes(call.invocationId)) {
3755
+ this.logger.warn("Gadget has self-referential dependency (depends on itself)", {
3756
+ gadgetName: call.gadgetName,
3757
+ invocationId: call.invocationId
3758
+ });
3759
+ this.failedInvocations.add(call.invocationId);
3760
+ const skipEvent = {
3761
+ type: "gadget_skipped",
3762
+ gadgetName: call.gadgetName,
3763
+ invocationId: call.invocationId,
3764
+ parameters: call.parameters ?? {},
3765
+ failedDependency: call.invocationId,
3766
+ failedDependencyError: `Gadget "${call.invocationId}" cannot depend on itself (self-referential dependency)`
3767
+ };
3768
+ events.push(skipEvent);
3769
+ return events;
3770
+ }
3771
+ const failedDep = call.dependencies.find((dep) => this.failedInvocations.has(dep));
3772
+ if (failedDep) {
3773
+ const skipEvents = await this.handleFailedDependency(call, failedDep);
3774
+ events.push(...skipEvents);
3775
+ return events;
3776
+ }
3777
+ const unsatisfied = call.dependencies.filter((dep) => !this.completedResults.has(dep));
3778
+ if (unsatisfied.length > 0) {
3779
+ this.logger.debug("Queueing gadget for later - waiting on dependencies", {
3780
+ gadgetName: call.gadgetName,
3781
+ invocationId: call.invocationId,
3782
+ waitingOn: unsatisfied
3783
+ });
3784
+ this.pendingGadgets.set(call.invocationId, call);
3785
+ return events;
3786
+ }
3787
+ }
3788
+ const executeEvents = await this.executeGadgetWithHooks(call);
3789
+ events.push(...executeEvents);
3790
+ const triggeredEvents = await this.processPendingGadgets();
3791
+ events.push(...triggeredEvents);
3792
+ return events;
3793
+ }
3794
+ /**
3795
+ * Execute a gadget through the full hook lifecycle.
3796
+ * This is the core execution logic, extracted from processGadgetCall.
3797
+ */
3798
+ async executeGadgetWithHooks(call) {
3799
+ const events = [];
3419
3800
  if (call.parseError) {
3420
3801
  this.logger.warn("Gadget has parse error", {
3421
3802
  gadgetName: call.gadgetName,
@@ -3546,6 +3927,10 @@ var init_stream_processor = __esm({
3546
3927
  });
3547
3928
  }
3548
3929
  await this.runObserversInParallel(completeObservers);
3930
+ this.completedResults.set(result.invocationId, result);
3931
+ if (result.error) {
3932
+ this.failedInvocations.add(result.invocationId);
3933
+ }
3549
3934
  events.push({ type: "gadget_result", result });
3550
3935
  if (result.error) {
3551
3936
  const errorType = this.determineErrorType(call, result);
@@ -3561,6 +3946,162 @@ var init_stream_processor = __esm({
3561
3946
  }
3562
3947
  return events;
3563
3948
  }
3949
+ /**
3950
+ * Handle a gadget that cannot execute because a dependency failed.
3951
+ * Calls the onDependencySkipped controller to allow customization.
3952
+ */
3953
+ async handleFailedDependency(call, failedDep) {
3954
+ const events = [];
3955
+ const depResult = this.completedResults.get(failedDep);
3956
+ const depError = depResult?.error ?? "Dependency failed";
3957
+ let action = { action: "skip" };
3958
+ if (this.hooks.controllers?.onDependencySkipped) {
3959
+ const context = {
3960
+ iteration: this.iteration,
3961
+ gadgetName: call.gadgetName,
3962
+ invocationId: call.invocationId,
3963
+ parameters: call.parameters ?? {},
3964
+ failedDependency: failedDep,
3965
+ failedDependencyError: depError,
3966
+ logger: this.logger
3967
+ };
3968
+ action = await this.hooks.controllers.onDependencySkipped(context);
3969
+ }
3970
+ if (action.action === "skip") {
3971
+ this.failedInvocations.add(call.invocationId);
3972
+ const skipEvent = {
3973
+ type: "gadget_skipped",
3974
+ gadgetName: call.gadgetName,
3975
+ invocationId: call.invocationId,
3976
+ parameters: call.parameters ?? {},
3977
+ failedDependency: failedDep,
3978
+ failedDependencyError: depError
3979
+ };
3980
+ events.push(skipEvent);
3981
+ if (this.hooks.observers?.onGadgetSkipped) {
3982
+ const observeContext = {
3983
+ iteration: this.iteration,
3984
+ gadgetName: call.gadgetName,
3985
+ invocationId: call.invocationId,
3986
+ parameters: call.parameters ?? {},
3987
+ failedDependency: failedDep,
3988
+ failedDependencyError: depError,
3989
+ logger: this.logger
3990
+ };
3991
+ await this.safeObserve(() => this.hooks.observers.onGadgetSkipped(observeContext));
3992
+ }
3993
+ this.logger.info("Gadget skipped due to failed dependency", {
3994
+ gadgetName: call.gadgetName,
3995
+ invocationId: call.invocationId,
3996
+ failedDependency: failedDep
3997
+ });
3998
+ } else if (action.action === "execute_anyway") {
3999
+ this.logger.info("Executing gadget despite failed dependency (controller override)", {
4000
+ gadgetName: call.gadgetName,
4001
+ invocationId: call.invocationId,
4002
+ failedDependency: failedDep
4003
+ });
4004
+ const executeEvents = await this.executeGadgetWithHooks(call);
4005
+ events.push(...executeEvents);
4006
+ } else if (action.action === "use_fallback") {
4007
+ const fallbackResult = {
4008
+ gadgetName: call.gadgetName,
4009
+ invocationId: call.invocationId,
4010
+ parameters: call.parameters ?? {},
4011
+ result: action.fallbackResult,
4012
+ executionTimeMs: 0
4013
+ };
4014
+ this.completedResults.set(call.invocationId, fallbackResult);
4015
+ events.push({ type: "gadget_result", result: fallbackResult });
4016
+ this.logger.info("Using fallback result for gadget with failed dependency", {
4017
+ gadgetName: call.gadgetName,
4018
+ invocationId: call.invocationId,
4019
+ failedDependency: failedDep
4020
+ });
4021
+ }
4022
+ return events;
4023
+ }
4024
+ /**
4025
+ * Process pending gadgets whose dependencies are now satisfied.
4026
+ * Executes ready gadgets in parallel and continues until no more can be triggered.
4027
+ */
4028
+ async processPendingGadgets() {
4029
+ const events = [];
4030
+ let progress = true;
4031
+ while (progress && this.pendingGadgets.size > 0) {
4032
+ progress = false;
4033
+ const readyToExecute = [];
4034
+ const readyToSkip = [];
4035
+ for (const [invocationId, call] of this.pendingGadgets) {
4036
+ const failedDep = call.dependencies.find((dep) => this.failedInvocations.has(dep));
4037
+ if (failedDep) {
4038
+ readyToSkip.push({ call, failedDep });
4039
+ continue;
4040
+ }
4041
+ const allSatisfied = call.dependencies.every((dep) => this.completedResults.has(dep));
4042
+ if (allSatisfied) {
4043
+ readyToExecute.push(call);
4044
+ }
4045
+ }
4046
+ for (const { call, failedDep } of readyToSkip) {
4047
+ this.pendingGadgets.delete(call.invocationId);
4048
+ const skipEvents = await this.handleFailedDependency(call, failedDep);
4049
+ events.push(...skipEvents);
4050
+ progress = true;
4051
+ }
4052
+ if (readyToExecute.length > 0) {
4053
+ this.logger.debug("Executing ready gadgets in parallel", {
4054
+ count: readyToExecute.length,
4055
+ invocationIds: readyToExecute.map((c) => c.invocationId)
4056
+ });
4057
+ for (const call of readyToExecute) {
4058
+ this.pendingGadgets.delete(call.invocationId);
4059
+ }
4060
+ const executePromises = readyToExecute.map((call) => this.executeGadgetWithHooks(call));
4061
+ const results = await Promise.all(executePromises);
4062
+ for (const executeEvents of results) {
4063
+ events.push(...executeEvents);
4064
+ }
4065
+ progress = true;
4066
+ }
4067
+ }
4068
+ if (this.pendingGadgets.size > 0) {
4069
+ const pendingIds = new Set(this.pendingGadgets.keys());
4070
+ for (const [invocationId, call] of this.pendingGadgets) {
4071
+ const missingDeps = call.dependencies.filter((dep) => !this.completedResults.has(dep));
4072
+ const circularDeps = missingDeps.filter((dep) => pendingIds.has(dep));
4073
+ const trulyMissingDeps = missingDeps.filter((dep) => !pendingIds.has(dep));
4074
+ let errorMessage;
4075
+ let logLevel = "warn";
4076
+ if (circularDeps.length > 0 && trulyMissingDeps.length > 0) {
4077
+ errorMessage = `Dependencies unresolvable: circular=[${circularDeps.join(", ")}], missing=[${trulyMissingDeps.join(", ")}]`;
4078
+ logLevel = "error";
4079
+ } else if (circularDeps.length > 0) {
4080
+ errorMessage = `Circular dependency detected: "${invocationId}" depends on "${circularDeps[0]}" which also depends on "${invocationId}" (directly or indirectly)`;
4081
+ } else {
4082
+ errorMessage = `Dependency "${missingDeps[0]}" was never executed - check that the invocation ID exists and is spelled correctly`;
4083
+ }
4084
+ this.logger[logLevel]("Gadget has unresolvable dependencies", {
4085
+ gadgetName: call.gadgetName,
4086
+ invocationId,
4087
+ circularDependencies: circularDeps,
4088
+ missingDependencies: trulyMissingDeps
4089
+ });
4090
+ this.failedInvocations.add(invocationId);
4091
+ const skipEvent = {
4092
+ type: "gadget_skipped",
4093
+ gadgetName: call.gadgetName,
4094
+ invocationId,
4095
+ parameters: call.parameters ?? {},
4096
+ failedDependency: missingDeps[0],
4097
+ failedDependencyError: errorMessage
4098
+ };
4099
+ events.push(skipEvent);
4100
+ }
4101
+ this.pendingGadgets.clear();
4102
+ }
4103
+ return events;
4104
+ }
3564
4105
  /**
3565
4106
  * Safely execute an observer, catching and logging any errors.
3566
4107
  * Observers are non-critical, so errors are logged but don't crash the system.
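A sketch of the hooks this dependency machinery consults, based on the context and action shapes used above; how the hooks object is registered with the agent is not shown in this diff and is assumed.

```typescript
const hooks = {
  controllers: {
    // Called when a gadget's dependency has failed. Return one of:
    // { action: "skip" } (default), { action: "execute_anyway" },
    // or { action: "use_fallback", fallbackResult: ... }.
    onDependencySkipped: async (ctx) => {
      if (ctx.gadgetName === "merge_data") {
        return { action: "use_fallback", fallbackResult: "[]" };
      }
      return { action: "skip" };
    },
  },
  observers: {
    // Fired for every gadget skipped because of a failed or unresolvable dependency.
    onGadgetSkipped: async (ctx) => {
      console.warn(`${ctx.invocationId} skipped: ${ctx.failedDependencyError}`);
    },
  },
};
```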
@@ -3998,9 +4539,9 @@ var init_agent = __esm({
3998
4539
  if (msg.role === "user") {
3999
4540
  this.conversation.addUserMessage(msg.content);
4000
4541
  } else if (msg.role === "assistant") {
4001
- this.conversation.addAssistantMessage(msg.content);
4542
+ this.conversation.addAssistantMessage(extractText(msg.content));
4002
4543
  } else if (msg.role === "system") {
4003
- this.conversation.addUserMessage(`[System] ${msg.content}`);
4544
+ this.conversation.addUserMessage(`[System] ${extractText(msg.content)}`);
4004
4545
  }
4005
4546
  }
4006
4547
  }
@@ -4579,6 +5120,7 @@ var init_anthropic = __esm({
4579
5120
  "src/providers/anthropic.ts"() {
4580
5121
  "use strict";
4581
5122
  import_sdk = __toESM(require("@anthropic-ai/sdk"), 1);
5123
+ init_messages();
4582
5124
  init_anthropic_models();
4583
5125
  init_base_provider();
4584
5126
  init_constants2();
@@ -4591,11 +5133,33 @@ var init_anthropic = __esm({
4591
5133
  getModelSpecs() {
4592
5134
  return ANTHROPIC_MODELS;
4593
5135
  }
5136
+ // =========================================================================
5137
+ // Image Generation (Not Supported)
5138
+ // =========================================================================
5139
+ supportsImageGeneration(_modelId) {
5140
+ return false;
5141
+ }
5142
+ async generateImage() {
5143
+ throw new Error(
5144
+ "Anthropic does not support image generation. Use OpenAI (DALL-E, GPT Image) or Google Gemini (Imagen) instead."
5145
+ );
5146
+ }
5147
+ // =========================================================================
5148
+ // Speech Generation (Not Supported)
5149
+ // =========================================================================
5150
+ supportsSpeechGeneration(_modelId) {
5151
+ return false;
5152
+ }
5153
+ async generateSpeech() {
5154
+ throw new Error(
5155
+ "Anthropic does not support speech generation. Use OpenAI (TTS) or Google Gemini (TTS) instead."
5156
+ );
5157
+ }
4594
5158
  buildRequestPayload(options, descriptor, spec, messages) {
4595
5159
  const systemMessages = messages.filter((message) => message.role === "system");
4596
5160
  const system = systemMessages.length > 0 ? systemMessages.map((m, index) => ({
4597
5161
  type: "text",
4598
- text: m.content,
5162
+ text: extractText(m.content),
4599
5163
  // Add cache_control to the LAST system message block
4600
5164
  ...index === systemMessages.length - 1 ? { cache_control: { type: "ephemeral" } } : {}
4601
5165
  })) : void 0;
@@ -4608,14 +5172,10 @@ var init_anthropic = __esm({
4608
5172
  );
4609
5173
  const conversation = nonSystemMessages.map((message, index) => ({
4610
5174
  role: message.role,
4611
- content: [
4612
- {
4613
- type: "text",
4614
- text: message.content,
4615
- // Add cache_control to the LAST user message
4616
- ...message.role === "user" && index === lastUserIndex ? { cache_control: { type: "ephemeral" } } : {}
4617
- }
4618
- ]
5175
+ content: this.convertToAnthropicContent(
5176
+ message.content,
5177
+ message.role === "user" && index === lastUserIndex
5178
+ )
4619
5179
  }));
4620
5180
  const defaultMaxTokens = spec?.maxOutputTokens ?? ANTHROPIC_DEFAULT_MAX_OUTPUT_TOKENS;
4621
5181
  const payload = {
@@ -4631,15 +5191,61 @@ var init_anthropic = __esm({
4631
5191
  };
4632
5192
  return payload;
4633
5193
  }
4634
- async executeStreamRequest(payload, signal) {
4635
- const client = this.client;
4636
- const stream2 = await client.messages.create(payload, signal ? { signal } : void 0);
4637
- return stream2;
5194
+ /**
5195
+ * Convert llmist content to Anthropic's content block format.
5196
+ * Handles text, images (base64 only), and applies cache_control.
5197
+ */
5198
+ convertToAnthropicContent(content, addCacheControl) {
5199
+ const parts = normalizeContent(content);
5200
+ return parts.map((part, index) => {
5201
+ const isLastPart = index === parts.length - 1;
5202
+ const cacheControl = addCacheControl && isLastPart ? { cache_control: { type: "ephemeral" } } : {};
5203
+ if (part.type === "text") {
5204
+ return {
5205
+ type: "text",
5206
+ text: part.text,
5207
+ ...cacheControl
5208
+ };
5209
+ }
5210
+ if (part.type === "image") {
5211
+ return this.convertImagePart(part, cacheControl);
5212
+ }
5213
+ if (part.type === "audio") {
5214
+ throw new Error(
5215
+ "Anthropic does not support audio input. Use Google Gemini for audio processing."
5216
+ );
5217
+ }
5218
+ throw new Error(`Unsupported content type: ${part.type}`);
5219
+ });
4638
5220
  }
4639
- async *wrapStream(iterable) {
4640
- const stream2 = iterable;
4641
- let inputTokens = 0;
4642
- let cachedInputTokens = 0;
5221
+ /**
5222
+ * Convert an image content part to Anthropic's image block format.
5223
+ */
5224
+ convertImagePart(part, cacheControl) {
5225
+ if (part.source.type === "url") {
5226
+ throw new Error(
5227
+ "Anthropic does not support image URLs. Please provide base64-encoded image data instead."
5228
+ );
5229
+ }
5230
+ return {
5231
+ type: "image",
5232
+ source: {
5233
+ type: "base64",
5234
+ media_type: part.source.mediaType,
5235
+ data: part.source.data
5236
+ },
5237
+ ...cacheControl
5238
+ };
5239
+ }
5240
+ async executeStreamRequest(payload, signal) {
5241
+ const client = this.client;
5242
+ const stream2 = await client.messages.create(payload, signal ? { signal } : void 0);
5243
+ return stream2;
5244
+ }
5245
+ async *wrapStream(iterable) {
5246
+ const stream2 = iterable;
5247
+ let inputTokens = 0;
5248
+ let cachedInputTokens = 0;
4643
5249
  let cacheCreationInputTokens = 0;
4644
5250
  for await (const event of stream2) {
4645
5251
  if (event.type === "message_start") {
@@ -4713,17 +5319,12 @@ var init_anthropic = __esm({
4713
5319
  async countTokens(messages, descriptor, _spec) {
4714
5320
  const client = this.client;
4715
5321
  const systemMessages = messages.filter((message) => message.role === "system");
4716
- const system = systemMessages.length > 0 ? systemMessages.map((m) => m.content).join("\n\n") : void 0;
5322
+ const system = systemMessages.length > 0 ? systemMessages.map((m) => extractText(m.content)).join("\n\n") : void 0;
4717
5323
  const conversation = messages.filter(
4718
5324
  (message) => message.role !== "system"
4719
5325
  ).map((message) => ({
4720
5326
  role: message.role,
4721
- content: [
4722
- {
4723
- type: "text",
4724
- text: message.content
4725
- }
4726
- ]
5327
+ content: this.convertToAnthropicContent(message.content, false)
4727
5328
  }));
4728
5329
  try {
4729
5330
  const response = await client.messages.countTokens({
@@ -4737,14 +5338,201 @@ var init_anthropic = __esm({
4737
5338
  `Token counting failed for ${descriptor.name}, using fallback estimation:`,
4738
5339
  error
4739
5340
  );
4740
- const totalChars = messages.reduce((sum, msg) => sum + (msg.content?.length ?? 0), 0);
4741
- return Math.ceil(totalChars / FALLBACK_CHARS_PER_TOKEN);
5341
+ let totalChars = 0;
5342
+ let imageCount = 0;
5343
+ for (const msg of messages) {
5344
+ const parts = normalizeContent(msg.content);
5345
+ for (const part of parts) {
5346
+ if (part.type === "text") {
5347
+ totalChars += part.text.length;
5348
+ } else if (part.type === "image") {
5349
+ imageCount++;
5350
+ }
5351
+ }
5352
+ }
5353
+ return Math.ceil(totalChars / FALLBACK_CHARS_PER_TOKEN) + imageCount * 1e3;
4742
5354
  }
4743
5355
  }
4744
5356
  };
4745
5357
  }
4746
5358
  });
4747
5359
 
5360
+ // src/providers/gemini-image-models.ts
5361
+ function getGeminiImageModelSpec(modelId) {
5362
+ return geminiImageModels.find((m) => m.modelId === modelId);
5363
+ }
5364
+ function isGeminiImageModel(modelId) {
5365
+ return geminiImageModels.some((m) => m.modelId === modelId);
5366
+ }
5367
+ function calculateGeminiImageCost(modelId, size = "1:1", n = 1) {
5368
+ const spec = getGeminiImageModelSpec(modelId);
5369
+ if (!spec) return void 0;
5370
+ if (spec.pricing.perImage !== void 0) {
5371
+ return spec.pricing.perImage * n;
5372
+ }
5373
+ if (spec.pricing.bySize) {
5374
+ const sizePrice = spec.pricing.bySize[size];
5375
+ if (typeof sizePrice === "number") {
5376
+ return sizePrice * n;
5377
+ }
5378
+ }
5379
+ return void 0;
5380
+ }
5381
+ var IMAGEN4_ASPECT_RATIOS, GEMINI_IMAGE_ASPECT_RATIOS, geminiImageModels;
5382
+ var init_gemini_image_models = __esm({
5383
+ "src/providers/gemini-image-models.ts"() {
5384
+ "use strict";
5385
+ IMAGEN4_ASPECT_RATIOS = ["1:1", "3:4", "4:3", "9:16", "16:9"];
5386
+ GEMINI_IMAGE_ASPECT_RATIOS = ["1:1", "3:4", "4:3", "9:16", "16:9"];
5387
+ geminiImageModels = [
5388
+ // Imagen 4 Family (standalone image generation)
5389
+ {
5390
+ provider: "gemini",
5391
+ modelId: "imagen-4.0-fast-generate-001",
5392
+ displayName: "Imagen 4 Fast",
5393
+ pricing: {
5394
+ perImage: 0.02
5395
+ },
5396
+ supportedSizes: [...IMAGEN4_ASPECT_RATIOS],
5397
+ maxImages: 4,
5398
+ defaultSize: "1:1",
5399
+ features: {
5400
+ textRendering: true
5401
+ }
5402
+ },
5403
+ {
5404
+ provider: "gemini",
5405
+ modelId: "imagen-4.0-generate-001",
5406
+ displayName: "Imagen 4",
5407
+ pricing: {
5408
+ perImage: 0.04
5409
+ },
5410
+ supportedSizes: [...IMAGEN4_ASPECT_RATIOS],
5411
+ maxImages: 4,
5412
+ defaultSize: "1:1",
5413
+ features: {
5414
+ textRendering: true
5415
+ }
5416
+ },
5417
+ {
5418
+ provider: "gemini",
5419
+ modelId: "imagen-4.0-ultra-generate-001",
5420
+ displayName: "Imagen 4 Ultra",
5421
+ pricing: {
5422
+ perImage: 0.06
5423
+ },
5424
+ supportedSizes: [...IMAGEN4_ASPECT_RATIOS],
5425
+ maxImages: 4,
5426
+ defaultSize: "1:1",
5427
+ features: {
5428
+ textRendering: true
5429
+ }
5430
+ },
5431
+ // Preview versions
5432
+ {
5433
+ provider: "gemini",
5434
+ modelId: "imagen-4.0-generate-preview-06-06",
5435
+ displayName: "Imagen 4 (Preview)",
5436
+ pricing: {
5437
+ perImage: 0.04
5438
+ },
5439
+ supportedSizes: [...IMAGEN4_ASPECT_RATIOS],
5440
+ maxImages: 4,
5441
+ defaultSize: "1:1",
5442
+ features: {
5443
+ textRendering: true
5444
+ }
5445
+ },
5446
+ {
5447
+ provider: "gemini",
5448
+ modelId: "imagen-4.0-ultra-generate-preview-06-06",
5449
+ displayName: "Imagen 4 Ultra (Preview)",
5450
+ pricing: {
5451
+ perImage: 0.06
5452
+ },
5453
+ supportedSizes: [...IMAGEN4_ASPECT_RATIOS],
5454
+ maxImages: 4,
5455
+ defaultSize: "1:1",
5456
+ features: {
5457
+ textRendering: true
5458
+ }
5459
+ },
5460
+ // Gemini Native Image Generation (multimodal models)
5461
+ {
5462
+ provider: "gemini",
5463
+ modelId: "gemini-2.5-flash-image",
5464
+ displayName: "Gemini 2.5 Flash Image",
5465
+ pricing: {
5466
+ perImage: 0.039
5467
+ },
5468
+ supportedSizes: [...GEMINI_IMAGE_ASPECT_RATIOS],
5469
+ maxImages: 1,
5470
+ defaultSize: "1:1",
5471
+ features: {
5472
+ conversational: true,
5473
+ textRendering: true
5474
+ }
5475
+ },
5476
+ {
5477
+ provider: "gemini",
5478
+ modelId: "gemini-2.5-flash-image-preview",
5479
+ displayName: "Gemini 2.5 Flash Image (Preview)",
5480
+ pricing: {
5481
+ perImage: 0.039
5482
+ },
5483
+ supportedSizes: [...GEMINI_IMAGE_ASPECT_RATIOS],
5484
+ maxImages: 1,
5485
+ defaultSize: "1:1",
5486
+ features: {
5487
+ conversational: true,
5488
+ textRendering: true
5489
+ }
5490
+ },
5491
+ {
5492
+ provider: "gemini",
5493
+ modelId: "gemini-3-pro-image-preview",
5494
+ displayName: "Gemini 3 Pro Image (Preview)",
5495
+ pricing: {
5496
+ // Token-based: ~$0.134 per 1K/2K image, $0.24 per 4K
5497
+ // Using 2K as default
5498
+ bySize: {
5499
+ "1K": 0.134,
5500
+ "2K": 0.134,
5501
+ "4K": 0.24
5502
+ }
5503
+ },
5504
+ supportedSizes: ["1K", "2K", "4K"],
5505
+ maxImages: 1,
5506
+ defaultSize: "2K",
5507
+ features: {
5508
+ conversational: true,
5509
+ textRendering: true
5510
+ }
5511
+ },
5512
+ // Alias: nano-banana-pro-preview is gemini-3-pro-image-preview
5513
+ {
5514
+ provider: "gemini",
5515
+ modelId: "nano-banana-pro-preview",
5516
+ displayName: "Nano Banana Pro (Gemini 3 Pro Image)",
5517
+ pricing: {
5518
+ bySize: {
5519
+ "1K": 0.134,
5520
+ "2K": 0.134,
5521
+ "4K": 0.24
5522
+ }
5523
+ },
5524
+ supportedSizes: ["1K", "2K", "4K"],
5525
+ maxImages: 1,
5526
+ defaultSize: "2K",
5527
+ features: {
5528
+ conversational: true,
5529
+ textRendering: true
5530
+ }
5531
+ }
5532
+ ];
5533
+ }
5534
+ });
5535
+
4748
5536
  // src/providers/gemini-models.ts
4749
5537
  var GEMINI_MODELS;
4750
5538
  var init_gemini_models = __esm({
@@ -4918,7 +5706,171 @@ var init_gemini_models = __esm({
4918
5706
  }
4919
5707
  });
4920
5708
 
5709
+ // src/providers/gemini-speech-models.ts
5710
+ function getGeminiSpeechModelSpec(modelId) {
5711
+ return geminiSpeechModels.find((m) => m.modelId === modelId);
5712
+ }
5713
+ function isGeminiSpeechModel(modelId) {
5714
+ return geminiSpeechModels.some((m) => m.modelId === modelId);
5715
+ }
5716
+ function calculateGeminiSpeechCost(modelId, characterCount, estimatedMinutes) {
5717
+ const spec = getGeminiSpeechModelSpec(modelId);
5718
+ if (!spec) return void 0;
5719
+ if (spec.pricing.perMinute !== void 0) {
5720
+ if (estimatedMinutes !== void 0) {
5721
+ return estimatedMinutes * spec.pricing.perMinute;
5722
+ }
5723
+ const approxMinutes = characterCount / 750;
5724
+ return approxMinutes * spec.pricing.perMinute;
5725
+ }
5726
+ return void 0;
5727
+ }
5728
+ var GEMINI_TTS_VOICES, GEMINI_TTS_FORMATS, geminiSpeechModels;
5729
+ var init_gemini_speech_models = __esm({
5730
+ "src/providers/gemini-speech-models.ts"() {
5731
+ "use strict";
5732
+ GEMINI_TTS_VOICES = [
5733
+ "Zephyr",
5734
+ // Bright
5735
+ "Puck",
5736
+ // Upbeat
5737
+ "Charon",
5738
+ // Informative
5739
+ "Kore",
5740
+ // Firm
5741
+ "Fenrir",
5742
+ // Excitable
5743
+ "Leda",
5744
+ // Youthful
5745
+ "Orus",
5746
+ // Firm
5747
+ "Aoede",
5748
+ // Breezy
5749
+ "Callirrhoe",
5750
+ // Easy-going
5751
+ "Autonoe",
5752
+ // Bright
5753
+ "Enceladus",
5754
+ // Breathy
5755
+ "Iapetus",
5756
+ // Clear
5757
+ "Umbriel",
5758
+ // Easy-going
5759
+ "Algieba",
5760
+ // Smooth
5761
+ "Despina",
5762
+ // Smooth
5763
+ "Erinome",
5764
+ // Clear
5765
+ "Algenib",
5766
+ // Gravelly
5767
+ "Rasalgethi",
5768
+ // Informative
5769
+ "Laomedeia",
5770
+ // Upbeat
5771
+ "Achernar",
5772
+ // Soft
5773
+ "Alnilam",
5774
+ // Firm
5775
+ "Schedar",
5776
+ // Even
5777
+ "Gacrux",
5778
+ // Mature
5779
+ "Pulcherrima",
5780
+ // Forward
5781
+ "Achird",
5782
+ // Friendly
5783
+ "Zubenelgenubi",
5784
+ // Casual
5785
+ "Vindemiatrix",
5786
+ // Gentle
5787
+ "Sadachbia",
5788
+ // Lively
5789
+ "Sadaltager",
5790
+ // Knowledgeable
5791
+ "Sulafat"
5792
+ // Warm
5793
+ ];
5794
+ GEMINI_TTS_FORMATS = ["pcm", "wav"];
5795
+ geminiSpeechModels = [
5796
+ {
5797
+ provider: "gemini",
5798
+ modelId: "gemini-2.5-flash-preview-tts",
5799
+ displayName: "Gemini 2.5 Flash TTS (Preview)",
5800
+ pricing: {
5801
+ // $0.50 per 1M input tokens = $0.0000005 per token
5802
+ perInputToken: 5e-7,
5803
+ // $10.00 per 1M audio output tokens = $0.00001 per token
5804
+ perAudioOutputToken: 1e-5,
5805
+ // Rough estimate: ~$0.01 per minute of audio
5806
+ perMinute: 0.01
5807
+ },
5808
+ voices: [...GEMINI_TTS_VOICES],
5809
+ formats: GEMINI_TTS_FORMATS,
5810
+ maxInputLength: 8e3,
5811
+ // bytes (text + prompt combined)
5812
+ defaultVoice: "Zephyr",
5813
+ defaultFormat: "wav",
5814
+ features: {
5815
+ multiSpeaker: true,
5816
+ languages: 24,
5817
+ voiceInstructions: true
5818
+ }
5819
+ },
5820
+ {
5821
+ provider: "gemini",
5822
+ modelId: "gemini-2.5-pro-preview-tts",
5823
+ displayName: "Gemini 2.5 Pro TTS (Preview)",
5824
+ pricing: {
5825
+ // $1.00 per 1M input tokens = $0.000001 per token
5826
+ perInputToken: 1e-6,
5827
+ // $20.00 per 1M audio output tokens = $0.00002 per token
5828
+ perAudioOutputToken: 2e-5,
5829
+ // Rough estimate: ~$0.02 per minute of audio
5830
+ perMinute: 0.02
5831
+ },
5832
+ voices: [...GEMINI_TTS_VOICES],
5833
+ formats: GEMINI_TTS_FORMATS,
5834
+ maxInputLength: 8e3,
5835
+ // bytes
5836
+ defaultVoice: "Zephyr",
5837
+ defaultFormat: "wav",
5838
+ features: {
5839
+ multiSpeaker: true,
5840
+ languages: 24,
5841
+ voiceInstructions: true
5842
+ }
5843
+ }
5844
+ ];
5845
+ }
5846
+ });
5847
+
4921
5848
  // src/providers/gemini.ts
5849
+ function wrapPcmInWav(pcmData, sampleRate, bitsPerSample, numChannels) {
5850
+ const byteRate = sampleRate * numChannels * bitsPerSample / 8;
5851
+ const blockAlign = numChannels * bitsPerSample / 8;
5852
+ const dataSize = pcmData.length;
5853
+ const headerSize = 44;
5854
+ const fileSize = headerSize + dataSize - 8;
5855
+ const buffer = new ArrayBuffer(headerSize + dataSize);
5856
+ const view = new DataView(buffer);
5857
+ const uint8 = new Uint8Array(buffer);
5858
+ view.setUint32(0, 1380533830, false);
5859
+ view.setUint32(4, fileSize, true);
5860
+ view.setUint32(8, 1463899717, false);
5861
+ view.setUint32(12, 1718449184, false);
5862
+ view.setUint32(16, 16, true);
5863
+ view.setUint16(20, 1, true);
5864
+ view.setUint16(22, numChannels, true);
5865
+ view.setUint32(24, sampleRate, true);
5866
+ view.setUint32(28, byteRate, true);
5867
+ view.setUint16(32, blockAlign, true);
5868
+ view.setUint16(34, bitsPerSample, true);
5869
+ view.setUint32(36, 1684108385, false);
5870
+ view.setUint32(40, dataSize, true);
5871
+ uint8.set(pcmData, headerSize);
5872
+ return buffer;
5873
+ }
4922
5874
  function createGeminiProviderFromEnv() {
4923
5875
  return createProviderFromEnv("GEMINI_API_KEY", import_genai.GoogleGenAI, GeminiGenerativeProvider);
4924
5876
  }
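A worked example of the TTS cost estimate above: when no duration is supplied, the helper assumes roughly 750 characters per minute of audio.

```typescript
calculateGeminiSpeechCost("gemini-2.5-flash-preview-tts", 1500);
// -> (1500 / 750) minutes * $0.01 per minute = $0.02
```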
@@ -4927,9 +5879,12 @@ var init_gemini = __esm({
4927
5879
  "src/providers/gemini.ts"() {
4928
5880
  "use strict";
4929
5881
  import_genai = require("@google/genai");
5882
+ init_messages();
4930
5883
  init_base_provider();
4931
5884
  init_constants2();
5885
+ init_gemini_image_models();
4932
5886
  init_gemini_models();
5887
+ init_gemini_speech_models();
4933
5888
  init_utils();
4934
5889
  GEMINI_ROLE_MAP = {
4935
5890
  system: "user",
@@ -4944,6 +5899,139 @@ var init_gemini = __esm({
4944
5899
  getModelSpecs() {
4945
5900
  return GEMINI_MODELS;
4946
5901
  }
5902
+ // =========================================================================
5903
+ // Image Generation
5904
+ // =========================================================================
5905
+ getImageModelSpecs() {
5906
+ return geminiImageModels;
5907
+ }
5908
+ supportsImageGeneration(modelId) {
5909
+ return isGeminiImageModel(modelId);
5910
+ }
5911
+ async generateImage(options) {
5912
+ const client = this.client;
5913
+ const spec = getGeminiImageModelSpec(options.model);
5914
+ const isImagenModel = options.model.startsWith("imagen");
5915
+ const aspectRatio = options.size ?? spec?.defaultSize ?? "1:1";
5916
+ const n = options.n ?? 1;
5917
+ if (isImagenModel) {
5918
+ const response2 = await client.models.generateImages({
5919
+ model: options.model,
5920
+ prompt: options.prompt,
5921
+ config: {
5922
+ numberOfImages: n,
5923
+ aspectRatio,
5924
+ outputMimeType: options.responseFormat === "b64_json" ? "image/png" : "image/jpeg"
5925
+ }
5926
+ });
5927
+ const images2 = response2.generatedImages ?? [];
5928
+ const cost2 = calculateGeminiImageCost(options.model, aspectRatio, images2.length);
5929
+ return {
5930
+ // Gemini's imageBytes is already base64 encoded, so use it directly
5931
+ images: images2.map((img) => ({
5932
+ b64Json: img.image?.imageBytes ?? void 0
5933
+ })),
5934
+ model: options.model,
5935
+ usage: {
5936
+ imagesGenerated: images2.length,
5937
+ size: aspectRatio,
5938
+ quality: "standard"
5939
+ },
5940
+ cost: cost2
5941
+ };
5942
+ }
5943
+ const response = await client.models.generateContent({
5944
+ model: options.model,
5945
+ contents: [{ role: "user", parts: [{ text: options.prompt }] }],
5946
+ config: {
5947
+ responseModalities: [import_genai.Modality.IMAGE, import_genai.Modality.TEXT]
5948
+ }
5949
+ });
5950
+ const images = [];
5951
+ const candidate = response.candidates?.[0];
5952
+ if (candidate?.content?.parts) {
5953
+ for (const part of candidate.content.parts) {
5954
+ if ("inlineData" in part && part.inlineData) {
5955
+ images.push({
5956
+ b64Json: part.inlineData.data
5957
+ });
5958
+ }
5959
+ }
5960
+ }
5961
+ const cost = calculateGeminiImageCost(options.model, aspectRatio, images.length);
5962
+ return {
5963
+ images,
5964
+ model: options.model,
5965
+ usage: {
5966
+ imagesGenerated: images.length,
5967
+ size: aspectRatio,
5968
+ quality: "standard"
5969
+ },
5970
+ cost
5971
+ };
5972
+ }
5973
+ // =========================================================================
5974
+ // Speech Generation
5975
+ // =========================================================================
5976
+ getSpeechModelSpecs() {
5977
+ return geminiSpeechModels;
5978
+ }
5979
+ supportsSpeechGeneration(modelId) {
5980
+ return isGeminiSpeechModel(modelId);
5981
+ }
5982
+ async generateSpeech(options) {
5983
+ const client = this.client;
5984
+ const spec = getGeminiSpeechModelSpec(options.model);
5985
+ const voice = options.voice ?? spec?.defaultVoice ?? "Zephyr";
5986
+ const response = await client.models.generateContent({
5987
+ model: options.model,
5988
+ contents: [
5989
+ {
5990
+ role: "user",
5991
+ parts: [{ text: options.input }]
5992
+ }
5993
+ ],
5994
+ config: {
5995
+ responseModalities: [import_genai.Modality.AUDIO],
5996
+ speechConfig: {
5997
+ voiceConfig: {
5998
+ prebuiltVoiceConfig: {
5999
+ voiceName: voice
6000
+ }
6001
+ }
6002
+ }
6003
+ }
6004
+ });
6005
+ let pcmData;
6006
+ const candidate = response.candidates?.[0];
6007
+ if (candidate?.content?.parts) {
6008
+ for (const part of candidate.content.parts) {
6009
+ if ("inlineData" in part && part.inlineData?.data) {
6010
+ const base64 = part.inlineData.data;
6011
+ const binary = atob(base64);
6012
+ pcmData = new Uint8Array(binary.length);
6013
+ for (let i = 0; i < binary.length; i++) {
6014
+ pcmData[i] = binary.charCodeAt(i);
6015
+ }
6016
+ break;
6017
+ }
6018
+ }
6019
+ }
6020
+ if (!pcmData) {
6021
+ throw new Error("No audio data in Gemini TTS response");
6022
+ }
6023
+ const audioData = wrapPcmInWav(pcmData, 24e3, 16, 1);
6024
+ const cost = calculateGeminiSpeechCost(options.model, options.input.length);
6025
+ return {
6026
+ audio: audioData,
6027
+ model: options.model,
6028
+ usage: {
6029
+ characterCount: options.input.length
6030
+ },
6031
+ cost,
6032
+ format: spec?.defaultFormat ?? "wav"
6033
+ };
6034
+ }
4947
6035
  buildRequestPayload(options, descriptor, _spec, messages) {
4948
6036
  const contents = this.convertMessagesToContents(messages);
4949
6037
  const generationConfig = this.buildGenerationConfig(options);
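A hedged sketch of calling the new generation surface (image.generate / speech.generate, as exposed by the cost-reporting wrapper earlier in this file and backed by the provider methods above); how the client itself is constructed is an assumption.

```typescript
import { writeFile } from "node:fs/promises";

const image = await client.image.generate({
  model: "imagen-4.0-generate-001",
  prompt: "A watercolor lighthouse at dusk",
  size: "16:9",
  n: 2,
});
// Per the pricing table above: 2 images * $0.04 -> image.cost of $0.08

const speech = await client.speech.generate({
  model: "gemini-2.5-flash-preview-tts",
  input: "Welcome to llmist 2.5!",
  voice: "Kore",
});
await writeFile("welcome.wav", Buffer.from(speech.audio)); // 24 kHz 16-bit mono PCM in a WAV wrapper
```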
@@ -4961,7 +6049,7 @@ var init_gemini = __esm({
4961
6049
  };
4962
6050
  return {
4963
6051
  model: descriptor.name,
4964
- contents: this.convertContentsForNewSDK(contents),
6052
+ contents,
4965
6053
  config
4966
6054
  };
4967
6055
  }
@@ -4996,18 +6084,25 @@ var init_gemini = __esm({
4996
6084
  if (message.role === "system") {
4997
6085
  expandedMessages.push({
4998
6086
  role: "user",
4999
- content: message.content
6087
+ content: extractText(message.content)
5000
6088
  });
5001
6089
  expandedMessages.push({
5002
6090
  role: "assistant",
5003
6091
  content: "Understood."
5004
6092
  });
5005
6093
  } else {
5006
- expandedMessages.push(message);
6094
+ expandedMessages.push({
6095
+ role: message.role,
6096
+ content: message.content
6097
+ });
5007
6098
  }
5008
6099
  }
5009
6100
  return this.mergeConsecutiveMessages(expandedMessages);
5010
6101
  }
6102
+ /**
6103
+ * Merge consecutive messages with the same role (required by Gemini).
6104
+ * Handles multimodal content by converting to Gemini's part format.
6105
+ */
5011
6106
  mergeConsecutiveMessages(messages) {
5012
6107
  if (messages.length === 0) {
5013
6108
  return [];
@@ -5016,15 +6111,16 @@ var init_gemini = __esm({
5016
6111
  let currentGroup = null;
5017
6112
  for (const message of messages) {
5018
6113
  const geminiRole = GEMINI_ROLE_MAP[message.role];
6114
+ const geminiParts = this.convertToGeminiParts(message.content);
5019
6115
  if (currentGroup && currentGroup.role === geminiRole) {
5020
- currentGroup.parts.push({ text: message.content });
6116
+ currentGroup.parts.push(...geminiParts);
5021
6117
  } else {
5022
6118
  if (currentGroup) {
5023
6119
  result.push(currentGroup);
5024
6120
  }
5025
6121
  currentGroup = {
5026
6122
  role: geminiRole,
5027
- parts: [{ text: message.content }]
6123
+ parts: geminiParts
5028
6124
  };
5029
6125
  }
5030
6126
  }
@@ -5033,11 +6129,39 @@ var init_gemini = __esm({
5033
6129
  }
5034
6130
  return result;
5035
6131
  }
5036
- convertContentsForNewSDK(contents) {
5037
- return contents.map((content) => ({
5038
- role: content.role,
5039
- parts: content.parts.map((part) => ({ text: part.text }))
5040
- }));
6132
+ /**
6133
+ * Convert llmist content to Gemini's part format.
6134
+ * Handles text, images, and audio (Gemini supports all three).
6135
+ */
6136
+ convertToGeminiParts(content) {
6137
+ const parts = normalizeContent(content);
6138
+ return parts.map((part) => {
6139
+ if (part.type === "text") {
6140
+ return { text: part.text };
6141
+ }
6142
+ if (part.type === "image") {
6143
+ if (part.source.type === "url") {
6144
+ throw new Error(
6145
+ "Gemini does not support image URLs directly. Please provide base64-encoded image data."
6146
+ );
6147
+ }
6148
+ return {
6149
+ inlineData: {
6150
+ mimeType: part.source.mediaType,
6151
+ data: part.source.data
6152
+ }
6153
+ };
6154
+ }
6155
+ if (part.type === "audio") {
6156
+ return {
6157
+ inlineData: {
6158
+ mimeType: part.source.mediaType,
6159
+ data: part.source.data
6160
+ }
6161
+ };
6162
+ }
6163
+ throw new Error(`Unsupported content type: ${part.type}`);
6164
+ });
5041
6165
  }
5042
6166
  buildGenerationConfig(options) {
5043
6167
  const config = {};
@@ -5058,9 +6182,9 @@ var init_gemini = __esm({
5058
6182
  async *wrapStream(iterable) {
5059
6183
  const stream2 = iterable;
5060
6184
  for await (const chunk of stream2) {
5061
- const text = this.extractText(chunk);
5062
- if (text) {
5063
- yield { text, rawEvent: chunk };
6185
+ const text3 = this.extractText(chunk);
6186
+ if (text3) {
6187
+ yield { text: text3, rawEvent: chunk };
5064
6188
  }
5065
6189
  const finishReason = this.extractFinishReason(chunk);
5066
6190
  const usage = this.extractUsage(chunk);
@@ -5121,7 +6245,7 @@ var init_gemini = __esm({
5121
6245
  try {
5122
6246
  const response = await client.models.countTokens({
5123
6247
  model: descriptor.name,
5124
- contents: this.convertContentsForNewSDK(contents)
6248
+ contents
5125
6249
  // Note: systemInstruction not used - it's not supported by countTokens()
5126
6250
  // and would cause a 2100% token counting error
5127
6251
  });
@@ -5131,14 +6255,140 @@ var init_gemini = __esm({
5131
6255
  `Token counting failed for ${descriptor.name}, using fallback estimation:`,
5132
6256
  error
5133
6257
  );
5134
- const totalChars = messages.reduce((sum, msg) => sum + (msg.content?.length ?? 0), 0);
5135
- return Math.ceil(totalChars / FALLBACK_CHARS_PER_TOKEN);
6258
+ let totalChars = 0;
6259
+ let mediaCount = 0;
6260
+ for (const msg of messages) {
6261
+ const parts = normalizeContent(msg.content);
6262
+ for (const part of parts) {
6263
+ if (part.type === "text") {
6264
+ totalChars += part.text.length;
6265
+ } else if (part.type === "image" || part.type === "audio") {
6266
+ mediaCount++;
6267
+ }
6268
+ }
6269
+ }
6270
+ return Math.ceil(totalChars / FALLBACK_CHARS_PER_TOKEN) + mediaCount * 258;
5136
6271
  }
5137
6272
  }
5138
6273
  };
5139
6274
  }
5140
6275
  });
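
The `convertToGeminiParts` helper added above is the pivot for multimodal input on Gemini: text parts pass through unchanged, base64 image and audio parts become `inlineData` parts, and URL-sourced images are rejected. A minimal sketch of that mapping, using only the part shapes visible in this diff (the literal data strings are placeholders):

```typescript
// Illustrative only: an llmist base64 image part and the Gemini part that
// convertToGeminiParts produces for it.
const imagePart = {
  type: "image" as const,
  source: { type: "base64" as const, mediaType: "image/png", data: "iVBORw0KGgo..." },
};

const geminiPart = {
  inlineData: { mimeType: imagePart.source.mediaType, data: imagePart.source.data },
};

// An image part with source.type === "url" would throw instead, since this
// code path only accepts inline base64 data.
```
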
5141
6276
 
6277
+ // src/providers/openai-image-models.ts
6278
+ function getOpenAIImageModelSpec(modelId) {
6279
+ return openaiImageModels.find((m) => m.modelId === modelId);
6280
+ }
6281
+ function isOpenAIImageModel(modelId) {
6282
+ return openaiImageModels.some((m) => m.modelId === modelId);
6283
+ }
6284
+ function calculateOpenAIImageCost(modelId, size, quality = "standard", n = 1) {
6285
+ const spec = getOpenAIImageModelSpec(modelId);
6286
+ if (!spec) return void 0;
6287
+ const sizePrice = spec.pricing.bySize?.[size];
6288
+ if (sizePrice === void 0) return void 0;
6289
+ let pricePerImage;
6290
+ if (typeof sizePrice === "number") {
6291
+ pricePerImage = sizePrice;
6292
+ } else {
6293
+ pricePerImage = sizePrice[quality];
6294
+ if (pricePerImage === void 0) return void 0;
6295
+ }
6296
+ return pricePerImage * n;
6297
+ }
6298
+ var GPT_IMAGE_SIZES, GPT_IMAGE_QUALITIES, DALLE3_SIZES, DALLE3_QUALITIES, DALLE2_SIZES, openaiImageModels;
6299
+ var init_openai_image_models = __esm({
6300
+ "src/providers/openai-image-models.ts"() {
6301
+ "use strict";
6302
+ GPT_IMAGE_SIZES = ["1024x1024", "1024x1536", "1536x1024"];
6303
+ GPT_IMAGE_QUALITIES = ["low", "medium", "high"];
6304
+ DALLE3_SIZES = ["1024x1024", "1024x1792", "1792x1024"];
6305
+ DALLE3_QUALITIES = ["standard", "hd"];
6306
+ DALLE2_SIZES = ["256x256", "512x512", "1024x1024"];
6307
+ openaiImageModels = [
6308
+ // GPT Image 1 Family (flagship)
6309
+ {
6310
+ provider: "openai",
6311
+ modelId: "gpt-image-1",
6312
+ displayName: "GPT Image 1",
6313
+ pricing: {
6314
+ bySize: {
6315
+ "1024x1024": { low: 0.011, medium: 0.04, high: 0.17 },
6316
+ "1024x1536": { low: 0.016, medium: 0.06, high: 0.25 },
6317
+ "1536x1024": { low: 0.016, medium: 0.06, high: 0.25 }
6318
+ }
6319
+ },
6320
+ supportedSizes: [...GPT_IMAGE_SIZES],
6321
+ supportedQualities: [...GPT_IMAGE_QUALITIES],
6322
+ maxImages: 1,
6323
+ defaultSize: "1024x1024",
6324
+ defaultQuality: "medium",
6325
+ features: {
6326
+ textRendering: true,
6327
+ transparency: true
6328
+ }
6329
+ },
6330
+ {
6331
+ provider: "openai",
6332
+ modelId: "gpt-image-1-mini",
6333
+ displayName: "GPT Image 1 Mini",
6334
+ pricing: {
6335
+ bySize: {
6336
+ "1024x1024": { low: 5e-3, medium: 0.02, high: 0.052 },
6337
+ "1024x1536": { low: 75e-4, medium: 0.03, high: 0.078 },
6338
+ "1536x1024": { low: 75e-4, medium: 0.03, high: 0.078 }
6339
+ }
6340
+ },
6341
+ supportedSizes: [...GPT_IMAGE_SIZES],
6342
+ supportedQualities: [...GPT_IMAGE_QUALITIES],
6343
+ maxImages: 1,
6344
+ defaultSize: "1024x1024",
6345
+ defaultQuality: "medium",
6346
+ features: {
6347
+ textRendering: true,
6348
+ transparency: true
6349
+ }
6350
+ },
6351
+ // DALL-E Family
6352
+ {
6353
+ provider: "openai",
6354
+ modelId: "dall-e-3",
6355
+ displayName: "DALL-E 3",
6356
+ pricing: {
6357
+ bySize: {
6358
+ "1024x1024": { standard: 0.04, hd: 0.08 },
6359
+ "1024x1792": { standard: 0.08, hd: 0.12 },
6360
+ "1792x1024": { standard: 0.08, hd: 0.12 }
6361
+ }
6362
+ },
6363
+ supportedSizes: [...DALLE3_SIZES],
6364
+ supportedQualities: [...DALLE3_QUALITIES],
6365
+ maxImages: 1,
6366
+ // DALL-E 3 only supports n=1
6367
+ defaultSize: "1024x1024",
6368
+ defaultQuality: "standard",
6369
+ features: {
6370
+ textRendering: true
6371
+ }
6372
+ },
6373
+ {
6374
+ provider: "openai",
6375
+ modelId: "dall-e-2",
6376
+ displayName: "DALL-E 2 (Legacy)",
6377
+ pricing: {
6378
+ bySize: {
6379
+ "256x256": 0.016,
6380
+ "512x512": 0.018,
6381
+ "1024x1024": 0.02
6382
+ }
6383
+ },
6384
+ supportedSizes: [...DALLE2_SIZES],
6385
+ maxImages: 10,
6386
+ defaultSize: "1024x1024"
6387
+ }
6388
+ ];
6389
+ }
6390
+ });
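
For orientation, `calculateOpenAIImageCost` above simply multiplies a per-image price looked up in this table by the image count. A worked sketch of that arithmetic with values taken from the table (illustrative only, not a pricing statement):

```typescript
// calculateOpenAIImageCost("dall-e-3", "1024x1792", "hd", 2)
//   -> pricePerImage 0.12, cost 0.12 * 2 = 0.24
const dallE3Cost = 0.12 * 2;

// calculateOpenAIImageCost("gpt-image-1", "1024x1024", "medium", 1) -> 0.04
const gptImageCost = 0.04;

// DALL-E 2 prices are flat per size, so the quality argument is ignored:
// calculateOpenAIImageCost("dall-e-2", "512x512", "standard", 3) -> 0.018 * 3 = 0.054
const dallE2Cost = 0.018 * 3;
```
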
6391
+
5142
6392
  // src/providers/openai-models.ts
5143
6393
  var OPENAI_MODELS;
5144
6394
  var init_openai_models = __esm({
@@ -5503,6 +6753,144 @@ var init_openai_models = __esm({
5503
6753
  }
5504
6754
  });
5505
6755
 
6756
+ // src/providers/openai-speech-models.ts
6757
+ function getOpenAISpeechModelSpec(modelId) {
6758
+ return openaiSpeechModels.find((m) => m.modelId === modelId);
6759
+ }
6760
+ function isOpenAISpeechModel(modelId) {
6761
+ return openaiSpeechModels.some((m) => m.modelId === modelId);
6762
+ }
6763
+ function calculateOpenAISpeechCost(modelId, characterCount, estimatedMinutes) {
6764
+ const spec = getOpenAISpeechModelSpec(modelId);
6765
+ if (!spec) return void 0;
6766
+ if (spec.pricing.perCharacter !== void 0) {
6767
+ return characterCount * spec.pricing.perCharacter;
6768
+ }
6769
+ if (spec.pricing.perMinute !== void 0 && estimatedMinutes !== void 0) {
6770
+ return estimatedMinutes * spec.pricing.perMinute;
6771
+ }
6772
+ if (spec.pricing.perMinute !== void 0) {
6773
+ const approxMinutes = characterCount / 750;
6774
+ return approxMinutes * spec.pricing.perMinute;
6775
+ }
6776
+ return void 0;
6777
+ }
6778
+ var OPENAI_TTS_VOICES, OPENAI_TTS_EXTENDED_VOICES, OPENAI_TTS_FORMATS, openaiSpeechModels;
6779
+ var init_openai_speech_models = __esm({
6780
+ "src/providers/openai-speech-models.ts"() {
6781
+ "use strict";
6782
+ OPENAI_TTS_VOICES = [
6783
+ "alloy",
6784
+ "echo",
6785
+ "fable",
6786
+ "onyx",
6787
+ "nova",
6788
+ "shimmer"
6789
+ ];
6790
+ OPENAI_TTS_EXTENDED_VOICES = [
6791
+ ...OPENAI_TTS_VOICES,
6792
+ "ash",
6793
+ "ballad",
6794
+ "coral",
6795
+ "sage",
6796
+ "verse"
6797
+ ];
6798
+ OPENAI_TTS_FORMATS = ["mp3", "opus", "aac", "flac", "wav", "pcm"];
6799
+ openaiSpeechModels = [
6800
+ // Standard TTS models (character-based pricing)
6801
+ {
6802
+ provider: "openai",
6803
+ modelId: "tts-1",
6804
+ displayName: "TTS-1",
6805
+ pricing: {
6806
+ // $15 per 1M characters = $0.000015 per character
6807
+ perCharacter: 15e-6
6808
+ },
6809
+ voices: [...OPENAI_TTS_VOICES],
6810
+ formats: OPENAI_TTS_FORMATS,
6811
+ maxInputLength: 4096,
6812
+ defaultVoice: "alloy",
6813
+ defaultFormat: "mp3",
6814
+ features: {
6815
+ voiceInstructions: false
6816
+ }
6817
+ },
6818
+ {
6819
+ provider: "openai",
6820
+ modelId: "tts-1-1106",
6821
+ displayName: "TTS-1 (Nov 2023)",
6822
+ pricing: {
6823
+ perCharacter: 15e-6
6824
+ },
6825
+ voices: [...OPENAI_TTS_VOICES],
6826
+ formats: OPENAI_TTS_FORMATS,
6827
+ maxInputLength: 4096,
6828
+ defaultVoice: "alloy",
6829
+ defaultFormat: "mp3",
6830
+ features: {
6831
+ voiceInstructions: false
6832
+ }
6833
+ },
6834
+ {
6835
+ provider: "openai",
6836
+ modelId: "tts-1-hd",
6837
+ displayName: "TTS-1 HD",
6838
+ pricing: {
6839
+ // $30 per 1M characters = $0.00003 per character
6840
+ perCharacter: 3e-5
6841
+ },
6842
+ voices: [...OPENAI_TTS_VOICES],
6843
+ formats: OPENAI_TTS_FORMATS,
6844
+ maxInputLength: 4096,
6845
+ defaultVoice: "alloy",
6846
+ defaultFormat: "mp3",
6847
+ features: {
6848
+ voiceInstructions: false
6849
+ }
6850
+ },
6851
+ {
6852
+ provider: "openai",
6853
+ modelId: "tts-1-hd-1106",
6854
+ displayName: "TTS-1 HD (Nov 2023)",
6855
+ pricing: {
6856
+ perCharacter: 3e-5
6857
+ },
6858
+ voices: [...OPENAI_TTS_VOICES],
6859
+ formats: OPENAI_TTS_FORMATS,
6860
+ maxInputLength: 4096,
6861
+ defaultVoice: "alloy",
6862
+ defaultFormat: "mp3",
6863
+ features: {
6864
+ voiceInstructions: false
6865
+ }
6866
+ },
6867
+ // Token-based TTS model with voice instructions support
6868
+ {
6869
+ provider: "openai",
6870
+ modelId: "gpt-4o-mini-tts",
6871
+ displayName: "GPT-4o Mini TTS",
6872
+ pricing: {
6873
+ // $0.60 per 1M input tokens = $0.0000006 per token
6874
+ perInputToken: 6e-7,
6875
+ // $12 per 1M audio output tokens = $0.000012 per token
6876
+ perAudioOutputToken: 12e-6,
6877
+ // ~$0.015 per minute of audio
6878
+ perMinute: 0.015
6879
+ },
6880
+ voices: [...OPENAI_TTS_EXTENDED_VOICES],
6881
+ formats: OPENAI_TTS_FORMATS,
6882
+ maxInputLength: 2e3,
6883
+ // tokens, not characters
6884
+ defaultVoice: "alloy",
6885
+ defaultFormat: "mp3",
6886
+ features: {
6887
+ voiceInstructions: true
6888
+ }
6889
+ }
6890
+ ];
6891
+ }
6892
+ });
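
`calculateOpenAISpeechCost` above prefers per-character pricing, then an explicit per-minute duration, then a rough `characterCount / 750` minutes approximation. A small arithmetic sketch using the specs in this table (illustrative only):

```typescript
// tts-1 (per-character pricing), 1,000 characters:
//   1000 * 0.000015 = 0.015
const tts1Cost = 1000 * 15e-6;

// gpt-4o-mini-tts with no duration supplied: minutes are approximated as
// characterCount / 750 before applying the per-minute rate:
//   (1500 / 750) * 0.015 = 0.03
const miniTtsCost = (1500 / 750) * 0.015;
```
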
6893
+
5506
6894
  // src/providers/openai.ts
5507
6895
  function sanitizeExtra(extra, allowTemperature) {
5508
6896
  if (!extra) {
@@ -5522,9 +6910,12 @@ var init_openai = __esm({
5522
6910
  "use strict";
5523
6911
  import_openai = __toESM(require("openai"), 1);
5524
6912
  import_tiktoken = require("tiktoken");
6913
+ init_messages();
5525
6914
  init_base_provider();
5526
6915
  init_constants2();
6916
+ init_openai_image_models();
5527
6917
  init_openai_models();
6918
+ init_openai_speech_models();
5528
6919
  init_utils();
5529
6920
  ROLE_MAP = {
5530
6921
  system: "system",
@@ -5539,6 +6930,87 @@ var init_openai = __esm({
5539
6930
  getModelSpecs() {
5540
6931
  return OPENAI_MODELS;
5541
6932
  }
6933
+ // =========================================================================
6934
+ // Image Generation
6935
+ // =========================================================================
6936
+ getImageModelSpecs() {
6937
+ return openaiImageModels;
6938
+ }
6939
+ supportsImageGeneration(modelId) {
6940
+ return isOpenAIImageModel(modelId);
6941
+ }
6942
+ async generateImage(options) {
6943
+ const client = this.client;
6944
+ const spec = getOpenAIImageModelSpec(options.model);
6945
+ const size = options.size ?? spec?.defaultSize ?? "1024x1024";
6946
+ const quality = options.quality ?? spec?.defaultQuality ?? "standard";
6947
+ const n = options.n ?? 1;
6948
+ const isDallE2 = options.model === "dall-e-2";
6949
+ const isGptImage = options.model.startsWith("gpt-image");
6950
+ const requestParams = {
6951
+ model: options.model,
6952
+ prompt: options.prompt,
6953
+ size,
6954
+ n
6955
+ };
6956
+ if (!isDallE2 && !isGptImage) {
6957
+ requestParams.quality = quality;
6958
+ }
6959
+ if (isGptImage) {
6960
+ } else if (!isDallE2) {
6961
+ requestParams.response_format = options.responseFormat ?? "url";
6962
+ }
6963
+ const response = await client.images.generate(requestParams);
6964
+ const cost = calculateOpenAIImageCost(options.model, size, quality, n);
6965
+ const images = response.data ?? [];
6966
+ return {
6967
+ images: images.map((img) => ({
6968
+ url: img.url,
6969
+ b64Json: img.b64_json,
6970
+ revisedPrompt: img.revised_prompt
6971
+ })),
6972
+ model: options.model,
6973
+ usage: {
6974
+ imagesGenerated: images.length,
6975
+ size,
6976
+ quality
6977
+ },
6978
+ cost
6979
+ };
6980
+ }
6981
+ // =========================================================================
6982
+ // Speech Generation
6983
+ // =========================================================================
6984
+ getSpeechModelSpecs() {
6985
+ return openaiSpeechModels;
6986
+ }
6987
+ supportsSpeechGeneration(modelId) {
6988
+ return isOpenAISpeechModel(modelId);
6989
+ }
6990
+ async generateSpeech(options) {
6991
+ const client = this.client;
6992
+ const spec = getOpenAISpeechModelSpec(options.model);
6993
+ const format = options.responseFormat ?? spec?.defaultFormat ?? "mp3";
6994
+ const voice = options.voice ?? spec?.defaultVoice ?? "alloy";
6995
+ const response = await client.audio.speech.create({
6996
+ model: options.model,
6997
+ input: options.input,
6998
+ voice,
6999
+ response_format: format,
7000
+ speed: options.speed ?? 1
7001
+ });
7002
+ const audioBuffer = await response.arrayBuffer();
7003
+ const cost = calculateOpenAISpeechCost(options.model, options.input.length);
7004
+ return {
7005
+ audio: audioBuffer,
7006
+ model: options.model,
7007
+ usage: {
7008
+ characterCount: options.input.length
7009
+ },
7010
+ cost,
7011
+ format
7012
+ };
7013
+ }
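
The two provider methods above return plain result objects. The field names below are copied from the code in this hunk and written out as a sketch for orientation; the interface names themselves are not part of the package:

```typescript
// Shape of the generateImage() result (fields taken from the code above):
interface ImageResultSketch {
  images: Array<{ url?: string; b64Json?: string; revisedPrompt?: string }>;
  model: string;
  usage: { imagesGenerated: number; size: string; quality: string };
  cost?: number; // undefined when the model/size combination has no pricing entry
}

// Shape of the generateSpeech() result:
interface SpeechResultSketch {
  audio: ArrayBuffer;
  model: string;
  usage: { characterCount: number };
  cost?: number;
  format: string; // "mp3" | "opus" | "aac" | "flac" | "wav" | "pcm"
}
```
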
5542
7014
  buildRequestPayload(options, descriptor, spec, messages) {
5543
7015
  const { maxTokens, temperature, topP, stopSequences, extra } = options;
5544
7016
  const supportsTemperature = spec?.metadata?.supportsTemperature !== false;
@@ -5546,11 +7018,7 @@ var init_openai = __esm({
5546
7018
  const sanitizedExtra = sanitizeExtra(extra, shouldIncludeTemperature);
5547
7019
  return {
5548
7020
  model: descriptor.name,
5549
- messages: messages.map((message) => ({
5550
- role: ROLE_MAP[message.role],
5551
- content: message.content,
5552
- name: message.name
5553
- })),
7021
+ messages: messages.map((message) => this.convertToOpenAIMessage(message)),
5554
7022
  // Only set max_completion_tokens if explicitly provided
5555
7023
  // Otherwise let the API use "as much as fits" in the context window
5556
7024
  ...maxTokens !== void 0 ? { max_completion_tokens: maxTokens } : {},
@@ -5562,6 +7030,77 @@ var init_openai = __esm({
5562
7030
  ...shouldIncludeTemperature ? { temperature } : {}
5563
7031
  };
5564
7032
  }
7033
+ /**
7034
+ * Convert an LLMMessage to OpenAI's ChatCompletionMessageParam.
7035
+ * Handles role-specific content type requirements:
7036
+ * - system/assistant: string content only
7037
+ * - user: string or multimodal array content
7038
+ */
7039
+ convertToOpenAIMessage(message) {
7040
+ const role = ROLE_MAP[message.role];
7041
+ if (role === "user") {
7042
+ const content = this.convertToOpenAIContent(message.content);
7043
+ return {
7044
+ role: "user",
7045
+ content,
7046
+ ...message.name ? { name: message.name } : {}
7047
+ };
7048
+ }
7049
+ const textContent = typeof message.content === "string" ? message.content : extractText(message.content);
7050
+ if (role === "system") {
7051
+ return {
7052
+ role: "system",
7053
+ content: textContent,
7054
+ ...message.name ? { name: message.name } : {}
7055
+ };
7056
+ }
7057
+ return {
7058
+ role: "assistant",
7059
+ content: textContent,
7060
+ ...message.name ? { name: message.name } : {}
7061
+ };
7062
+ }
7063
+ /**
7064
+ * Convert llmist content to OpenAI's content format.
7065
+ * Optimizes by returning string for text-only content, array for multimodal.
7066
+ */
7067
+ convertToOpenAIContent(content) {
7068
+ if (typeof content === "string") {
7069
+ return content;
7070
+ }
7071
+ return content.map((part) => {
7072
+ if (part.type === "text") {
7073
+ return { type: "text", text: part.text };
7074
+ }
7075
+ if (part.type === "image") {
7076
+ return this.convertImagePart(part);
7077
+ }
7078
+ if (part.type === "audio") {
7079
+ throw new Error(
7080
+ "OpenAI chat completions do not support audio input. Use Whisper for transcription or Gemini for audio understanding."
7081
+ );
7082
+ }
7083
+ throw new Error(`Unsupported content type: ${part.type}`);
7084
+ });
7085
+ }
7086
+ /**
7087
+ * Convert an image content part to OpenAI's image_url format.
7088
+ * Supports both URLs and base64 data URLs.
7089
+ */
7090
+ convertImagePart(part) {
7091
+ if (part.source.type === "url") {
7092
+ return {
7093
+ type: "image_url",
7094
+ image_url: { url: part.source.url }
7095
+ };
7096
+ }
7097
+ return {
7098
+ type: "image_url",
7099
+ image_url: {
7100
+ url: `data:${part.source.mediaType};base64,${part.source.data}`
7101
+ }
7102
+ };
7103
+ }
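
`convertToOpenAIContent` above keeps plain strings as-is and turns content arrays into OpenAI's part format, with images emitted as `image_url` entries (URLs passed through, base64 wrapped in a data URL) and audio rejected. A sketch of that mapping (the sample strings are placeholders):

```typescript
// Illustrative input: one text part plus one base64 image part.
const llmistContent = [
  { type: "text" as const, text: "What is in this picture?" },
  {
    type: "image" as const,
    source: { type: "base64" as const, mediaType: "image/jpeg", data: "/9j/4AAQ..." },
  },
];

// What convertToOpenAIContent produces for it:
const openaiContent = [
  { type: "text", text: "What is in this picture?" },
  { type: "image_url", image_url: { url: "data:image/jpeg;base64,/9j/4AAQ..." } },
];
```
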
5565
7104
  async executeStreamRequest(payload, signal) {
5566
7105
  const client = this.client;
5567
7106
  const stream2 = await client.chat.completions.create(payload, signal ? { signal } : void 0);
@@ -5570,9 +7109,9 @@ var init_openai = __esm({
5570
7109
  async *wrapStream(iterable) {
5571
7110
  const stream2 = iterable;
5572
7111
  for await (const chunk of stream2) {
5573
- const text = chunk.choices.map((choice) => choice.delta?.content ?? "").join("");
5574
- if (text) {
5575
- yield { text, rawEvent: chunk };
7112
+ const text3 = chunk.choices.map((choice) => choice.delta?.content ?? "").join("");
7113
+ if (text3) {
7114
+ yield { text: text3, rawEvent: chunk };
5576
7115
  }
5577
7116
  const finishReason = chunk.choices.find((choice) => choice.finish_reason)?.finish_reason;
5578
7117
  const usage = chunk.usage ? {
@@ -5620,17 +7159,26 @@ var init_openai = __esm({
5620
7159
  }
5621
7160
  try {
5622
7161
  let tokenCount = 0;
7162
+ let imageCount = 0;
5623
7163
  for (const message of messages) {
5624
7164
  tokenCount += OPENAI_MESSAGE_OVERHEAD_TOKENS;
5625
7165
  const roleText = ROLE_MAP[message.role];
5626
7166
  tokenCount += encoding.encode(roleText).length;
5627
- tokenCount += encoding.encode(message.content ?? "").length;
7167
+ const textContent = extractText(message.content);
7168
+ tokenCount += encoding.encode(textContent).length;
7169
+ const parts = normalizeContent(message.content);
7170
+ for (const part of parts) {
7171
+ if (part.type === "image") {
7172
+ imageCount++;
7173
+ }
7174
+ }
5628
7175
  if (message.name) {
5629
7176
  tokenCount += encoding.encode(message.name).length;
5630
7177
  tokenCount += OPENAI_NAME_FIELD_OVERHEAD_TOKENS;
5631
7178
  }
5632
7179
  }
5633
7180
  tokenCount += OPENAI_REPLY_PRIMING_TOKENS;
7181
+ tokenCount += imageCount * 765;
5634
7182
  return tokenCount;
5635
7183
  } finally {
5636
7184
  encoding.free();
@@ -5640,8 +7188,19 @@ var init_openai = __esm({
5640
7188
  `Token counting failed for ${descriptor.name}, using fallback estimation:`,
5641
7189
  error
5642
7190
  );
5643
- const totalChars = messages.reduce((sum, msg) => sum + (msg.content?.length ?? 0), 0);
5644
- return Math.ceil(totalChars / FALLBACK_CHARS_PER_TOKEN);
7191
+ let totalChars = 0;
7192
+ let imageCount = 0;
7193
+ for (const msg of messages) {
7194
+ const parts = normalizeContent(msg.content);
7195
+ for (const part of parts) {
7196
+ if (part.type === "text") {
7197
+ totalChars += part.text.length;
7198
+ } else if (part.type === "image") {
7199
+ imageCount++;
7200
+ }
7201
+ }
7202
+ }
7203
+ return Math.ceil(totalChars / FALLBACK_CHARS_PER_TOKEN) + imageCount * 765;
5645
7204
  }
5646
7205
  }
5647
7206
  };
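
Both the tiktoken path and the character-based fallback above now add a flat 765 tokens per image. A quick arithmetic sketch of the fallback; `FALLBACK_CHARS_PER_TOKEN` is defined elsewhere in this bundle, so the divisor of 4 below is an assumption for illustration only:

```typescript
const totalChars = 2000; // characters across all text parts
const imageCount = 2;    // image parts found in the messages
const estimate = Math.ceil(totalChars / 4) + imageCount * 765; // 500 + 1530 = 2030
```
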
@@ -5879,30 +7438,109 @@ var init_model_registry = __esm({
5879
7438
  }
5880
7439
  });
5881
7440
 
5882
- // src/core/options.ts
5883
- var ModelIdentifierParser;
5884
- var init_options = __esm({
5885
- "src/core/options.ts"() {
7441
+ // src/core/namespaces/image.ts
7442
+ var ImageNamespace;
7443
+ var init_image = __esm({
7444
+ "src/core/namespaces/image.ts"() {
5886
7445
  "use strict";
5887
- ModelIdentifierParser = class {
5888
- constructor(defaultProvider = "openai") {
7446
+ ImageNamespace = class {
7447
+ constructor(adapters, defaultProvider) {
7448
+ this.adapters = adapters;
5889
7449
  this.defaultProvider = defaultProvider;
5890
7450
  }
5891
- parse(identifier) {
5892
- const trimmed = identifier.trim();
5893
- if (!trimmed) {
5894
- throw new Error("Model identifier cannot be empty");
7451
+ /**
7452
+ * Generate images from a text prompt.
7453
+ *
7454
+ * @param options - Image generation options
7455
+ * @returns Promise resolving to the generation result with images and cost
7456
+ * @throws Error if the provider doesn't support image generation
7457
+ */
7458
+ async generate(options) {
7459
+ const modelId = options.model;
7460
+ const adapter = this.findImageAdapter(modelId);
7461
+ if (!adapter || !adapter.generateImage) {
7462
+ throw new Error(
7463
+ `No provider supports image generation for model "${modelId}". Available image models: ${this.listModels().map((m) => m.modelId).join(", ")}`
7464
+ );
5895
7465
  }
5896
- const [maybeProvider, ...rest] = trimmed.split(":");
5897
- if (rest.length === 0) {
5898
- return { provider: this.defaultProvider, name: maybeProvider };
7466
+ return adapter.generateImage(options);
7467
+ }
7468
+ /**
7469
+ * List all available image generation models.
7470
+ */
7471
+ listModels() {
7472
+ const models = [];
7473
+ for (const adapter of this.adapters) {
7474
+ if (adapter.getImageModelSpecs) {
7475
+ models.push(...adapter.getImageModelSpecs());
7476
+ }
5899
7477
  }
5900
- const provider = maybeProvider;
5901
- const name = rest.join(":");
5902
- if (!name) {
5903
- throw new Error("Model name cannot be empty");
7478
+ return models;
7479
+ }
7480
+ /**
7481
+ * Check if a model is supported for image generation.
7482
+ */
7483
+ supportsModel(modelId) {
7484
+ return this.findImageAdapter(modelId) !== void 0;
7485
+ }
7486
+ findImageAdapter(modelId) {
7487
+ return this.adapters.find(
7488
+ (adapter) => adapter.supportsImageGeneration?.(modelId) ?? false
7489
+ );
7490
+ }
7491
+ };
7492
+ }
7493
+ });
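
A usage sketch for the image namespace defined above. Assumptions not shown in this hunk: that `LLMist` is importable from the package root (the agent examples elsewhere in this diff import from "llmist") and that a zero-argument constructor picks up provider API keys from the environment; the prompt string is a placeholder.

```typescript
import { LLMist } from "llmist"; // assumed export

async function main() {
  const llmist = new LLMist(); // assumes provider keys in the environment
  const result = await llmist.image.generate({
    model: "gpt-image-1",                  // ids, sizes, and qualities come from the specs above
    prompt: "A watercolor fox in a forest",
    size: "1024x1024",
    quality: "medium",
  });
  console.log(result.usage.imagesGenerated, result.cost);
}

main();
```
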
7494
+
7495
+ // src/core/namespaces/speech.ts
7496
+ var SpeechNamespace;
7497
+ var init_speech = __esm({
7498
+ "src/core/namespaces/speech.ts"() {
7499
+ "use strict";
7500
+ SpeechNamespace = class {
7501
+ constructor(adapters, defaultProvider) {
7502
+ this.adapters = adapters;
7503
+ this.defaultProvider = defaultProvider;
7504
+ }
7505
+ /**
7506
+ * Generate speech audio from text.
7507
+ *
7508
+ * @param options - Speech generation options
7509
+ * @returns Promise resolving to the generation result with audio and cost
7510
+ * @throws Error if the provider doesn't support speech generation
7511
+ */
7512
+ async generate(options) {
7513
+ const modelId = options.model;
7514
+ const adapter = this.findSpeechAdapter(modelId);
7515
+ if (!adapter || !adapter.generateSpeech) {
7516
+ throw new Error(
7517
+ `No provider supports speech generation for model "${modelId}". Available speech models: ${this.listModels().map((m) => m.modelId).join(", ")}`
7518
+ );
5904
7519
  }
5905
- return { provider, name };
7520
+ return adapter.generateSpeech(options);
7521
+ }
7522
+ /**
7523
+ * List all available speech generation models.
7524
+ */
7525
+ listModels() {
7526
+ const models = [];
7527
+ for (const adapter of this.adapters) {
7528
+ if (adapter.getSpeechModelSpecs) {
7529
+ models.push(...adapter.getSpeechModelSpecs());
7530
+ }
7531
+ }
7532
+ return models;
7533
+ }
7534
+ /**
7535
+ * Check if a model is supported for speech generation.
7536
+ */
7537
+ supportsModel(modelId) {
7538
+ return this.findSpeechAdapter(modelId) !== void 0;
7539
+ }
7540
+ findSpeechAdapter(modelId) {
7541
+ return this.adapters.find(
7542
+ (adapter) => adapter.supportsSpeechGeneration?.(modelId) ?? false
7543
+ );
5906
7544
  }
5907
7545
  };
5908
7546
  }
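
A matching usage sketch for the speech namespace above, under the same assumptions (`LLMist` exported from the package root, API keys in the environment); the output filename is a placeholder.

```typescript
import { writeFile } from "fs/promises";
import { LLMist } from "llmist"; // assumed export

async function main() {
  const llmist = new LLMist();
  const speech = await llmist.speech.generate({
    model: "tts-1",              // voices, formats, and pricing come from the specs above
    input: "Hello from llmist.",
    voice: "nova",
    responseFormat: "mp3",
  });
  await writeFile("hello.mp3", Buffer.from(speech.audio));
  console.log(speech.usage.characterCount, speech.cost);
}

main();
```
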
@@ -5951,6 +7589,201 @@ var init_quick_methods = __esm({
5951
7589
  }
5952
7590
  });
5953
7591
 
7592
+ // src/core/namespaces/text.ts
7593
+ var TextNamespace;
7594
+ var init_text = __esm({
7595
+ "src/core/namespaces/text.ts"() {
7596
+ "use strict";
7597
+ init_quick_methods();
7598
+ TextNamespace = class {
7599
+ constructor(client) {
7600
+ this.client = client;
7601
+ }
7602
+ /**
7603
+ * Generate a complete text response.
7604
+ *
7605
+ * @param prompt - User prompt
7606
+ * @param options - Optional configuration
7607
+ * @returns Complete text response
7608
+ */
7609
+ async complete(prompt, options) {
7610
+ return complete(this.client, prompt, options);
7611
+ }
7612
+ /**
7613
+ * Stream text chunks.
7614
+ *
7615
+ * @param prompt - User prompt
7616
+ * @param options - Optional configuration
7617
+ * @returns Async generator yielding text chunks
7618
+ */
7619
+ stream(prompt, options) {
7620
+ return stream(this.client, prompt, options);
7621
+ }
7622
+ };
7623
+ }
7624
+ });
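
A usage sketch for the text namespace above, which just forwards to the quick `complete`/`stream` helpers. The import and zero-argument constructor are assumptions as before, and the shape of the streamed chunks is not shown in this hunk, so it is only logged.

```typescript
import { LLMist } from "llmist"; // assumed export

async function main() {
  const llmist = new LLMist();

  const answer = await llmist.text.complete("Summarize streaming tool execution in one sentence.");
  console.log(answer);

  for await (const chunk of llmist.text.stream("Write a haiku about fog.")) {
    console.log(chunk); // chunk shape depends on the quick `stream` helper defined elsewhere
  }
}

main();
```
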
7625
+
7626
+ // src/core/namespaces/vision.ts
7627
+ var VisionNamespace;
7628
+ var init_vision = __esm({
7629
+ "src/core/namespaces/vision.ts"() {
7630
+ "use strict";
7631
+ init_input_content();
7632
+ init_messages();
7633
+ VisionNamespace = class {
7634
+ constructor(client) {
7635
+ this.client = client;
7636
+ }
7637
+ /**
7638
+ * Build a message builder with the image content attached.
7639
+ * Handles URLs, data URLs, base64 strings, and binary buffers.
7640
+ */
7641
+ buildImageMessage(options) {
7642
+ const builder = new LLMMessageBuilder();
7643
+ if (options.systemPrompt) {
7644
+ builder.addSystem(options.systemPrompt);
7645
+ }
7646
+ if (typeof options.image === "string") {
7647
+ if (options.image.startsWith("http://") || options.image.startsWith("https://")) {
7648
+ builder.addUserWithImageUrl(options.prompt, options.image);
7649
+ } else if (isDataUrl(options.image)) {
7650
+ const parsed = parseDataUrl(options.image);
7651
+ if (!parsed) {
7652
+ throw new Error("Invalid data URL format");
7653
+ }
7654
+ builder.addUserWithImage(
7655
+ options.prompt,
7656
+ parsed.data,
7657
+ parsed.mimeType
7658
+ );
7659
+ } else {
7660
+ const buffer = Buffer.from(options.image, "base64");
7661
+ builder.addUserWithImage(options.prompt, buffer, options.mimeType);
7662
+ }
7663
+ } else {
7664
+ builder.addUserWithImage(options.prompt, options.image, options.mimeType);
7665
+ }
7666
+ return builder;
7667
+ }
7668
+ /**
7669
+ * Stream the response and collect text and usage information.
7670
+ */
7671
+ async streamAndCollect(options, builder) {
7672
+ let response = "";
7673
+ let finalUsage;
7674
+ for await (const chunk of this.client.stream({
7675
+ model: options.model,
7676
+ messages: builder.build(),
7677
+ maxTokens: options.maxTokens,
7678
+ temperature: options.temperature
7679
+ })) {
7680
+ response += chunk.text;
7681
+ if (chunk.usage) {
7682
+ finalUsage = {
7683
+ inputTokens: chunk.usage.inputTokens,
7684
+ outputTokens: chunk.usage.outputTokens,
7685
+ totalTokens: chunk.usage.totalTokens
7686
+ };
7687
+ }
7688
+ }
7689
+ return { text: response.trim(), usage: finalUsage };
7690
+ }
7691
+ /**
7692
+ * Analyze an image with a vision-capable model.
7693
+ * Returns the analysis as a string.
7694
+ *
7695
+ * @param options - Vision analysis options
7696
+ * @returns Promise resolving to the analysis text
7697
+ * @throws Error if the image format is unsupported or model doesn't support vision
7698
+ *
7699
+ * @example
7700
+ * ```typescript
7701
+ * // From file
7702
+ * const result = await llmist.vision.analyze({
7703
+ * model: "gpt-4o",
7704
+ * image: await fs.readFile("photo.jpg"),
7705
+ * prompt: "What's in this image?",
7706
+ * });
7707
+ *
7708
+ * // From URL (OpenAI only)
7709
+ * const result = await llmist.vision.analyze({
7710
+ * model: "gpt-4o",
7711
+ * image: "https://example.com/image.jpg",
7712
+ * prompt: "Describe this image",
7713
+ * });
7714
+ * ```
7715
+ */
7716
+ async analyze(options) {
7717
+ const builder = this.buildImageMessage(options);
7718
+ const { text: text3 } = await this.streamAndCollect(options, builder);
7719
+ return text3;
7720
+ }
7721
+ /**
7722
+ * Analyze an image and return detailed result with usage info.
7723
+ *
7724
+ * @param options - Vision analysis options
7725
+ * @returns Promise resolving to the analysis result with usage info
7726
+ */
7727
+ async analyzeWithUsage(options) {
7728
+ const builder = this.buildImageMessage(options);
7729
+ const { text: text3, usage } = await this.streamAndCollect(options, builder);
7730
+ return {
7731
+ text: text3,
7732
+ model: options.model,
7733
+ usage
7734
+ };
7735
+ }
7736
+ /**
7737
+ * Check if a model supports vision/image input.
7738
+ *
7739
+ * @param modelId - Model ID to check
7740
+ * @returns True if the model supports vision
7741
+ */
7742
+ supportsModel(modelId) {
7743
+ const spec = this.client.modelRegistry.getModelSpec(modelId);
7744
+ return spec?.features?.vision === true;
7745
+ }
7746
+ /**
7747
+ * List all models that support vision.
7748
+ *
7749
+ * @returns Array of model IDs that support vision
7750
+ */
7751
+ listModels() {
7752
+ return this.client.modelRegistry.listModels().filter((spec) => spec.features?.vision === true).map((spec) => spec.modelId);
7753
+ }
7754
+ };
7755
+ }
7756
+ });
7757
+
7758
+ // src/core/options.ts
7759
+ var ModelIdentifierParser;
7760
+ var init_options = __esm({
7761
+ "src/core/options.ts"() {
7762
+ "use strict";
7763
+ ModelIdentifierParser = class {
7764
+ constructor(defaultProvider = "openai") {
7765
+ this.defaultProvider = defaultProvider;
7766
+ }
7767
+ parse(identifier) {
7768
+ const trimmed = identifier.trim();
7769
+ if (!trimmed) {
7770
+ throw new Error("Model identifier cannot be empty");
7771
+ }
7772
+ const [maybeProvider, ...rest] = trimmed.split(":");
7773
+ if (rest.length === 0) {
7774
+ return { provider: this.defaultProvider, name: maybeProvider };
7775
+ }
7776
+ const provider = maybeProvider;
7777
+ const name = rest.join(":");
7778
+ if (!name) {
7779
+ throw new Error("Model name cannot be empty");
7780
+ }
7781
+ return { provider, name };
7782
+ }
7783
+ };
7784
+ }
7785
+ });
7786
+
5954
7787
  // src/core/client.ts
5955
7788
  var client_exports = {};
5956
7789
  __export(client_exports, {
@@ -5963,12 +7796,22 @@ var init_client = __esm({
5963
7796
  init_builder();
5964
7797
  init_discovery();
5965
7798
  init_model_registry();
7799
+ init_image();
7800
+ init_speech();
7801
+ init_text();
7802
+ init_vision();
5966
7803
  init_options();
5967
7804
  init_quick_methods();
5968
7805
  LLMist = class _LLMist {
5969
7806
  parser;
7807
+ defaultProvider;
5970
7808
  modelRegistry;
5971
7809
  adapters;
7810
+ // Namespaces for different generation types
7811
+ text;
7812
+ image;
7813
+ speech;
7814
+ vision;
5972
7815
  constructor(...args) {
5973
7816
  let adapters = [];
5974
7817
  let defaultProvider;
@@ -6007,6 +7850,7 @@ var init_client = __esm({
6007
7850
  const priorityB = b.priority ?? 0;
6008
7851
  return priorityB - priorityA;
6009
7852
  });
7853
+ this.defaultProvider = resolvedDefaultProvider;
6010
7854
  this.parser = new ModelIdentifierParser(resolvedDefaultProvider);
6011
7855
  this.modelRegistry = new ModelRegistry();
6012
7856
  for (const adapter of this.adapters) {
@@ -6015,6 +7859,10 @@ var init_client = __esm({
6015
7859
  if (customModels.length > 0) {
6016
7860
  this.modelRegistry.registerModels(customModels);
6017
7861
  }
7862
+ this.text = new TextNamespace(this);
7863
+ this.image = new ImageNamespace(this.adapters, this.defaultProvider);
7864
+ this.speech = new SpeechNamespace(this.adapters, this.defaultProvider);
7865
+ this.vision = new VisionNamespace(this);
6018
7866
  }
6019
7867
  stream(options) {
6020
7868
  const descriptor = this.parser.parse(options.model);
@@ -6199,6 +8047,7 @@ var init_builder = __esm({
6199
8047
  "src/agent/builder.ts"() {
6200
8048
  "use strict";
6201
8049
  init_constants();
8050
+ init_input_content();
6202
8051
  init_model_shortcuts();
6203
8052
  init_registry();
6204
8053
  init_agent();
@@ -6846,13 +8695,17 @@ ${endPrefix}`
6846
8695
  * }
6847
8696
  * ```
6848
8697
  */
6849
- ask(userPrompt) {
8698
+ /**
8699
+ * Build AgentOptions with the given user prompt.
8700
+ * Centralizes options construction for ask(), askWithImage(), and askWithContent().
8701
+ */
8702
+ buildAgentOptions(userPrompt) {
6850
8703
  if (!this.client) {
6851
8704
  const { LLMist: LLMistClass } = (init_client(), __toCommonJS(client_exports));
6852
8705
  this.client = new LLMistClass();
6853
8706
  }
6854
8707
  const registry = GadgetRegistry.from(this.gadgets);
6855
- const options = {
8708
+ return {
6856
8709
  client: this.client,
6857
8710
  model: this.model ?? "openai:gpt-5-nano",
6858
8711
  systemPrompt: this.systemPrompt,
@@ -6878,6 +8731,83 @@ ${endPrefix}`
6878
8731
  compactionConfig: this.compactionConfig,
6879
8732
  signal: this.signal
6880
8733
  };
8734
+ }
8735
+ ask(userPrompt) {
8736
+ const options = this.buildAgentOptions(userPrompt);
8737
+ return new Agent(AGENT_INTERNAL_KEY, options);
8738
+ }
8739
+ /**
8740
+ * Build and create the agent with a multimodal user prompt (text + image).
8741
+ * Returns the Agent instance ready to run.
8742
+ *
8743
+ * @param textPrompt - Text prompt describing what to do with the image
8744
+ * @param imageData - Image data (Buffer, Uint8Array, or base64 string)
8745
+ * @param mimeType - Optional MIME type (auto-detected if not provided)
8746
+ * @returns Configured Agent instance
8747
+ *
8748
+ * @example
8749
+ * ```typescript
8750
+ * const agent = LLMist.createAgent()
8751
+ * .withModel("gpt-4o")
8752
+ * .withSystem("You analyze images")
8753
+ * .askWithImage(
8754
+ * "What's in this image?",
8755
+ * await fs.readFile("photo.jpg")
8756
+ * );
8757
+ *
8758
+ * for await (const event of agent.run()) {
8759
+ * // handle events
8760
+ * }
8761
+ * ```
8762
+ */
8763
+ askWithImage(textPrompt, imageData, mimeType) {
8764
+ const imageBuffer = typeof imageData === "string" ? Buffer.from(imageData, "base64") : imageData;
8765
+ const detectedMime = mimeType ?? detectImageMimeType(imageBuffer);
8766
+ if (!detectedMime) {
8767
+ throw new Error(
8768
+ "Could not detect image MIME type. Please provide the mimeType parameter explicitly."
8769
+ );
8770
+ }
8771
+ const userContent = [
8772
+ text(textPrompt),
8773
+ {
8774
+ type: "image",
8775
+ source: {
8776
+ type: "base64",
8777
+ mediaType: detectedMime,
8778
+ data: toBase64(imageBuffer)
8779
+ }
8780
+ }
8781
+ ];
8782
+ const options = this.buildAgentOptions(userContent);
8783
+ return new Agent(AGENT_INTERNAL_KEY, options);
8784
+ }
8785
+ /**
8786
+ * Build and return an Agent configured with multimodal content.
8787
+ * More flexible than askWithImage - accepts any combination of content parts.
8788
+ *
8789
+ * @param content - Array of content parts (text, images, audio)
8790
+ * @returns A configured Agent ready for execution
8791
+ *
8792
+ * @example
8793
+ * ```typescript
8794
+ * import { text, imageFromBuffer, audioFromBuffer } from "llmist";
8795
+ *
8796
+ * const agent = LLMist.createAgent()
8797
+ * .withModel("gemini:gemini-2.5-flash")
8798
+ * .askWithContent([
8799
+ * text("Describe this image and transcribe the audio:"),
8800
+ * imageFromBuffer(imageData),
8801
+ * audioFromBuffer(audioData),
8802
+ * ]);
8803
+ *
8804
+ * for await (const event of agent.run()) {
8805
+ * // handle events
8806
+ * }
8807
+ * ```
8808
+ */
8809
+ askWithContent(content) {
8810
+ const options = this.buildAgentOptions(content);
6881
8811
  return new Agent(AGENT_INTERNAL_KEY, options);
6882
8812
  }
6883
8813
  /**
@@ -6995,7 +8925,10 @@ var COMMANDS = {
6995
8925
  complete: "complete",
6996
8926
  agent: "agent",
6997
8927
  models: "models",
6998
- gadget: "gadget"
8928
+ gadget: "gadget",
8929
+ image: "image",
8930
+ speech: "speech",
8931
+ vision: "vision"
6999
8932
  };
7000
8933
  var LOG_LEVELS = ["silly", "trace", "debug", "info", "warn", "error", "fatal"];
7001
8934
  var DEFAULT_MODEL = "openai:gpt-5-nano";
@@ -7016,7 +8949,20 @@ var OPTION_FLAGS = {
7016
8949
  docker: "--docker",
7017
8950
  dockerRo: "--docker-ro",
7018
8951
  noDocker: "--no-docker",
7019
- dockerDev: "--docker-dev"
8952
+ dockerDev: "--docker-dev",
8953
+ // Multimodal input options
8954
+ inputImage: "--image <path>",
8955
+ inputAudio: "--audio <path>",
8956
+ // Image generation options
8957
+ imageSize: "--size <size>",
8958
+ imageQuality: "--quality <quality>",
8959
+ imageCount: "-n, --count <number>",
8960
+ imageOutput: "-o, --output <path>",
8961
+ // Speech generation options
8962
+ voice: "--voice <name>",
8963
+ speechFormat: "--format <format>",
8964
+ speechSpeed: "--speed <value>",
8965
+ speechOutput: "-o, --output <path>"
7020
8966
  };
7021
8967
  var OPTION_DESCRIPTIONS = {
7022
8968
  model: "Model identifier, e.g. openai:gpt-5-nano or anthropic:claude-sonnet-4-5.",
@@ -7032,10 +8978,23 @@ var OPTION_DESCRIPTIONS = {
7032
8978
  noBuiltins: "Disable built-in gadgets (AskUser, TellUser).",
7033
8979
  noBuiltinInteraction: "Disable interactive gadgets (AskUser) while keeping TellUser.",
7034
8980
  quiet: "Suppress all output except content (text and TellUser messages).",
8981
+ // Multimodal input descriptions
8982
+ inputImage: "Image file to include with the prompt (vision models).",
8983
+ inputAudio: "Audio file to include with the prompt (Gemini only).",
7035
8984
  docker: "Run agent in a Docker sandbox container for security isolation.",
7036
8985
  dockerRo: "Run in Docker with current directory mounted read-only.",
7037
8986
  noDocker: "Disable Docker sandboxing (override config).",
7038
- dockerDev: "Run in Docker dev mode (mount local source instead of npm install)."
8987
+ dockerDev: "Run in Docker dev mode (mount local source instead of npm install).",
8988
+ // Image generation descriptions
8989
+ imageSize: "Image size/aspect ratio, e.g. '1024x1024', '1:1', '16:9'.",
8990
+ imageQuality: "Image quality: 'standard', 'hd', 'low', 'medium', 'high'.",
8991
+ imageCount: "Number of images to generate (model dependent, usually 1-4).",
8992
+ imageOutput: "Output path for the generated image. Defaults to stdout if not specified.",
8993
+ // Speech generation descriptions
8994
+ voice: "Voice name for speech generation, e.g. 'nova', 'alloy', 'Zephyr'.",
8995
+ speechFormat: "Audio format: 'mp3', 'opus', 'aac', 'flac', 'wav', 'pcm'.",
8996
+ speechSpeed: "Speech speed multiplier (0.25 to 4.0, default 1.0).",
8997
+ speechOutput: "Output path for audio file. Defaults to stdout if not specified."
7039
8998
  };
7040
8999
  var SUMMARY_PREFIX = "[llmist]";
7041
9000
 
@@ -7045,7 +9004,7 @@ var import_commander2 = require("commander");
7045
9004
  // package.json
7046
9005
  var package_default = {
7047
9006
  name: "llmist",
7048
- version: "2.2.0",
9007
+ version: "2.5.0",
7049
9008
  description: "TypeScript LLM client with streaming tool execution. Tools fire mid-stream. Built-in function calling works with any model\u2014no structured outputs or native tool support required.",
7050
9009
  type: "module",
7051
9010
  main: "dist/index.cjs",
@@ -7167,7 +9126,7 @@ var package_default = {
7167
9126
  };
7168
9127
 
7169
9128
  // src/cli/agent-command.ts
7170
- var import_promises3 = require("readline/promises");
9129
+ var import_promises4 = require("readline/promises");
7171
9130
  var import_chalk5 = __toESM(require("chalk"), 1);
7172
9131
  init_builder();
7173
9132
 
@@ -7185,6 +9144,7 @@ function isAbortError(error) {
7185
9144
  }
7186
9145
 
7187
9146
  // src/cli/agent-command.ts
9147
+ init_input_content();
7188
9148
  init_registry();
7189
9149
  init_constants2();
7190
9150
 
@@ -7509,15 +9469,84 @@ var finish = createGadget({
7509
9469
  });
7510
9470
  var builtinGadgets = [askUser, tellUser, finish];
7511
9471
 
9472
+ // src/cli/file-utils.ts
9473
+ var import_promises2 = require("fs/promises");
9474
+ var import_node_path3 = require("path");
9475
+ init_input_content();
9476
+ var DEFAULT_MAX_FILE_SIZE = 50 * 1024 * 1024;
9477
+ function formatFileSize(bytes) {
9478
+ if (bytes < 1024) return `${bytes} bytes`;
9479
+ if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)} KB`;
9480
+ if (bytes < 1024 * 1024 * 1024) return `${(bytes / (1024 * 1024)).toFixed(1)} MB`;
9481
+ return `${(bytes / (1024 * 1024 * 1024)).toFixed(1)} GB`;
9482
+ }
9483
+ async function checkFileSize(absolutePath, filePath, maxSize) {
9484
+ const stats = await (0, import_promises2.stat)(absolutePath);
9485
+ if (stats.size > maxSize) {
9486
+ throw new Error(
9487
+ `File "${filePath}" is too large (${formatFileSize(stats.size)}). Maximum allowed size is ${formatFileSize(maxSize)}. Consider compressing the file or using a smaller version.`
9488
+ );
9489
+ }
9490
+ }
9491
+ async function readImageFile(filePath, options = {}) {
9492
+ const absolutePath = (0, import_node_path3.resolve)(filePath);
9493
+ const maxFileSize = options.maxFileSize ?? DEFAULT_MAX_FILE_SIZE;
9494
+ let buffer;
9495
+ try {
9496
+ await checkFileSize(absolutePath, filePath, maxFileSize);
9497
+ buffer = await (0, import_promises2.readFile)(absolutePath);
9498
+ } catch (error) {
9499
+ const message = error instanceof Error ? error.message : String(error);
9500
+ throw new Error(`Failed to read image file "${filePath}": ${message}`);
9501
+ }
9502
+ const mimeType = detectImageMimeType(buffer);
9503
+ if (!mimeType) {
9504
+ throw new Error(
9505
+ `File "${filePath}" is not a supported image format. Supported formats: JPEG, PNG, GIF, WebP`
9506
+ );
9507
+ }
9508
+ return imageFromBuffer(buffer, mimeType);
9509
+ }
9510
+ async function readAudioFile(filePath, options = {}) {
9511
+ const absolutePath = (0, import_node_path3.resolve)(filePath);
9512
+ const maxFileSize = options.maxFileSize ?? DEFAULT_MAX_FILE_SIZE;
9513
+ let buffer;
9514
+ try {
9515
+ await checkFileSize(absolutePath, filePath, maxFileSize);
9516
+ buffer = await (0, import_promises2.readFile)(absolutePath);
9517
+ } catch (error) {
9518
+ const message = error instanceof Error ? error.message : String(error);
9519
+ throw new Error(`Failed to read audio file "${filePath}": ${message}`);
9520
+ }
9521
+ const mimeType = detectAudioMimeType(buffer);
9522
+ if (!mimeType) {
9523
+ throw new Error(
9524
+ `File "${filePath}" is not a supported audio format. Supported formats: MP3, WAV, OGG, WebM`
9525
+ );
9526
+ }
9527
+ return audioFromBuffer(buffer, mimeType);
9528
+ }
9529
+ async function readFileBuffer(filePath, options = {}) {
9530
+ const absolutePath = (0, import_node_path3.resolve)(filePath);
9531
+ const maxFileSize = options.maxFileSize ?? DEFAULT_MAX_FILE_SIZE;
9532
+ try {
9533
+ await checkFileSize(absolutePath, filePath, maxFileSize);
9534
+ return await (0, import_promises2.readFile)(absolutePath);
9535
+ } catch (error) {
9536
+ const message = error instanceof Error ? error.message : String(error);
9537
+ throw new Error(`Failed to read file "${filePath}": ${message}`);
9538
+ }
9539
+ }
9540
+
7512
9541
  // src/cli/gadgets.ts
7513
9542
  var import_node_fs7 = __toESM(require("fs"), 1);
7514
- var import_node_path6 = __toESM(require("path"), 1);
9543
+ var import_node_path7 = __toESM(require("path"), 1);
7515
9544
  var import_node_url = require("url");
7516
9545
  init_gadget();
7517
9546
 
7518
9547
  // src/cli/builtins/filesystem/list-directory.ts
7519
9548
  var import_node_fs4 = __toESM(require("fs"), 1);
7520
- var import_node_path4 = __toESM(require("path"), 1);
9549
+ var import_node_path5 = __toESM(require("path"), 1);
7521
9550
  var import_zod4 = require("zod");
7522
9551
 
7523
9552
  // src/index.ts
@@ -7541,6 +9570,7 @@ init_prompt_config();
7541
9570
 
7542
9571
  // src/index.ts
7543
9572
  init_client();
9573
+ init_input_content();
7544
9574
  init_messages();
7545
9575
  init_model_registry();
7546
9576
  init_model_shortcuts();
@@ -7571,6 +9601,10 @@ init_logger();
7571
9601
  // src/testing/mock-stream.ts
7572
9602
  init_constants();
7573
9603
 
9604
+ // src/testing/mock-builder.ts
9605
+ init_input_content();
9606
+ init_messages();
9607
+
7574
9608
  // src/testing/mock-client.ts
7575
9609
  init_client();
7576
9610
 
@@ -7582,7 +9616,7 @@ var import_node_stream = require("stream");
7582
9616
 
7583
9617
  // src/cli/builtins/filesystem/utils.ts
7584
9618
  var import_node_fs3 = __toESM(require("fs"), 1);
7585
- var import_node_path3 = __toESM(require("path"), 1);
9619
+ var import_node_path4 = __toESM(require("path"), 1);
7586
9620
  var PathSandboxException = class extends Error {
7587
9621
  constructor(inputPath, reason) {
7588
9622
  super(`Path access denied: ${inputPath}. ${reason}`);
@@ -7591,7 +9625,7 @@ var PathSandboxException = class extends Error {
7591
9625
  };
7592
9626
  function validatePathIsWithinCwd(inputPath) {
7593
9627
  const cwd = process.cwd();
7594
- const resolvedPath = import_node_path3.default.resolve(cwd, inputPath);
9628
+ const resolvedPath = import_node_path4.default.resolve(cwd, inputPath);
7595
9629
  let finalPath;
7596
9630
  try {
7597
9631
  finalPath = import_node_fs3.default.realpathSync(resolvedPath);
@@ -7603,7 +9637,7 @@ function validatePathIsWithinCwd(inputPath) {
7603
9637
  throw error;
7604
9638
  }
7605
9639
  }
7606
- const cwdWithSep = cwd + import_node_path3.default.sep;
9640
+ const cwdWithSep = cwd + import_node_path4.default.sep;
7607
9641
  if (!finalPath.startsWith(cwdWithSep) && finalPath !== cwd) {
7608
9642
  throw new PathSandboxException(inputPath, "Path is outside the current working directory");
7609
9643
  }
@@ -7616,8 +9650,8 @@ function listFiles(dirPath, basePath = dirPath, maxDepth = 1, currentDepth = 1)
7616
9650
  try {
7617
9651
  const items = import_node_fs4.default.readdirSync(dirPath);
7618
9652
  for (const item of items) {
7619
- const fullPath = import_node_path4.default.join(dirPath, item);
7620
- const relativePath = import_node_path4.default.relative(basePath, fullPath);
9653
+ const fullPath = import_node_path5.default.join(dirPath, item);
9654
+ const relativePath = import_node_path5.default.relative(basePath, fullPath);
7621
9655
  try {
7622
9656
  const stats = import_node_fs4.default.lstatSync(fullPath);
7623
9657
  let type;
@@ -7732,7 +9766,7 @@ ${formattedList}`;
7732
9766
  // src/cli/builtins/filesystem/read-file.ts
7733
9767
  var import_node_fs5 = __toESM(require("fs"), 1);
7734
9768
  var import_zod5 = require("zod");
7735
- var readFile = createGadget({
9769
+ var readFile2 = createGadget({
7736
9770
  name: "ReadFile",
7737
9771
  description: "Read the entire content of a file and return it as text. The file path must be within the current working directory or its subdirectories.",
7738
9772
  schema: import_zod5.z.object({
@@ -7761,7 +9795,7 @@ ${content}`;
7761
9795
 
7762
9796
  // src/cli/builtins/filesystem/write-file.ts
7763
9797
  var import_node_fs6 = __toESM(require("fs"), 1);
7764
- var import_node_path5 = __toESM(require("path"), 1);
9798
+ var import_node_path6 = __toESM(require("path"), 1);
7765
9799
  var import_zod6 = require("zod");
7766
9800
  var writeFile = createGadget({
7767
9801
  name: "WriteFile",
@@ -7796,7 +9830,7 @@ console.log(\`Server running on http://localhost:\${port}\`);`
7796
9830
  ],
7797
9831
  execute: ({ filePath, content }) => {
7798
9832
  const validatedPath = validatePathIsWithinCwd(filePath);
7799
- const parentDir = import_node_path5.default.dirname(validatedPath);
9833
+ const parentDir = import_node_path6.default.dirname(validatedPath);
7800
9834
  let createdDir = false;
7801
9835
  if (!import_node_fs6.default.existsSync(parentDir)) {
7802
9836
  validatePathIsWithinCwd(parentDir);
@@ -7805,7 +9839,7 @@ console.log(\`Server running on http://localhost:\${port}\`);`
7805
9839
  }
7806
9840
  import_node_fs6.default.writeFileSync(validatedPath, content, "utf-8");
7807
9841
  const bytesWritten = Buffer.byteLength(content, "utf-8");
7808
- const dirNote = createdDir ? ` (created directory: ${import_node_path5.default.dirname(filePath)})` : "";
9842
+ const dirNote = createdDir ? ` (created directory: ${import_node_path6.default.dirname(filePath)})` : "";
7809
9843
  return `path=${filePath}
7810
9844
 
7811
9845
  Wrote ${bytesWritten} bytes${dirNote}`;
@@ -8003,7 +10037,7 @@ error: ${message}`;
8003
10037
  // src/cli/builtins/index.ts
8004
10038
  var builtinGadgetRegistry = {
8005
10039
  ListDirectory: listDirectory,
8006
- ReadFile: readFile,
10040
+ ReadFile: readFile2,
8007
10041
  WriteFile: writeFile,
8008
10042
  EditFile: editFile,
8009
10043
  RunCommand: runCommand
@@ -8040,10 +10074,10 @@ function expandHomePath(input) {
8040
10074
  if (!home) {
8041
10075
  return input;
8042
10076
  }
8043
- return import_node_path6.default.join(home, input.slice(1));
10077
+ return import_node_path7.default.join(home, input.slice(1));
8044
10078
  }
8045
10079
  function isFileLikeSpecifier(specifier) {
8046
- return PATH_PREFIXES.some((prefix) => specifier.startsWith(prefix)) || specifier.includes(import_node_path6.default.sep);
10080
+ return PATH_PREFIXES.some((prefix) => specifier.startsWith(prefix)) || specifier.includes(import_node_path7.default.sep);
8047
10081
  }
8048
10082
  function tryResolveBuiltin(specifier) {
8049
10083
  if (specifier.startsWith(BUILTIN_PREFIX)) {
@@ -8066,7 +10100,7 @@ function resolveGadgetSpecifier(specifier, cwd) {
8066
10100
  return specifier;
8067
10101
  }
8068
10102
  const expanded = expandHomePath(specifier);
8069
- const resolvedPath = import_node_path6.default.resolve(cwd, expanded);
10103
+ const resolvedPath = import_node_path7.default.resolve(cwd, expanded);
8070
10104
  if (!import_node_fs7.default.existsSync(resolvedPath)) {
8071
10105
  throw new Error(`Gadget module not found at ${resolvedPath}`);
8072
10106
  }
@@ -8138,13 +10172,14 @@ async function loadGadgets(specifiers, cwd, importer = (specifier) => import(spe
8138
10172
  }
8139
10173
 
8140
10174
  // src/cli/llm-logging.ts
8141
- var import_promises2 = require("fs/promises");
10175
+ var import_promises3 = require("fs/promises");
8142
10176
  var import_node_os = require("os");
8143
- var import_node_path7 = require("path");
8144
- var DEFAULT_LLM_LOG_DIR = (0, import_node_path7.join)((0, import_node_os.homedir)(), ".llmist", "logs");
10177
+ var import_node_path8 = require("path");
10178
+ init_messages();
10179
+ var DEFAULT_LLM_LOG_DIR = (0, import_node_path8.join)((0, import_node_os.homedir)(), ".llmist", "logs");
8145
10180
  function resolveLogDir(option, subdir) {
8146
10181
  if (option === true) {
8147
- return (0, import_node_path7.join)(DEFAULT_LLM_LOG_DIR, subdir);
10182
+ return (0, import_node_path8.join)(DEFAULT_LLM_LOG_DIR, subdir);
8148
10183
  }
8149
10184
  if (typeof option === "string") {
8150
10185
  return option;
@@ -8155,14 +10190,14 @@ function formatLlmRequest(messages) {
8155
10190
  const lines = [];
8156
10191
  for (const msg of messages) {
8157
10192
  lines.push(`=== ${msg.role.toUpperCase()} ===`);
8158
- lines.push(msg.content ?? "");
10193
+ lines.push(msg.content ? extractText(msg.content) : "");
8159
10194
  lines.push("");
8160
10195
  }
8161
10196
  return lines.join("\n");
8162
10197
  }
8163
10198
  async function writeLogFile(dir, filename, content) {
8164
- await (0, import_promises2.mkdir)(dir, { recursive: true });
8165
- await (0, import_promises2.writeFile)((0, import_node_path7.join)(dir, filename), content, "utf-8");
10199
+ await (0, import_promises3.mkdir)(dir, { recursive: true });
10200
+ await (0, import_promises3.writeFile)((0, import_node_path8.join)(dir, filename), content, "utf-8");
8166
10201
  }
8167
10202
  function formatSessionTimestamp(date = /* @__PURE__ */ new Date()) {
8168
10203
  const pad = (n) => n.toString().padStart(2, "0");
@@ -8176,9 +10211,9 @@ function formatSessionTimestamp(date = /* @__PURE__ */ new Date()) {
8176
10211
  }
8177
10212
  async function createSessionDir(baseDir) {
8178
10213
  const timestamp = formatSessionTimestamp();
8179
- const sessionDir = (0, import_node_path7.join)(baseDir, timestamp);
10214
+ const sessionDir = (0, import_node_path8.join)(baseDir, timestamp);
8180
10215
  try {
8181
- await (0, import_promises2.mkdir)(sessionDir, { recursive: true });
10216
+ await (0, import_promises3.mkdir)(sessionDir, { recursive: true });
8182
10217
  return sessionDir;
8183
10218
  } catch (error) {
8184
10219
  console.warn(`[llmist] Failed to create log session directory: ${sessionDir}`, error);
@@ -8229,9 +10264,9 @@ function ensureMarkedConfigured() {
8229
10264
  markedConfigured = true;
8230
10265
  }
8231
10266
  }
8232
- function renderMarkdown(text) {
10267
+ function renderMarkdown(text3) {
8233
10268
  ensureMarkedConfigured();
8234
- let rendered = import_marked.marked.parse(text);
10269
+ let rendered = import_marked.marked.parse(text3);
8235
10270
  rendered = rendered.replace(/\*\*(.+?)\*\*/g, (_, content) => import_chalk3.default.bold(content)).replace(/(?<!\*)\*(\S[^*]*)\*(?!\*)/g, (_, content) => import_chalk3.default.italic(content));
8236
10271
  return rendered.trimEnd();
8237
10272
  }
@@ -8245,8 +10280,8 @@ function createRainbowSeparator() {
8245
10280
  }
8246
10281
  return result;
8247
10282
  }
8248
- function renderMarkdownWithSeparators(text) {
8249
- const rendered = renderMarkdown(text);
10283
+ function renderMarkdownWithSeparators(text3) {
10284
+ const rendered = renderMarkdown(text3);
8250
10285
  const separator = createRainbowSeparator();
8251
10286
  return `
8252
10287
  ${separator}
@@ -8414,12 +10449,12 @@ var StreamPrinter = class {
8414
10449
  *
8415
10450
  * @param text - Text to write
8416
10451
  */
8417
- write(text) {
8418
- if (!text) {
10452
+ write(text3) {
10453
+ if (!text3) {
8419
10454
  return;
8420
10455
  }
8421
- this.target.write(text);
8422
- this.endedWithNewline = text.endsWith("\n");
10456
+ this.target.write(text3);
10457
+ this.endedWithNewline = text3.endsWith("\n");
8423
10458
  }
8424
10459
  /**
8425
10460
  * Ensures output ends with a newline by writing one if needed.
@@ -8898,7 +10933,7 @@ function addCompleteOptions(cmd, defaults) {
8898
10933
  OPTION_DESCRIPTIONS.maxTokens,
8899
10934
  createNumericParser({ label: "Max tokens", integer: true, min: 1 }),
8900
10935
  defaults?.["max-tokens"]
8901
- ).option(OPTION_FLAGS.quiet, OPTION_DESCRIPTIONS.quiet, defaults?.quiet).option(OPTION_FLAGS.logLlmRequests, OPTION_DESCRIPTIONS.logLlmRequests, defaults?.["log-llm-requests"]);
10936
+ ).option(OPTION_FLAGS.quiet, OPTION_DESCRIPTIONS.quiet, defaults?.quiet).option(OPTION_FLAGS.logLlmRequests, OPTION_DESCRIPTIONS.logLlmRequests, defaults?.["log-llm-requests"]).option(OPTION_FLAGS.inputImage, OPTION_DESCRIPTIONS.inputImage).option(OPTION_FLAGS.inputAudio, OPTION_DESCRIPTIONS.inputAudio);
8902
10937
  }
8903
10938
  function addAgentOptions(cmd, defaults) {
8904
10939
  const gadgetAccumulator = (value, previous = []) => [
@@ -8922,7 +10957,7 @@ function addAgentOptions(cmd, defaults) {
8922
10957
  OPTION_FLAGS.noBuiltinInteraction,
8923
10958
  OPTION_DESCRIPTIONS.noBuiltinInteraction,
8924
10959
  defaults?.["builtin-interaction"] !== false
8925
- ).option(OPTION_FLAGS.quiet, OPTION_DESCRIPTIONS.quiet, defaults?.quiet).option(OPTION_FLAGS.logLlmRequests, OPTION_DESCRIPTIONS.logLlmRequests, defaults?.["log-llm-requests"]).option(OPTION_FLAGS.docker, OPTION_DESCRIPTIONS.docker).option(OPTION_FLAGS.dockerRo, OPTION_DESCRIPTIONS.dockerRo).option(OPTION_FLAGS.noDocker, OPTION_DESCRIPTIONS.noDocker).option(OPTION_FLAGS.dockerDev, OPTION_DESCRIPTIONS.dockerDev);
10960
+ ).option(OPTION_FLAGS.quiet, OPTION_DESCRIPTIONS.quiet, defaults?.quiet).option(OPTION_FLAGS.logLlmRequests, OPTION_DESCRIPTIONS.logLlmRequests, defaults?.["log-llm-requests"]).option(OPTION_FLAGS.inputImage, OPTION_DESCRIPTIONS.inputImage).option(OPTION_FLAGS.inputAudio, OPTION_DESCRIPTIONS.inputAudio).option(OPTION_FLAGS.docker, OPTION_DESCRIPTIONS.docker).option(OPTION_FLAGS.dockerRo, OPTION_DESCRIPTIONS.dockerRo).option(OPTION_FLAGS.noDocker, OPTION_DESCRIPTIONS.noDocker).option(OPTION_FLAGS.dockerDev, OPTION_DESCRIPTIONS.dockerDev);
8926
10961
  }
8927
10962
  function configToCompleteOptions(config) {
8928
10963
  const result = {};
@@ -8989,7 +11024,7 @@ var DEV_SOURCE_MOUNT_TARGET = "/llmist-src";
8989
11024
  // src/cli/config.ts
8990
11025
  var import_node_fs8 = require("fs");
8991
11026
  var import_node_os2 = require("os");
8992
- var import_node_path8 = require("path");
11027
+ var import_node_path9 = require("path");
8993
11028
  var import_js_toml = require("js-toml");
8994
11029
 
8995
11030
  // src/cli/templates.ts
@@ -9127,6 +11162,22 @@ var AGENT_CONFIG_KEYS = /* @__PURE__ */ new Set([
9127
11162
  "docker-cwd-permission"
9128
11163
  // Override CWD mount permission for this profile
9129
11164
  ]);
11165
+ var IMAGE_CONFIG_KEYS = /* @__PURE__ */ new Set([
11166
+ "model",
11167
+ "size",
11168
+ "quality",
11169
+ "count",
11170
+ "output",
11171
+ "quiet"
11172
+ ]);
11173
+ var SPEECH_CONFIG_KEYS = /* @__PURE__ */ new Set([
11174
+ "model",
11175
+ "voice",
11176
+ "format",
11177
+ "speed",
11178
+ "output",
11179
+ "quiet"
11180
+ ]);
9130
11181
  var CUSTOM_CONFIG_KEYS = /* @__PURE__ */ new Set([
9131
11182
  ...COMPLETE_CONFIG_KEYS,
9132
11183
  ...AGENT_CONFIG_KEYS,
@@ -9134,7 +11185,7 @@ var CUSTOM_CONFIG_KEYS = /* @__PURE__ */ new Set([
9134
11185
  "description"
9135
11186
  ]);
9136
11187
  function getConfigPath() {
9137
- return (0, import_node_path8.join)((0, import_node_os2.homedir)(), ".llmist", "cli.toml");
11188
+ return (0, import_node_path9.join)((0, import_node_os2.homedir)(), ".llmist", "cli.toml");
9138
11189
  }
9139
11190
  var ConfigError = class extends Error {
9140
11191
  constructor(message, path5) {
@@ -9387,6 +11438,75 @@ function validateAgentConfig(raw, section) {
9387
11438
  }
9388
11439
  return result;
9389
11440
  }
11441
+ function validateImageConfig(raw, section) {
11442
+ if (typeof raw !== "object" || raw === null) {
11443
+ throw new ConfigError(`[${section}] must be a table`);
11444
+ }
11445
+ const rawObj = raw;
11446
+ for (const key of Object.keys(rawObj)) {
11447
+ if (!IMAGE_CONFIG_KEYS.has(key)) {
11448
+ throw new ConfigError(`[${section}].${key} is not a valid option`);
11449
+ }
11450
+ }
11451
+ const result = {};
11452
+ if ("model" in rawObj) {
11453
+ result.model = validateString(rawObj.model, "model", section);
11454
+ }
11455
+ if ("size" in rawObj) {
11456
+ result.size = validateString(rawObj.size, "size", section);
11457
+ }
11458
+ if ("quality" in rawObj) {
11459
+ result.quality = validateString(rawObj.quality, "quality", section);
11460
+ }
11461
+ if ("count" in rawObj) {
11462
+ result.count = validateNumber(rawObj.count, "count", section, {
11463
+ integer: true,
11464
+ min: 1,
11465
+ max: 10
11466
+ });
11467
+ }
11468
+ if ("output" in rawObj) {
11469
+ result.output = validateString(rawObj.output, "output", section);
11470
+ }
11471
+ if ("quiet" in rawObj) {
11472
+ result.quiet = validateBoolean(rawObj.quiet, "quiet", section);
11473
+ }
11474
+ return result;
11475
+ }
11476
+ function validateSpeechConfig(raw, section) {
11477
+ if (typeof raw !== "object" || raw === null) {
11478
+ throw new ConfigError(`[${section}] must be a table`);
11479
+ }
11480
+ const rawObj = raw;
11481
+ for (const key of Object.keys(rawObj)) {
11482
+ if (!SPEECH_CONFIG_KEYS.has(key)) {
11483
+ throw new ConfigError(`[${section}].${key} is not a valid option`);
11484
+ }
11485
+ }
11486
+ const result = {};
11487
+ if ("model" in rawObj) {
11488
+ result.model = validateString(rawObj.model, "model", section);
11489
+ }
11490
+ if ("voice" in rawObj) {
11491
+ result.voice = validateString(rawObj.voice, "voice", section);
11492
+ }
11493
+ if ("format" in rawObj) {
11494
+ result.format = validateString(rawObj.format, "format", section);
11495
+ }
11496
+ if ("speed" in rawObj) {
11497
+ result.speed = validateNumber(rawObj.speed, "speed", section, {
11498
+ min: 0.25,
11499
+ max: 4
11500
+ });
11501
+ }
11502
+ if ("output" in rawObj) {
11503
+ result.output = validateString(rawObj.output, "output", section);
11504
+ }
11505
+ if ("quiet" in rawObj) {
11506
+ result.quiet = validateBoolean(rawObj.quiet, "quiet", section);
11507
+ }
11508
+ return result;
11509
+ }
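The two validators above define what the new [image] and [speech] tables in ~/.llmist/cli.toml may contain. A minimal sketch of objects that would pass them is below; the key names and numeric ranges (count: integer 1-10, speed: 0.25-4) come from this hunk, while the concrete values (size, quality, format, output paths) are illustrative assumptions, not values documented in this diff.

    // Sketch only: plain objects shaped like the parsed [image] and [speech]
    // tables that validateImageConfig/validateSpeechConfig would accept.
    const imageSection = {
      model: "dall-e-3",   // default image model used elsewhere in this diff
      size: "1024x1024",   // assumed value; validated only as a string here
      quality: "hd",       // assumed value; validated only as a string here
      count: 2,            // must be an integer between 1 and 10
      output: "out.png",   // assumed path
      quiet: false,
    };
    const speechSection = {
      model: "tts-1",      // default speech model used elsewhere in this diff
      voice: "nova",       // default voice used elsewhere in this diff
      format: "mp3",       // assumed value; validated only as a string here
      speed: 1.5,          // must be between 0.25 and 4
      output: "out.mp3",   // assumed path
      quiet: false,
    };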
9390
11510
  function validateStringOrBoolean(value, field, section) {
9391
11511
  if (typeof value === "string" || typeof value === "boolean") {
9392
11512
  return value;
@@ -9509,6 +11629,10 @@ function validateConfig(raw, configPath) {
9509
11629
  result.complete = validateCompleteConfig(value, key);
9510
11630
  } else if (key === "agent") {
9511
11631
  result.agent = validateAgentConfig(value, key);
11632
+ } else if (key === "image") {
11633
+ result.image = validateImageConfig(value, key);
11634
+ } else if (key === "speech") {
11635
+ result.speech = validateSpeechConfig(value, key);
9512
11636
  } else if (key === "prompts") {
9513
11637
  result.prompts = validatePromptsConfig(value, key);
9514
11638
  } else if (key === "docker") {
@@ -9553,7 +11677,7 @@ function loadConfig() {
9553
11677
  return resolveTemplatesInConfig(inherited, configPath);
9554
11678
  }
9555
11679
  function getCustomCommandNames(config) {
9556
- const reserved = /* @__PURE__ */ new Set(["global", "complete", "agent", "prompts", "docker"]);
11680
+ const reserved = /* @__PURE__ */ new Set(["global", "complete", "agent", "image", "speech", "prompts", "docker"]);
9557
11681
  return Object.keys(config).filter((key) => !reserved.has(key));
9558
11682
  }
9559
11683
  function resolveTemplatesInConfig(config, configPath) {
@@ -9908,8 +12032,8 @@ function computeDockerfileHash(dockerfile) {
9908
12032
  // src/cli/docker/image-manager.ts
9909
12033
  var import_node_fs9 = require("fs");
9910
12034
  var import_node_os3 = require("os");
9911
- var import_node_path9 = require("path");
9912
- var CACHE_DIR = (0, import_node_path9.join)((0, import_node_os3.homedir)(), ".llmist", "docker-cache");
12035
+ var import_node_path10 = require("path");
12036
+ var CACHE_DIR = (0, import_node_path10.join)((0, import_node_os3.homedir)(), ".llmist", "docker-cache");
9913
12037
  var HASH_FILE = "image-hash.json";
9914
12038
  function ensureCacheDir() {
9915
12039
  if (!(0, import_node_fs9.existsSync)(CACHE_DIR)) {
@@ -9917,7 +12041,7 @@ function ensureCacheDir() {
9917
12041
  }
9918
12042
  }
9919
12043
  function getCachedHash(imageName) {
9920
- const hashPath = (0, import_node_path9.join)(CACHE_DIR, HASH_FILE);
12044
+ const hashPath = (0, import_node_path10.join)(CACHE_DIR, HASH_FILE);
9921
12045
  if (!(0, import_node_fs9.existsSync)(hashPath)) {
9922
12046
  return void 0;
9923
12047
  }
@@ -9931,7 +12055,7 @@ function getCachedHash(imageName) {
9931
12055
  }
9932
12056
  function setCachedHash(imageName, hash) {
9933
12057
  ensureCacheDir();
9934
- const hashPath = (0, import_node_path9.join)(CACHE_DIR, HASH_FILE);
12058
+ const hashPath = (0, import_node_path10.join)(CACHE_DIR, HASH_FILE);
9935
12059
  let cache = {};
9936
12060
  if ((0, import_node_fs9.existsSync)(hashPath)) {
9937
12061
  try {
@@ -9957,7 +12081,7 @@ var DockerBuildError = class extends Error {
9957
12081
  };
9958
12082
  async function buildImage(imageName, dockerfile) {
9959
12083
  ensureCacheDir();
9960
- const dockerfilePath = (0, import_node_path9.join)(CACHE_DIR, "Dockerfile");
12084
+ const dockerfilePath = (0, import_node_path10.join)(CACHE_DIR, "Dockerfile");
9961
12085
  (0, import_node_fs9.writeFileSync)(dockerfilePath, dockerfile);
9962
12086
  const proc = Bun.spawn(
9963
12087
  ["docker", "build", "-t", imageName, "-f", dockerfilePath, CACHE_DIR],
@@ -9992,7 +12116,7 @@ async function ensureImage(imageName = DEFAULT_IMAGE_NAME, dockerfile) {
9992
12116
 
9993
12117
  // src/cli/docker/docker-wrapper.ts
9994
12118
  var import_node_fs10 = require("fs");
9995
- var import_node_path10 = require("path");
12119
+ var import_node_path11 = require("path");
9996
12120
  var import_node_os4 = require("os");
9997
12121
  var DockerUnavailableError = class extends Error {
9998
12122
  constructor() {
@@ -10038,9 +12162,9 @@ function autoDetectDevSource() {
10038
12162
  if (!scriptPath || !scriptPath.endsWith("src/cli.ts")) {
10039
12163
  return void 0;
10040
12164
  }
10041
- const srcDir = (0, import_node_path10.dirname)(scriptPath);
10042
- const projectDir = (0, import_node_path10.dirname)(srcDir);
10043
- const packageJsonPath = (0, import_node_path10.join)(projectDir, "package.json");
12165
+ const srcDir = (0, import_node_path11.dirname)(scriptPath);
12166
+ const projectDir = (0, import_node_path11.dirname)(srcDir);
12167
+ const packageJsonPath = (0, import_node_path11.join)(projectDir, "package.json");
10044
12168
  if (!(0, import_node_fs10.existsSync)(packageJsonPath)) {
10045
12169
  return void 0;
10046
12170
  }
@@ -10189,7 +12313,7 @@ function createHumanInputHandler(env, progress, keyboard) {
10189
12313
  keyboard.cleanupEsc();
10190
12314
  keyboard.cleanupEsc = null;
10191
12315
  }
10192
- const rl = (0, import_promises3.createInterface)({ input: env.stdin, output: env.stdout });
12316
+ const rl = (0, import_promises4.createInterface)({ input: env.stdin, output: env.stdout });
10193
12317
  try {
10194
12318
  const questionLine = question.trim() ? `
10195
12319
  ${renderMarkdownWithSeparators(question.trim())}` : "";
@@ -10547,8 +12671,8 @@ Denied: ${result.reason ?? "by user"}`
10547
12671
  builder.withTextOnlyHandler("acknowledge");
10548
12672
  builder.withTextWithGadgetsHandler({
10549
12673
  gadgetName: "TellUser",
10550
- parameterMapping: (text) => ({ message: text, done: false, type: "info" }),
10551
- resultMapping: (text) => `\u2139\uFE0F ${text}`
12674
+ parameterMapping: (text3) => ({ message: text3, done: false, type: "info" }),
12675
+ resultMapping: (text3) => `\u2139\uFE0F ${text3}`
10552
12676
  });
10553
12677
  builder.withTrailingMessage(
10554
12678
  (ctx) => [
@@ -10557,7 +12681,19 @@ Denied: ${result.reason ?? "by user"}`
10557
12681
  "Maximize efficiency by batching independent operations in a single response."
10558
12682
  ].join(" ")
10559
12683
  );
10560
- const agent = builder.ask(prompt);
12684
+ let agent;
12685
+ if (options.image || options.audio) {
12686
+ const parts = [text(prompt)];
12687
+ if (options.image) {
12688
+ parts.push(await readImageFile(options.image));
12689
+ }
12690
+ if (options.audio) {
12691
+ parts.push(await readAudioFile(options.audio));
12692
+ }
12693
+ agent = builder.askWithContent(parts);
12694
+ } else {
12695
+ agent = builder.ask(prompt);
12696
+ }
10561
12697
  let textBuffer = "";
10562
12698
  const flushTextBuffer = () => {
10563
12699
  if (textBuffer) {
@@ -10632,6 +12768,7 @@ function registerAgentCommand(program, env, config) {
10632
12768
  }
10633
12769
 
10634
12770
  // src/cli/complete-command.ts
12771
+ init_input_content();
10635
12772
  init_messages();
10636
12773
  init_model_shortcuts();
10637
12774
  init_constants2();
@@ -10643,7 +12780,18 @@ async function executeComplete(promptArg, options, env) {
10643
12780
  if (options.system) {
10644
12781
  builder.addSystem(options.system);
10645
12782
  }
10646
- builder.addUser(prompt);
12783
+ if (options.image || options.audio) {
12784
+ const parts = [text(prompt)];
12785
+ if (options.image) {
12786
+ parts.push(await readImageFile(options.image));
12787
+ }
12788
+ if (options.audio) {
12789
+ parts.push(await readAudioFile(options.audio));
12790
+ }
12791
+ builder.addUserMultimodal(parts);
12792
+ } else {
12793
+ builder.addUser(prompt);
12794
+ }
10647
12795
  const messages = builder.build();
10648
12796
  const llmLogsBaseDir = resolveLogDir(options.logLlmRequests, "requests");
10649
12797
  let llmSessionDir;
@@ -10718,7 +12866,7 @@ init_schema_to_json();
10718
12866
  init_schema_validator();
10719
12867
 
10720
12868
  // src/cli/gadget-prompts.ts
10721
- var import_promises4 = require("readline/promises");
12869
+ var import_promises5 = require("readline/promises");
10722
12870
  var import_chalk6 = __toESM(require("chalk"), 1);
10723
12871
  init_schema_to_json();
10724
12872
  async function promptForParameters(schema, ctx) {
@@ -10729,7 +12877,7 @@ async function promptForParameters(schema, ctx) {
10729
12877
  if (!jsonSchema.properties || Object.keys(jsonSchema.properties).length === 0) {
10730
12878
  return {};
10731
12879
  }
10732
- const rl = (0, import_promises4.createInterface)({ input: ctx.stdin, output: ctx.stdout });
12880
+ const rl = (0, import_promises5.createInterface)({ input: ctx.stdin, output: ctx.stdout });
10733
12881
  const params = {};
10734
12882
  try {
10735
12883
  for (const [key, prop] of Object.entries(jsonSchema.properties)) {
@@ -11148,19 +13296,118 @@ function registerGadgetCommand(program, env) {
11148
13296
  );
11149
13297
  }
11150
13298
 
13299
+ // src/cli/image-command.ts
13300
+ var import_node_fs11 = require("fs");
13301
+ var DEFAULT_IMAGE_MODEL = "dall-e-3";
13302
+ async function executeImage(promptArg, options, env) {
13303
+ const prompt = await resolvePrompt(promptArg, env);
13304
+ const client = env.createClient();
13305
+ const model = options.model;
13306
+ const n = options.count ? Number.parseInt(options.count, 10) : 1;
13307
+ const stderrTTY = env.stderr.isTTY === true;
13308
+ if (!options.quiet && stderrTTY) {
13309
+ env.stderr.write(`${SUMMARY_PREFIX} Generating image with ${model}...
13310
+ `);
13311
+ }
13312
+ const result = await client.image.generate({
13313
+ model,
13314
+ prompt,
13315
+ size: options.size,
13316
+ quality: options.quality,
13317
+ n,
13318
+ responseFormat: options.output ? "b64_json" : "url"
13319
+ });
13320
+ if (options.output) {
13321
+ const imageData = result.images[0];
13322
+ if (imageData.b64Json) {
13323
+ const buffer = Buffer.from(imageData.b64Json, "base64");
13324
+ (0, import_node_fs11.writeFileSync)(options.output, buffer);
13325
+ if (!options.quiet) {
13326
+ env.stderr.write(`${SUMMARY_PREFIX} Image saved to ${options.output}
13327
+ `);
13328
+ }
13329
+ } else if (imageData.url) {
13330
+ env.stdout.write(`${imageData.url}
13331
+ `);
13332
+ }
13333
+ } else {
13334
+ for (const image of result.images) {
13335
+ if (image.url) {
13336
+ env.stdout.write(`${image.url}
13337
+ `);
13338
+ } else if (image.b64Json) {
13339
+ env.stdout.write(image.b64Json);
13340
+ }
13341
+ }
13342
+ }
13343
+ if (!options.quiet && stderrTTY) {
13344
+ const parts = [
13345
+ `${result.images.length} image(s)`,
13346
+ `size: ${result.usage.size}`,
13347
+ `quality: ${result.usage.quality}`
13348
+ ];
13349
+ if (result.cost !== void 0) {
13350
+ parts.push(`cost: ${formatCost(result.cost)}`);
13351
+ }
13352
+ env.stderr.write(`${SUMMARY_PREFIX} ${parts.join(" | ")}
13353
+ `);
13354
+ }
13355
+ }
13356
+ function registerImageCommand(program, env, config) {
13357
+ program.command(COMMANDS.image).description("Generate images from a text prompt.").argument("[prompt]", "Image generation prompt. If omitted, stdin is used when available.").option(
13358
+ OPTION_FLAGS.model,
13359
+ OPTION_DESCRIPTIONS.model,
13360
+ config?.model ?? DEFAULT_IMAGE_MODEL
13361
+ ).option(OPTION_FLAGS.imageSize, OPTION_DESCRIPTIONS.imageSize, config?.size).option(OPTION_FLAGS.imageQuality, OPTION_DESCRIPTIONS.imageQuality, config?.quality).option(OPTION_FLAGS.imageCount, OPTION_DESCRIPTIONS.imageCount, config?.count?.toString()).option(OPTION_FLAGS.imageOutput, OPTION_DESCRIPTIONS.imageOutput, config?.output).option(OPTION_FLAGS.quiet, OPTION_DESCRIPTIONS.quiet, config?.quiet ?? false).action(
13362
+ (prompt, options) => executeAction(() => executeImage(prompt, options, env), env)
13363
+ );
13364
+ }
13365
+
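Taken together, executeImage above boils down to one client.image.generate call followed by either a file write (base64 response) or printing the returned URL. A minimal standalone sketch of that flow follows; the request and response field names are the ones used in this hunk, while the client construction, the prompt, the size string, and the output filename are assumptions for illustration.

    // Sketch only: assumes `client` is an already-constructed llmist client
    // exposing the same image.generate API that the CLI code above calls.
    import { writeFileSync } from "node:fs";

    async function imageExample(client: { image: { generate: (req: object) => Promise<any> } }) {
      const result = await client.image.generate({
        model: "dall-e-3",              // default of the new `image` command
        prompt: "a lighthouse at dusk", // illustrative prompt
        size: "1024x1024",              // assumed size string; supported sizes are model-specific
        n: 1,
        responseFormat: "b64_json",     // "b64_json" when saving to a file, "url" otherwise
      });
      const first = result.images[0];
      if (first?.b64Json) {
        writeFileSync("lighthouse.png", Buffer.from(first.b64Json, "base64"));
      } else if (first?.url) {
        console.log(first.url);
      }
    }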
11151
13366
  // src/cli/models-command.ts
11152
13367
  var import_chalk8 = __toESM(require("chalk"), 1);
11153
13368
  init_model_shortcuts();
11154
13369
  async function handleModelsCommand(options, env) {
11155
13370
  const client = env.createClient();
11156
- const models = client.modelRegistry.listModels(options.provider);
13371
+ const showText = options.all || options.text || !options.image && !options.speech;
13372
+ const showImage = options.all || options.image;
13373
+ const showSpeech = options.all || options.speech;
13374
+ const textModels = showText ? client.modelRegistry.listModels(options.provider) : [];
13375
+ const imageModels = showImage ? client.image.listModels().filter((m) => !options.provider || m.provider === options.provider) : [];
13376
+ const speechModels = showSpeech ? client.speech.listModels().filter((m) => !options.provider || m.provider === options.provider) : [];
11157
13377
  if (options.format === "json") {
11158
- renderJSON(models, env.stdout);
13378
+ renderJSON(textModels, imageModels, speechModels, env.stdout);
11159
13379
  } else {
11160
- renderTable(models, options.verbose || false, env.stdout);
13380
+ renderAllTables(textModels, imageModels, speechModels, options.verbose || false, env.stdout);
13381
+ }
13382
+ }
13383
+ function renderAllTables(textModels, imageModels, speechModels, verbose, stream2) {
13384
+ const hasAnyModels = textModels.length > 0 || imageModels.length > 0 || speechModels.length > 0;
13385
+ if (!hasAnyModels) {
13386
+ stream2.write(import_chalk8.default.yellow("\nNo models found matching the specified criteria.\n\n"));
13387
+ return;
13388
+ }
13389
+ stream2.write(import_chalk8.default.bold.cyan("\nAvailable Models\n"));
13390
+ stream2.write(import_chalk8.default.cyan("=".repeat(80)) + "\n\n");
13391
+ if (textModels.length > 0) {
13392
+ renderTextTable(textModels, verbose, stream2);
13393
+ }
13394
+ if (imageModels.length > 0) {
13395
+ renderImageTable(imageModels, verbose, stream2);
13396
+ }
13397
+ if (speechModels.length > 0) {
13398
+ renderSpeechTable(speechModels, verbose, stream2);
13399
+ }
13400
+ if (textModels.length > 0) {
13401
+ stream2.write(import_chalk8.default.bold.magenta("Model Shortcuts\n"));
13402
+ stream2.write(import_chalk8.default.dim("\u2500".repeat(80)) + "\n");
13403
+ const shortcuts = Object.entries(MODEL_ALIASES).sort((a, b) => a[0].localeCompare(b[0]));
13404
+ for (const [shortcut, fullName] of shortcuts) {
13405
+ stream2.write(import_chalk8.default.cyan(` ${shortcut.padEnd(15)}`) + import_chalk8.default.dim(" \u2192 ") + import_chalk8.default.white(fullName) + "\n");
13406
+ }
13407
+ stream2.write("\n");
11161
13408
  }
11162
13409
  }
11163
- function renderTable(models, verbose, stream2) {
13410
+ function renderTextTable(models, verbose, stream2) {
11164
13411
  const grouped = /* @__PURE__ */ new Map();
11165
13412
  for (const model of models) {
11166
13413
  const provider = model.provider;
@@ -11169,13 +13416,13 @@ function renderTable(models, verbose, stream2) {
11169
13416
  }
11170
13417
  grouped.get(provider).push(model);
11171
13418
  }
11172
- stream2.write(import_chalk8.default.bold.cyan("\nAvailable Models\n"));
11173
- stream2.write(import_chalk8.default.cyan("=".repeat(80)) + "\n\n");
13419
+ stream2.write(import_chalk8.default.bold.blue("\u{1F4DD} Text/LLM Models\n"));
13420
+ stream2.write(import_chalk8.default.dim("\u2500".repeat(80)) + "\n\n");
11174
13421
  const providers = Array.from(grouped.keys()).sort();
11175
13422
  for (const provider of providers) {
11176
13423
  const providerModels = grouped.get(provider);
11177
13424
  const providerName = provider.charAt(0).toUpperCase() + provider.slice(1);
11178
- stream2.write(import_chalk8.default.bold.yellow(`${providerName} Models
13425
+ stream2.write(import_chalk8.default.bold.yellow(`${providerName}
11179
13426
  `));
11180
13427
  if (verbose) {
11181
13428
  renderVerboseTable(providerModels, stream2);
@@ -11184,13 +13431,6 @@ function renderTable(models, verbose, stream2) {
11184
13431
  }
11185
13432
  stream2.write("\n");
11186
13433
  }
11187
- stream2.write(import_chalk8.default.bold.magenta("Model Shortcuts\n"));
11188
- stream2.write(import_chalk8.default.dim("\u2500".repeat(80)) + "\n");
11189
- const shortcuts = Object.entries(MODEL_ALIASES).sort((a, b) => a[0].localeCompare(b[0]));
11190
- for (const [shortcut, fullName] of shortcuts) {
11191
- stream2.write(import_chalk8.default.cyan(` ${shortcut.padEnd(15)}`) + import_chalk8.default.dim(" \u2192 ") + import_chalk8.default.white(fullName) + "\n");
11192
- }
11193
- stream2.write("\n");
11194
13434
  }
11195
13435
  function renderCompactTable(models, stream2) {
11196
13436
  const idWidth = 25;
@@ -11267,9 +13507,171 @@ function renderVerboseTable(models, stream2) {
11267
13507
  }
11268
13508
  stream2.write("\n");
11269
13509
  }
11270
- function renderJSON(models, stream2) {
11271
- const output = {
11272
- models: models.map((model) => ({
13510
+ function renderImageTable(models, verbose, stream2) {
13511
+ stream2.write(import_chalk8.default.bold.green("\u{1F3A8} Image Generation Models\n"));
13512
+ stream2.write(import_chalk8.default.dim("\u2500".repeat(80)) + "\n\n");
13513
+ const grouped = /* @__PURE__ */ new Map();
13514
+ for (const model of models) {
13515
+ if (!grouped.has(model.provider)) {
13516
+ grouped.set(model.provider, []);
13517
+ }
13518
+ grouped.get(model.provider).push(model);
13519
+ }
13520
+ for (const [provider, providerModels] of Array.from(grouped.entries()).sort()) {
13521
+ const providerName = provider.charAt(0).toUpperCase() + provider.slice(1);
13522
+ stream2.write(import_chalk8.default.bold.yellow(`${providerName}
13523
+ `));
13524
+ if (verbose) {
13525
+ for (const model of providerModels) {
13526
+ stream2.write(import_chalk8.default.bold.green(`
13527
+ ${model.modelId}
13528
+ `));
13529
+ stream2.write(import_chalk8.default.dim(" " + "\u2500".repeat(60)) + "\n");
13530
+ stream2.write(` ${import_chalk8.default.dim("Name:")} ${import_chalk8.default.white(model.displayName)}
13531
+ `);
13532
+ stream2.write(` ${import_chalk8.default.dim("Sizes:")} ${import_chalk8.default.yellow(model.supportedSizes.join(", "))}
13533
+ `);
13534
+ if (model.supportedQualities) {
13535
+ stream2.write(` ${import_chalk8.default.dim("Qualities:")} ${import_chalk8.default.yellow(model.supportedQualities.join(", "))}
13536
+ `);
13537
+ }
13538
+ stream2.write(` ${import_chalk8.default.dim("Max Images:")} ${import_chalk8.default.yellow(model.maxImages.toString())}
13539
+ `);
13540
+ stream2.write(` ${import_chalk8.default.dim("Pricing:")} ${import_chalk8.default.cyan(formatImagePrice(model))}
13541
+ `);
13542
+ if (model.features) {
13543
+ const features = [];
13544
+ if (model.features.textRendering) features.push("text-rendering");
13545
+ if (model.features.transparency) features.push("transparency");
13546
+ if (model.features.conversational) features.push("conversational");
13547
+ if (features.length > 0) {
13548
+ stream2.write(` ${import_chalk8.default.dim("Features:")} ${import_chalk8.default.blue(features.join(", "))}
13549
+ `);
13550
+ }
13551
+ }
13552
+ }
13553
+ } else {
13554
+ const idWidth = 32;
13555
+ const nameWidth = 25;
13556
+ const sizesWidth = 20;
13557
+ const priceWidth = 15;
13558
+ stream2.write(import_chalk8.default.dim("\u2500".repeat(idWidth + nameWidth + sizesWidth + priceWidth + 6)) + "\n");
13559
+ stream2.write(
13560
+ import_chalk8.default.bold(
13561
+ "Model ID".padEnd(idWidth) + " " + "Display Name".padEnd(nameWidth) + " " + "Sizes".padEnd(sizesWidth) + " " + "Price".padEnd(priceWidth)
13562
+ ) + "\n"
13563
+ );
13564
+ stream2.write(import_chalk8.default.dim("\u2500".repeat(idWidth + nameWidth + sizesWidth + priceWidth + 6)) + "\n");
13565
+ for (const model of providerModels) {
13566
+ const sizes = model.supportedSizes.length > 2 ? model.supportedSizes.slice(0, 2).join(", ") + "..." : model.supportedSizes.join(", ");
13567
+ stream2.write(
13568
+ import_chalk8.default.green(model.modelId.padEnd(idWidth)) + " " + import_chalk8.default.white(model.displayName.substring(0, nameWidth - 1).padEnd(nameWidth)) + " " + import_chalk8.default.yellow(sizes.padEnd(sizesWidth)) + " " + import_chalk8.default.cyan(formatImagePrice(model).padEnd(priceWidth)) + "\n"
13569
+ );
13570
+ }
13571
+ stream2.write(import_chalk8.default.dim("\u2500".repeat(idWidth + nameWidth + sizesWidth + priceWidth + 6)) + "\n");
13572
+ }
13573
+ stream2.write("\n");
13574
+ }
13575
+ }
13576
+ function renderSpeechTable(models, verbose, stream2) {
13577
+ stream2.write(import_chalk8.default.bold.magenta("\u{1F3A4} Speech (TTS) Models\n"));
13578
+ stream2.write(import_chalk8.default.dim("\u2500".repeat(80)) + "\n\n");
13579
+ const grouped = /* @__PURE__ */ new Map();
13580
+ for (const model of models) {
13581
+ if (!grouped.has(model.provider)) {
13582
+ grouped.set(model.provider, []);
13583
+ }
13584
+ grouped.get(model.provider).push(model);
13585
+ }
13586
+ for (const [provider, providerModels] of Array.from(grouped.entries()).sort()) {
13587
+ const providerName = provider.charAt(0).toUpperCase() + provider.slice(1);
13588
+ stream2.write(import_chalk8.default.bold.yellow(`${providerName}
13589
+ `));
13590
+ if (verbose) {
13591
+ for (const model of providerModels) {
13592
+ stream2.write(import_chalk8.default.bold.green(`
13593
+ ${model.modelId}
13594
+ `));
13595
+ stream2.write(import_chalk8.default.dim(" " + "\u2500".repeat(60)) + "\n");
13596
+ stream2.write(` ${import_chalk8.default.dim("Name:")} ${import_chalk8.default.white(model.displayName)}
13597
+ `);
13598
+ stream2.write(` ${import_chalk8.default.dim("Voices:")} ${import_chalk8.default.yellow(model.voices.length.toString())} voices
13599
+ `);
13600
+ if (model.voices.length <= 6) {
13601
+ stream2.write(` ${import_chalk8.default.dim(model.voices.join(", "))}
13602
+ `);
13603
+ } else {
13604
+ stream2.write(` ${import_chalk8.default.dim(model.voices.slice(0, 6).join(", ") + "...")}
13605
+ `);
13606
+ }
13607
+ stream2.write(` ${import_chalk8.default.dim("Formats:")} ${import_chalk8.default.yellow(model.formats.join(", "))}
13608
+ `);
13609
+ stream2.write(` ${import_chalk8.default.dim("Max Input:")} ${import_chalk8.default.yellow(model.maxInputLength.toString())} chars
13610
+ `);
13611
+ stream2.write(` ${import_chalk8.default.dim("Pricing:")} ${import_chalk8.default.cyan(formatSpeechPrice(model))}
13612
+ `);
13613
+ if (model.features) {
13614
+ const features = [];
13615
+ if (model.features.multiSpeaker) features.push("multi-speaker");
13616
+ if (model.features.voiceInstructions) features.push("voice-instructions");
13617
+ if (model.features.languages) features.push(`${model.features.languages} languages`);
13618
+ if (features.length > 0) {
13619
+ stream2.write(` ${import_chalk8.default.dim("Features:")} ${import_chalk8.default.blue(features.join(", "))}
13620
+ `);
13621
+ }
13622
+ }
13623
+ }
13624
+ } else {
13625
+ const idWidth = 30;
13626
+ const nameWidth = 28;
13627
+ const voicesWidth = 12;
13628
+ const priceWidth = 18;
13629
+ stream2.write(import_chalk8.default.dim("\u2500".repeat(idWidth + nameWidth + voicesWidth + priceWidth + 6)) + "\n");
13630
+ stream2.write(
13631
+ import_chalk8.default.bold(
13632
+ "Model ID".padEnd(idWidth) + " " + "Display Name".padEnd(nameWidth) + " " + "Voices".padEnd(voicesWidth) + " " + "Price".padEnd(priceWidth)
13633
+ ) + "\n"
13634
+ );
13635
+ stream2.write(import_chalk8.default.dim("\u2500".repeat(idWidth + nameWidth + voicesWidth + priceWidth + 6)) + "\n");
13636
+ for (const model of providerModels) {
13637
+ stream2.write(
13638
+ import_chalk8.default.green(model.modelId.padEnd(idWidth)) + " " + import_chalk8.default.white(model.displayName.substring(0, nameWidth - 1).padEnd(nameWidth)) + " " + import_chalk8.default.yellow(`${model.voices.length} voices`.padEnd(voicesWidth)) + " " + import_chalk8.default.cyan(formatSpeechPrice(model).padEnd(priceWidth)) + "\n"
13639
+ );
13640
+ }
13641
+ stream2.write(import_chalk8.default.dim("\u2500".repeat(idWidth + nameWidth + voicesWidth + priceWidth + 6)) + "\n");
13642
+ }
13643
+ stream2.write("\n");
13644
+ }
13645
+ }
13646
+ function formatImagePrice(model) {
13647
+ if (model.pricing.perImage !== void 0) {
13648
+ return `$${model.pricing.perImage.toFixed(2)}/img`;
13649
+ }
13650
+ if (model.pricing.bySize) {
13651
+ const prices = Object.values(model.pricing.bySize);
13652
+ const minPrice = Math.min(...prices.flatMap((p) => typeof p === "number" ? [p] : Object.values(p)));
13653
+ const maxPrice = Math.max(...prices.flatMap((p) => typeof p === "number" ? [p] : Object.values(p)));
13654
+ if (minPrice === maxPrice) {
13655
+ return `$${minPrice.toFixed(2)}/img`;
13656
+ }
13657
+ return `$${minPrice.toFixed(2)}-${maxPrice.toFixed(2)}`;
13658
+ }
13659
+ return "varies";
13660
+ }
13661
+ function formatSpeechPrice(model) {
13662
+ if (model.pricing.perCharacter !== void 0) {
13663
+ const perMillion = model.pricing.perCharacter * 1e6;
13664
+ return `$${perMillion.toFixed(0)}/1M chars`;
13665
+ }
13666
+ if (model.pricing.perMinute !== void 0) {
13667
+ return `~$${model.pricing.perMinute.toFixed(2)}/min`;
13668
+ }
13669
+ return "varies";
13670
+ }
13671
+ function renderJSON(textModels, imageModels, speechModels, stream2) {
13672
+ const output = {};
13673
+ if (textModels.length > 0) {
13674
+ output.textModels = textModels.map((model) => ({
11273
13675
  provider: model.provider,
11274
13676
  modelId: model.modelId,
11275
13677
  displayName: model.displayName,
@@ -11285,9 +13687,33 @@ function renderJSON(models, stream2) {
11285
13687
  knowledgeCutoff: model.knowledgeCutoff,
11286
13688
  features: model.features,
11287
13689
  metadata: model.metadata
11288
- })),
11289
- shortcuts: MODEL_ALIASES
11290
- };
13690
+ }));
13691
+ output.shortcuts = MODEL_ALIASES;
13692
+ }
13693
+ if (imageModels.length > 0) {
13694
+ output.imageModels = imageModels.map((model) => ({
13695
+ provider: model.provider,
13696
+ modelId: model.modelId,
13697
+ displayName: model.displayName,
13698
+ supportedSizes: model.supportedSizes,
13699
+ supportedQualities: model.supportedQualities,
13700
+ maxImages: model.maxImages,
13701
+ pricing: model.pricing,
13702
+ features: model.features
13703
+ }));
13704
+ }
13705
+ if (speechModels.length > 0) {
13706
+ output.speechModels = speechModels.map((model) => ({
13707
+ provider: model.provider,
13708
+ modelId: model.modelId,
13709
+ displayName: model.displayName,
13710
+ voices: model.voices,
13711
+ formats: model.formats,
13712
+ maxInputLength: model.maxInputLength,
13713
+ pricing: model.pricing,
13714
+ features: model.features
13715
+ }));
13716
+ }
11291
13717
  stream2.write(JSON.stringify(output, null, 2) + "\n");
11292
13718
  }
11293
13719
  function formatTokens2(count) {
@@ -11300,7 +13726,7 @@ function formatTokens2(count) {
11300
13726
  }
11301
13727
  }
11302
13728
  function registerModelsCommand(program, env) {
11303
- program.command(COMMANDS.models).description("List all available LLM models with pricing and capabilities.").option("--provider <name>", "Filter by provider (openai, anthropic, gemini)").option("--format <format>", "Output format: table or json", "table").option("--verbose", "Show detailed model information", false).action(
13729
+ program.command(COMMANDS.models).description("List available models with pricing and capabilities.").option("--provider <name>", "Filter by provider (openai, anthropic, gemini)").option("--format <format>", "Output format: table or json", "table").option("--verbose", "Show detailed model information", false).option("--text", "Show text/LLM models (default if no type specified)").option("--image", "Show image generation models").option("--speech", "Show speech/TTS models").option("--all", "Show all model types (text, image, speech)").action(
11304
13730
  (options) => executeAction(
11305
13731
  () => handleModelsCommand(options, env),
11306
13732
  env
@@ -11308,6 +13734,96 @@ function registerModelsCommand(program, env) {
11308
13734
  );
11309
13735
  }
11310
13736
 
13737
+ // src/cli/speech-command.ts
13738
+ var import_node_fs12 = require("fs");
13739
+ var DEFAULT_SPEECH_MODEL = "tts-1";
13740
+ var DEFAULT_VOICE = "nova";
13741
+ async function executeSpeech(textArg, options, env) {
13742
+ const text3 = await resolvePrompt(textArg, env);
13743
+ const client = env.createClient();
13744
+ const model = options.model;
13745
+ const voice = options.voice ?? DEFAULT_VOICE;
13746
+ const speed = options.speed ? Number.parseFloat(options.speed) : void 0;
13747
+ const stderrTTY = env.stderr.isTTY === true;
13748
+ if (!options.quiet && stderrTTY) {
13749
+ env.stderr.write(`${SUMMARY_PREFIX} Generating speech with ${model} (voice: ${voice})...
13750
+ `);
13751
+ }
13752
+ const result = await client.speech.generate({
13753
+ model,
13754
+ input: text3,
13755
+ voice,
13756
+ responseFormat: options.format,
13757
+ speed
13758
+ });
13759
+ const audioBuffer = Buffer.from(result.audio);
13760
+ if (options.output) {
13761
+ (0, import_node_fs12.writeFileSync)(options.output, audioBuffer);
13762
+ if (!options.quiet) {
13763
+ env.stderr.write(`${SUMMARY_PREFIX} Audio saved to ${options.output}
13764
+ `);
13765
+ }
13766
+ } else {
13767
+ env.stdout.write(audioBuffer);
13768
+ }
13769
+ if (!options.quiet && stderrTTY) {
13770
+ const parts = [
13771
+ `${result.usage.characterCount} characters`,
13772
+ `format: ${result.format}`
13773
+ ];
13774
+ if (result.cost !== void 0) {
13775
+ parts.push(`cost: ${formatCost(result.cost)}`);
13776
+ }
13777
+ env.stderr.write(`${SUMMARY_PREFIX} ${parts.join(" | ")}
13778
+ `);
13779
+ }
13780
+ }
13781
+ function registerSpeechCommand(program, env, config) {
13782
+ program.command(COMMANDS.speech).description("Generate speech audio from text.").argument("[text]", "Text to convert to speech. If omitted, stdin is used when available.").option(
13783
+ OPTION_FLAGS.model,
13784
+ OPTION_DESCRIPTIONS.model,
13785
+ config?.model ?? DEFAULT_SPEECH_MODEL
13786
+ ).option(OPTION_FLAGS.voice, OPTION_DESCRIPTIONS.voice, config?.voice ?? DEFAULT_VOICE).option(OPTION_FLAGS.speechFormat, OPTION_DESCRIPTIONS.speechFormat, config?.format).option(OPTION_FLAGS.speechSpeed, OPTION_DESCRIPTIONS.speechSpeed, config?.speed?.toString()).option(OPTION_FLAGS.speechOutput, OPTION_DESCRIPTIONS.speechOutput, config?.output).option(OPTION_FLAGS.quiet, OPTION_DESCRIPTIONS.quiet, config?.quiet ?? false).action(
13787
+ (text3, options) => executeAction(() => executeSpeech(text3, options, env), env)
13788
+ );
13789
+ }
13790
+
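executeSpeech above follows the same pattern for text-to-speech: one client.speech.generate call, after which the returned audio bytes are either written to the output path or streamed to stdout. A minimal sketch under the same assumptions (pre-built client, illustrative values; the accepted format strings are not listed in this diff):

    // Sketch only: mirrors the speech.generate call made by the new `speech` command.
    import { writeFileSync } from "node:fs";

    async function speechExample(client: { speech: { generate: (req: object) => Promise<any> } }) {
      const result = await client.speech.generate({
        model: "tts-1",        // default speech model in this diff
        input: "Hello from llmist.",
        voice: "nova",         // default voice in this diff
        responseFormat: "mp3", // assumed format value
        speed: 1.25,           // the config validator accepts 0.25-4
      });
      writeFileSync("hello.mp3", Buffer.from(result.audio)); // result.audio holds the audio bytes
      console.log(result.usage.characterCount, result.format, result.cost);
    }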
13791
+ // src/cli/vision-command.ts
13792
+ init_model_shortcuts();
13793
+ async function executeVision(imagePath, options, env) {
13794
+ const client = env.createClient();
13795
+ const model = resolveModel(options.model);
13796
+ const imageBuffer = await readFileBuffer(imagePath);
13797
+ const prompt = options.prompt ?? "Describe this image in detail.";
13798
+ const stderrTTY = env.stderr.isTTY === true;
13799
+ if (!options.quiet && stderrTTY) {
13800
+ env.stderr.write(`${SUMMARY_PREFIX} Analyzing image with ${model}...
13801
+ `);
13802
+ }
13803
+ const result = await client.vision.analyze({
13804
+ model,
13805
+ image: imageBuffer,
13806
+ prompt,
13807
+ maxTokens: options.maxTokens
13808
+ });
13809
+ env.stdout.write(result);
13810
+ env.stdout.write("\n");
13811
+ }
13812
+ function registerVisionCommand(program, env) {
13813
+ program.command(COMMANDS.vision ?? "vision").description("Analyze an image using vision-capable models").argument("<image>", "Path to image file to analyze").option(
13814
+ OPTION_FLAGS.model,
13815
+ OPTION_DESCRIPTIONS.model,
13816
+ "gpt-4o"
13817
+ // Default to a vision-capable model
13818
+ ).option("-p, --prompt <prompt>", "Analysis prompt describing what to extract or describe").option(
13819
+ OPTION_FLAGS.maxTokens,
13820
+ OPTION_DESCRIPTIONS.maxTokens,
13821
+ createNumericParser({ label: "Max tokens", integer: true, min: 1 })
13822
+ ).option(OPTION_FLAGS.quiet, OPTION_DESCRIPTIONS.quiet).action(
13823
+ (imagePath, options) => executeAction(() => executeVision(imagePath, options, env), env)
13824
+ );
13825
+ }
13826
+
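The vision command is the thinnest wrapper of the three: executeVision reads the image file, calls client.vision.analyze, and prints the returned string. A sketch with the same caveats (client construction and exact option types are not shown in this diff; the file name and token limit are illustrative):

    // Sketch only: mirrors the vision.analyze call made by the new `vision` command.
    import { readFileSync } from "node:fs";

    async function visionExample(client: { vision: { analyze: (req: object) => Promise<string> } }) {
      const description = await client.vision.analyze({
        model: "gpt-4o",                          // default of the new `vision` command
        image: readFileSync("photo.jpg"),         // image bytes as a Buffer
        prompt: "Describe this image in detail.", // default prompt in this diff
        maxTokens: 512,                           // illustrative; must be an integer >= 1
      });
      console.log(description);
    }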
11311
13827
  // src/cli/environment.ts
11312
13828
  var import_node_readline = __toESM(require("readline"), 1);
11313
13829
  var import_chalk9 = __toESM(require("chalk"), 1);
@@ -11353,7 +13869,7 @@ function createLoggerFactory(config) {
11353
13869
  }
11354
13870
  function createPromptFunction(stdin, stdout) {
11355
13871
  return (question) => {
11356
- return new Promise((resolve2) => {
13872
+ return new Promise((resolve3) => {
11357
13873
  const rl = import_node_readline.default.createInterface({
11358
13874
  input: stdin,
11359
13875
  output: stdout
@@ -11368,7 +13884,7 @@ function createPromptFunction(stdin, stdout) {
11368
13884
  `);
11369
13885
  rl.question(import_chalk9.default.green.bold("You: "), (answer) => {
11370
13886
  rl.close();
11371
- resolve2(answer);
13887
+ resolve3(answer);
11372
13888
  });
11373
13889
  });
11374
13890
  };
@@ -11459,6 +13975,9 @@ function createProgram(env, config) {
11459
13975
  });
11460
13976
  registerCompleteCommand(program, env, config?.complete);
11461
13977
  registerAgentCommand(program, env, config?.agent);
13978
+ registerImageCommand(program, env, config?.image);
13979
+ registerSpeechCommand(program, env, config?.speech);
13980
+ registerVisionCommand(program, env);
11462
13981
  registerModelsCommand(program, env);
11463
13982
  registerGadgetCommand(program, env);
11464
13983
  if (config) {