llmist 2.3.0 → 2.5.0

This diff shows the publicly released contents of these package versions as they appear in the supported public registries. It is provided for informational purposes only.
@@ -124,348 +124,187 @@ var init_constants = __esm({
124
124
  }
125
125
  });
126
126
 
127
- // src/core/model-shortcuts.ts
128
- function isKnownModelPattern(model) {
129
- const normalized = model.toLowerCase();
130
- if (MODEL_ALIASES[normalized]) {
131
- return true;
132
- }
133
- return KNOWN_MODEL_PATTERNS.some((pattern) => pattern.test(model));
127
+ // src/core/input-content.ts
128
+ function isImagePart(part) {
129
+ return part.type === "image";
134
130
  }
135
- function resolveModel(model, options = {}) {
136
- if (model.includes(":")) {
137
- return model;
138
- }
139
- const normalized = model.toLowerCase();
140
- if (MODEL_ALIASES[normalized]) {
141
- return MODEL_ALIASES[normalized];
142
- }
143
- const modelLower = model.toLowerCase();
144
- if (modelLower.startsWith("gpt")) {
145
- return `openai:${model}`;
131
+ function isAudioPart(part) {
132
+ return part.type === "audio";
133
+ }
134
+ function text(content) {
135
+ return { type: "text", text: content };
136
+ }
137
+ function imageFromUrl(url) {
138
+ return {
139
+ type: "image",
140
+ source: { type: "url", url }
141
+ };
142
+ }
143
+ function detectImageMimeType(data) {
144
+ const bytes = data instanceof Buffer ? data : Buffer.from(data);
145
+ for (const { bytes: magic, mimeType } of IMAGE_MAGIC_BYTES) {
146
+ if (bytes.length >= magic.length) {
147
+ let matches = true;
148
+ for (let i = 0; i < magic.length; i++) {
149
+ if (bytes[i] !== magic[i]) {
150
+ matches = false;
151
+ break;
152
+ }
153
+ }
154
+ if (matches) {
155
+ if (mimeType === "image/webp") {
156
+ if (bytes.length >= 12) {
157
+ const webpMarker = bytes[8] === 87 && bytes[9] === 69 && bytes[10] === 66 && bytes[11] === 80;
158
+ if (!webpMarker) continue;
159
+ }
160
+ }
161
+ return mimeType;
162
+ }
163
+ }
146
164
  }
147
- if (modelLower.startsWith("claude")) {
148
- return `anthropic:${model}`;
165
+ return null;
166
+ }
167
+ function detectAudioMimeType(data) {
168
+ const bytes = data instanceof Buffer ? data : Buffer.from(data);
169
+ for (const { bytes: magic, mimeType } of AUDIO_MAGIC_BYTES) {
170
+ if (bytes.length >= magic.length) {
171
+ let matches = true;
172
+ for (let i = 0; i < magic.length; i++) {
173
+ if (bytes[i] !== magic[i]) {
174
+ matches = false;
175
+ break;
176
+ }
177
+ }
178
+ if (matches) {
179
+ if (mimeType === "audio/wav") {
180
+ if (bytes.length >= 12) {
181
+ const waveMarker = bytes[8] === 87 && bytes[9] === 65 && bytes[10] === 86 && bytes[11] === 69;
182
+ if (!waveMarker) continue;
183
+ }
184
+ }
185
+ return mimeType;
186
+ }
187
+ }
149
188
  }
150
- if (modelLower.startsWith("gemini")) {
151
- return `gemini:${model}`;
189
+ return null;
190
+ }
191
+ function toBase64(data) {
192
+ if (typeof data === "string") {
193
+ return data;
152
194
  }
153
- if (modelLower.match(/^o\d/)) {
154
- return `openai:${model}`;
195
+ return Buffer.from(data).toString("base64");
196
+ }
197
+ function audioFromBuffer(buffer, mediaType) {
198
+ const detectedType = mediaType ?? detectAudioMimeType(buffer);
199
+ if (!detectedType) {
200
+ throw new Error(
201
+ "Could not detect audio MIME type. Please provide the mediaType parameter explicitly."
202
+ );
155
203
  }
156
- if (!isKnownModelPattern(model)) {
157
- if (options.strict) {
158
- throw new Error(
159
- `Unknown model '${model}'. Did you mean one of: gpt4, sonnet, haiku, flash? Use explicit provider prefix like 'openai:${model}' to bypass this check.`
160
- );
161
- }
162
- if (!options.silent) {
163
- console.warn(
164
- `\u26A0\uFE0F Unknown model '${model}', falling back to 'openai:${model}'. This might be a typo. Did you mean: gpt4, gpt5, gpt5-nano, sonnet, haiku, flash? Use { strict: true } to error on unknown models, or { silent: true } to suppress this warning.`
165
- );
204
+ return {
205
+ type: "audio",
206
+ source: {
207
+ type: "base64",
208
+ mediaType: detectedType,
209
+ data: toBase64(buffer)
166
210
  }
167
- }
168
- return `openai:${model}`;
211
+ };
169
212
  }
170
- var MODEL_ALIASES, KNOWN_MODEL_PATTERNS;
171
- var init_model_shortcuts = __esm({
172
- "src/core/model-shortcuts.ts"() {
213
+ function isDataUrl(input) {
214
+ return input.startsWith("data:");
215
+ }
216
+ function parseDataUrl(url) {
217
+ const match = url.match(/^data:([^;]+);base64,(.+)$/);
218
+ if (!match) return null;
219
+ return { mimeType: match[1], data: match[2] };
220
+ }
221
+ var IMAGE_MAGIC_BYTES, AUDIO_MAGIC_BYTES;
222
+ var init_input_content = __esm({
223
+ "src/core/input-content.ts"() {
173
224
  "use strict";
174
- MODEL_ALIASES = {
175
- // OpenAI aliases
176
- gpt4: "openai:gpt-4o",
177
- gpt4o: "openai:gpt-4o",
178
- gpt5: "openai:gpt-5",
179
- "gpt5-mini": "openai:gpt-5-mini",
180
- "gpt5-nano": "openai:gpt-5-nano",
181
- // Anthropic aliases
182
- sonnet: "anthropic:claude-sonnet-4-5",
183
- "claude-sonnet": "anthropic:claude-sonnet-4-5",
184
- haiku: "anthropic:claude-haiku-4-5",
185
- "claude-haiku": "anthropic:claude-haiku-4-5",
186
- opus: "anthropic:claude-opus-4-5",
187
- "claude-opus": "anthropic:claude-opus-4-5",
188
- // Gemini aliases
189
- flash: "gemini:gemini-2.0-flash",
190
- "gemini-flash": "gemini:gemini-2.0-flash",
191
- "gemini-pro": "gemini:gemini-2.5-pro",
192
- pro: "gemini:gemini-2.5-pro"
193
- };
194
- KNOWN_MODEL_PATTERNS = [
195
- /^gpt-?\d/i,
196
- // gpt-4, gpt-3.5, gpt4, etc.
197
- /^claude-?\d/i,
198
- // claude-3, claude-2, etc.
199
- /^gemini-?(\d|pro|flash)/i,
200
- // gemini-2.0, gemini-pro, gemini-flash, etc.
201
- /^o\d/i
202
- // OpenAI o1, o3, etc.
225
+ IMAGE_MAGIC_BYTES = [
226
+ { bytes: [255, 216, 255], mimeType: "image/jpeg" },
227
+ { bytes: [137, 80, 78, 71], mimeType: "image/png" },
228
+ { bytes: [71, 73, 70, 56], mimeType: "image/gif" },
229
+ // WebP starts with RIFF....WEBP
230
+ { bytes: [82, 73, 70, 70], mimeType: "image/webp" }
231
+ ];
232
+ AUDIO_MAGIC_BYTES = [
233
+ // MP3 frame sync
234
+ { bytes: [255, 251], mimeType: "audio/mp3" },
235
+ { bytes: [255, 250], mimeType: "audio/mp3" },
236
+ // ID3 tag (MP3)
237
+ { bytes: [73, 68, 51], mimeType: "audio/mp3" },
238
+ // OGG
239
+ { bytes: [79, 103, 103, 83], mimeType: "audio/ogg" },
240
+ // WAV (RIFF)
241
+ { bytes: [82, 73, 70, 70], mimeType: "audio/wav" },
242
+ // WebM
243
+ { bytes: [26, 69, 223, 163], mimeType: "audio/webm" }
203
244
  ];
204
245
  }
205
246
  });
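For reference, a short usage sketch of the content-part helpers added above. The import path and export names are assumptions; only the behaviour follows from the code shown in this hunk.

```typescript
// Sketch only - assumes these helpers are exported from the package entry point.
import { readFile } from "node:fs/promises";
// import { text, imageFromUrl, audioFromBuffer, detectImageMimeType } from "llmist"; // assumed export

const photo = await readFile("photo.png");
detectImageMimeType(photo);              // "image/png" - matched against IMAGE_MAGIC_BYTES
detectImageMimeType(Buffer.from("no"));  // null - too short to match, callers must pass a MIME type

const parts = [
  text("Describe this image and transcribe the clip."),
  imageFromUrl("https://example.com/photo.png"),
  audioFromBuffer(await readFile("clip.mp3")), // MIME auto-detected from ID3 / frame-sync bytes
];
```

Note that RIFF-prefixed data is only reported as `image/webp` (or `audio/wav`) when the `WEBP`/`WAVE` marker at bytes 8-11 also matches, as implemented above.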
206
247
 
207
- // src/gadgets/schema-validator.ts
208
- function validateGadgetSchema(schema, gadgetName) {
209
- let jsonSchema;
210
- try {
211
- jsonSchema = z.toJSONSchema(schema, { target: "draft-7" });
212
- } catch (error) {
213
- const errorMessage = error instanceof Error ? error.message : String(error);
214
- throw new Error(
215
- `Gadget "${gadgetName}" has a schema that cannot be serialized to JSON Schema.
216
- This usually happens with unsupported patterns like:
217
- - z.record() - use z.object({}).passthrough() instead
218
- - Complex transforms or custom refinements
219
- - Circular references
220
-
221
- Original error: ${errorMessage}
222
-
223
- Only use schema patterns that Zod v4's native toJSONSchema() supports.`
224
- );
225
- }
226
- const issues = findUnknownTypes(jsonSchema);
227
- if (issues.length > 0) {
228
- const fieldList = issues.join(", ");
229
- throw new Error(
230
- `Gadget "${gadgetName}" uses z.unknown() which produces incomplete schemas.
231
- Problematic fields: ${fieldList}
232
-
233
- z.unknown() doesn't generate type information in JSON Schema, making it unclear
234
- to the LLM what data structure to provide.
235
-
236
- Suggestions:
237
- - Use z.object({}).passthrough() for flexible objects
238
- - Use z.record(z.string()) for key-value objects with string values
239
- - Define specific structure if possible
240
-
241
- Example fixes:
242
- // \u274C Bad
243
- content: z.unknown()
244
-
245
- // \u2705 Good
246
- content: z.object({}).passthrough() // for flexible objects
247
- content: z.record(z.string()) // for key-value objects
248
- content: z.array(z.string()) // for arrays of strings
249
- `
250
- );
251
- }
252
- }
253
- function findUnknownTypes(schema, path = []) {
254
- const issues = [];
255
- if (!schema || typeof schema !== "object") {
256
- return issues;
257
- }
258
- if (schema.definitions) {
259
- for (const defSchema of Object.values(schema.definitions)) {
260
- issues.push(...findUnknownTypes(defSchema, []));
261
- }
262
- }
263
- if (schema.properties) {
264
- for (const [propName, propSchema] of Object.entries(schema.properties)) {
265
- const propPath = [...path, propName];
266
- if (hasNoType(propSchema)) {
267
- issues.push(propPath.join(".") || propName);
268
- }
269
- issues.push(...findUnknownTypes(propSchema, propPath));
270
- }
271
- }
272
- if (schema.items) {
273
- const itemPath = [...path, "[]"];
274
- if (hasNoType(schema.items)) {
275
- issues.push(itemPath.join("."));
276
- }
277
- issues.push(...findUnknownTypes(schema.items, itemPath));
278
- }
279
- if (schema.anyOf) {
280
- schema.anyOf.forEach((subSchema, index) => {
281
- issues.push(...findUnknownTypes(subSchema, [...path, `anyOf[${index}]`]));
282
- });
283
- }
284
- if (schema.oneOf) {
285
- schema.oneOf.forEach((subSchema, index) => {
286
- issues.push(...findUnknownTypes(subSchema, [...path, `oneOf[${index}]`]));
287
- });
288
- }
289
- if (schema.allOf) {
290
- schema.allOf.forEach((subSchema, index) => {
291
- issues.push(...findUnknownTypes(subSchema, [...path, `allOf[${index}]`]));
292
- });
293
- }
294
- return issues;
248
+ // src/core/prompt-config.ts
249
+ function resolvePromptTemplate(template, defaultValue, context) {
250
+ const resolved = template ?? defaultValue;
251
+ return typeof resolved === "function" ? resolved(context) : resolved;
295
252
  }
296
- function hasNoType(prop) {
297
- if (!prop || typeof prop !== "object") {
298
- return false;
253
+ function resolveRulesTemplate(rules, context) {
254
+ const resolved = rules ?? DEFAULT_PROMPTS.rules;
255
+ if (Array.isArray(resolved)) {
256
+ return resolved;
299
257
  }
300
- const hasType = prop.type !== void 0;
301
- const hasRef = prop.$ref !== void 0;
302
- const hasUnion = prop.anyOf !== void 0 || prop.oneOf !== void 0 || prop.allOf !== void 0;
303
- if (hasType || hasRef || hasUnion) {
304
- return false;
258
+ if (typeof resolved === "function") {
259
+ const result = resolved(context);
260
+ return Array.isArray(result) ? result : [result];
305
261
  }
306
- const keys = Object.keys(prop);
307
- const metadataKeys = ["description", "title", "default", "examples"];
308
- const hasOnlyMetadata = keys.every((key) => metadataKeys.includes(key));
309
- return hasOnlyMetadata || keys.length === 0;
262
+ return [resolved];
310
263
  }
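A minimal sketch of how the two resolvers above behave; the context object is reduced to the single field used by the default templates, and the return values follow directly from the code.

```typescript
// Sketch: prompt templates may be plain values or functions of the prompt context.
const ctx = { argPrefix: "@arg:" }; // reduced context - the real context carries more fields

resolvePromptTemplate(undefined, "fallback", ctx);            // -> "fallback"
resolvePromptTemplate((c) => `Use ${c.argPrefix}`, "", ctx);  // -> "Use @arg:"

resolveRulesTemplate(["one rule"], ctx);          // -> ["one rule"]       (arrays pass through)
resolveRulesTemplate(() => "single rule", ctx);   // -> ["single rule"]    (non-array results are wrapped)
resolveRulesTemplate(undefined, ctx);             // -> DEFAULT_PROMPTS.rules(ctx)
```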
311
- var z;
312
- var init_schema_validator = __esm({
313
- "src/gadgets/schema-validator.ts"() {
264
+ var DEFAULT_PROMPTS;
265
+ var init_prompt_config = __esm({
266
+ "src/core/prompt-config.ts"() {
314
267
  "use strict";
315
- z = __toESM(require("zod"), 1);
268
+ DEFAULT_PROMPTS = {
269
+ mainInstruction: [
270
+ "\u26A0\uFE0F CRITICAL: RESPOND ONLY WITH GADGET INVOCATIONS",
271
+ "DO NOT use function calling or tool calling",
272
+ "You must output the exact text markers shown below in plain text.",
273
+ "EACH MARKER MUST START WITH A NEWLINE."
274
+ ].join("\n"),
275
+ criticalUsage: "INVOKE gadgets using the markers - do not describe what you want to do.",
276
+ formatDescription: (ctx) => `Parameters using ${ctx.argPrefix}name markers (value on next line(s), no escaping needed)`,
277
+ rules: () => [
278
+ "Output ONLY plain text with the exact markers - never use function/tool calling",
279
+ "You can invoke multiple gadgets in a single response",
280
+ "Gadgets without dependencies execute immediately (in parallel if multiple)",
281
+ "Use :invocation_id:dep1,dep2 syntax when a gadget needs results from prior gadgets",
282
+ "If any dependency fails, dependent gadgets are automatically skipped"
283
+ ],
284
+ customExamples: null
285
+ };
316
286
  }
317
287
  });
318
288
 
319
- // src/gadgets/registry.ts
320
- var GadgetRegistry;
321
- var init_registry = __esm({
322
- "src/gadgets/registry.ts"() {
323
- "use strict";
324
- init_schema_validator();
325
- GadgetRegistry = class _GadgetRegistry {
326
- gadgets = /* @__PURE__ */ new Map();
327
- /**
328
- * Creates a registry from an array of gadget classes or instances,
329
- * or an object mapping names to gadgets.
330
- *
331
- * @param gadgets - Array of gadgets/classes or object with custom names
332
- * @returns New GadgetRegistry with all gadgets registered
333
- *
334
- * @example
335
- * ```typescript
336
- * // From array of classes
337
- * const registry = GadgetRegistry.from([Calculator, Weather]);
338
- *
339
- * // From array of instances
340
- * const registry = GadgetRegistry.from([new Calculator(), new Weather()]);
341
- *
342
- * // From object with custom names
343
- * const registry = GadgetRegistry.from({
344
- * calc: Calculator,
345
- * weather: new Weather({ apiKey: "..." })
346
- * });
347
- * ```
348
- */
349
- static from(gadgets) {
350
- const registry = new _GadgetRegistry();
351
- if (Array.isArray(gadgets)) {
352
- registry.registerMany(gadgets);
353
- } else {
354
- for (const [name, gadget] of Object.entries(gadgets)) {
355
- const instance = typeof gadget === "function" ? new gadget() : gadget;
356
- registry.register(name, instance);
357
- }
358
- }
359
- return registry;
360
- }
361
- /**
362
- * Registers multiple gadgets at once from an array.
363
- *
364
- * @param gadgets - Array of gadget instances or classes
365
- * @returns This registry for chaining
366
- *
367
- * @example
368
- * ```typescript
369
- * registry.registerMany([Calculator, Weather, Email]);
370
- * registry.registerMany([new Calculator(), new Weather()]);
371
- * ```
372
- */
373
- registerMany(gadgets) {
374
- for (const gadget of gadgets) {
375
- const instance = typeof gadget === "function" ? new gadget() : gadget;
376
- this.registerByClass(instance);
377
- }
378
- return this;
379
- }
380
- // Register a gadget by name
381
- register(name, gadget) {
382
- const normalizedName = name.toLowerCase();
383
- if (this.gadgets.has(normalizedName)) {
384
- throw new Error(`Gadget '${name}' is already registered`);
385
- }
386
- if (gadget.parameterSchema) {
387
- validateGadgetSchema(gadget.parameterSchema, name);
388
- }
389
- this.gadgets.set(normalizedName, gadget);
390
- }
391
- // Register a gadget using its name property or class name
392
- registerByClass(gadget) {
393
- const name = gadget.name ?? gadget.constructor.name;
394
- this.register(name, gadget);
395
- }
396
- // Get gadget by name (case-insensitive)
397
- get(name) {
398
- return this.gadgets.get(name.toLowerCase());
399
- }
400
- // Check if gadget exists (case-insensitive)
401
- has(name) {
402
- return this.gadgets.has(name.toLowerCase());
403
- }
404
- // Get all registered gadget names
405
- getNames() {
406
- return Array.from(this.gadgets.keys());
407
- }
408
- // Get all gadgets for instruction generation
409
- getAll() {
410
- return Array.from(this.gadgets.values());
411
- }
412
- // Unregister gadget (useful for testing, case-insensitive)
413
- unregister(name) {
414
- return this.gadgets.delete(name.toLowerCase());
415
- }
416
- // Clear all gadgets (useful for testing)
417
- clear() {
418
- this.gadgets.clear();
419
- }
420
- };
421
- }
422
- });
423
-
424
- // src/core/prompt-config.ts
425
- function resolvePromptTemplate(template, defaultValue, context) {
426
- const resolved = template ?? defaultValue;
427
- return typeof resolved === "function" ? resolved(context) : resolved;
428
- }
429
- function resolveRulesTemplate(rules, context) {
430
- const resolved = rules ?? DEFAULT_PROMPTS.rules;
431
- if (Array.isArray(resolved)) {
432
- return resolved;
433
- }
434
- if (typeof resolved === "function") {
435
- const result = resolved(context);
436
- return Array.isArray(result) ? result : [result];
437
- }
438
- return [resolved];
439
- }
440
- var DEFAULT_PROMPTS;
441
- var init_prompt_config = __esm({
442
- "src/core/prompt-config.ts"() {
443
- "use strict";
444
- DEFAULT_PROMPTS = {
445
- mainInstruction: [
446
- "\u26A0\uFE0F CRITICAL: RESPOND ONLY WITH GADGET INVOCATIONS",
447
- "DO NOT use function calling or tool calling",
448
- "You must output the exact text markers shown below in plain text.",
449
- "EACH MARKER MUST START WITH A NEWLINE."
450
- ].join("\n"),
451
- criticalUsage: "INVOKE gadgets using the markers - do not describe what you want to do.",
452
- formatDescription: (ctx) => `Parameters using ${ctx.argPrefix}name markers (value on next line(s), no escaping needed)`,
453
- rules: () => [
454
- "Output ONLY plain text with the exact markers - never use function/tool calling",
455
- "You can invoke multiple gadgets in a single response",
456
- "For dependent gadgets, invoke the first one and wait for the result"
457
- ],
458
- customExamples: null
459
- };
460
- }
461
- });
462
-
463
- // src/core/messages.ts
464
- var LLMMessageBuilder;
465
- var init_messages = __esm({
466
- "src/core/messages.ts"() {
289
+ // src/core/messages.ts
290
+ function normalizeContent(content) {
291
+ if (typeof content === "string") {
292
+ return [{ type: "text", text: content }];
293
+ }
294
+ return content;
295
+ }
296
+ function extractText(content) {
297
+ if (typeof content === "string") {
298
+ return content;
299
+ }
300
+ return content.filter((part) => part.type === "text").map((part) => part.text).join("");
301
+ }
302
+ var LLMMessageBuilder;
303
+ var init_messages = __esm({
304
+ "src/core/messages.ts"() {
467
305
  "use strict";
468
306
  init_constants();
307
+ init_input_content();
469
308
  init_prompt_config();
470
309
  LLMMessageBuilder = class {
471
310
  messages = [];
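A small sketch of the two helpers defined just above; they let history handling treat string and multimodal messages uniformly.

```typescript
// Sketch: multimodal content is an array of parts; only the text parts survive extractText.
const content = [
  { type: "text", text: "What is in " },
  { type: "image", source: { type: "url", url: "https://example.com/cat.jpg" } },
  { type: "text", text: "this image?" },
];

extractText(content);       // -> "What is in this image?"
extractText("plain text");  // -> "plain text"
normalizeContent("hi");     // -> [{ type: "text", text: "hi" }]
```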
@@ -567,6 +406,10 @@ CRITICAL: ${criticalUsage}
567
406
  parts.push(`
568
407
  1. Start marker: ${this.startPrefix}gadget_name`);
569
408
  parts.push(`
409
+ With ID: ${this.startPrefix}gadget_name:my_id`);
410
+ parts.push(`
411
+ With dependencies: ${this.startPrefix}gadget_name:my_id:dep1,dep2`);
412
+ parts.push(`
570
413
  2. ${formatDescription}`);
571
414
  parts.push(`
572
415
  3. End marker: ${this.endPrefix}`);
@@ -616,6 +459,25 @@ ${this.endPrefix}`;
616
459
  EXAMPLE (Multiple Gadgets):
617
460
 
618
461
  ${multipleExample}`);
462
+ const dependencyExample = `${this.startPrefix}fetch_data:fetch_1
463
+ ${this.argPrefix}url
464
+ https://api.example.com/users
465
+ ${this.endPrefix}
466
+ ${this.startPrefix}fetch_data:fetch_2
467
+ ${this.argPrefix}url
468
+ https://api.example.com/orders
469
+ ${this.endPrefix}
470
+ ${this.startPrefix}merge_data:merge_1:fetch_1,fetch_2
471
+ ${this.argPrefix}format
472
+ json
473
+ ${this.endPrefix}`;
474
+ parts.push(`
475
+
476
+ EXAMPLE (With Dependencies):
477
+ merge_1 waits for fetch_1 AND fetch_2 to complete.
478
+ If either fails, merge_1 is automatically skipped.
479
+
480
+ ${dependencyExample}`);
619
481
  parts.push(`
620
482
 
621
483
  BLOCK FORMAT SYNTAX:
@@ -634,89 +496,503 @@ class Calculator {
634
496
  }
635
497
  }
636
498
 
637
- BLOCK FORMAT RULES:
638
- - Each parameter starts with ${this.argPrefix}parameterName on its own line
639
- - The value starts on the NEXT line after the marker
640
- - Value ends when the next ${this.argPrefix} or ${this.endPrefix} appears
641
- - NO escaping needed - write values exactly as they should appear
642
- - Perfect for code, JSON, markdown, or any content with special characters
643
-
644
- NESTED OBJECTS (use / separator):
645
- ${this.argPrefix}config/timeout
646
- 30
647
- ${this.argPrefix}config/retries
648
- 3
649
- Produces: { "config": { "timeout": "30", "retries": "3" } }
650
-
651
- ARRAYS (use numeric indices):
652
- ${this.argPrefix}items/0
653
- first
654
- ${this.argPrefix}items/1
655
- second
656
- Produces: { "items": ["first", "second"] }`);
657
- return parts.join("");
658
- }
659
- buildRulesSection(context) {
660
- const parts = [];
661
- parts.push("\n\nRULES:");
662
- const rules = resolveRulesTemplate(this.promptConfig.rules, context);
663
- for (const rule of rules) {
664
- parts.push(`
665
- - ${rule}`);
499
+ BLOCK FORMAT RULES:
500
+ - Each parameter starts with ${this.argPrefix}parameterName on its own line
501
+ - The value starts on the NEXT line after the marker
502
+ - Value ends when the next ${this.argPrefix} or ${this.endPrefix} appears
503
+ - NO escaping needed - write values exactly as they should appear
504
+ - Perfect for code, JSON, markdown, or any content with special characters
505
+
506
+ NESTED OBJECTS (use / separator):
507
+ ${this.argPrefix}config/timeout
508
+ 30
509
+ ${this.argPrefix}config/retries
510
+ 3
511
+ Produces: { "config": { "timeout": "30", "retries": "3" } }
512
+
513
+ ARRAYS (use numeric indices):
514
+ ${this.argPrefix}items/0
515
+ first
516
+ ${this.argPrefix}items/1
517
+ second
518
+ Produces: { "items": ["first", "second"] }`);
519
+ return parts.join("");
520
+ }
521
+ buildRulesSection(context) {
522
+ const parts = [];
523
+ parts.push("\n\nRULES:");
524
+ const rules = resolveRulesTemplate(this.promptConfig.rules, context);
525
+ for (const rule of rules) {
526
+ parts.push(`
527
+ - ${rule}`);
528
+ }
529
+ return parts.join("");
530
+ }
531
+ /**
532
+ * Add a user message.
533
+ * Content can be a string (text only) or an array of content parts (multimodal).
534
+ *
535
+ * @param content - Message content
536
+ * @param metadata - Optional metadata
537
+ *
538
+ * @example
539
+ * ```typescript
540
+ * // Text only
541
+ * builder.addUser("Hello!");
542
+ *
543
+ * // Multimodal
544
+ * builder.addUser([
545
+ * text("What's in this image?"),
546
+ * imageFromBuffer(imageData),
547
+ * ]);
548
+ * ```
549
+ */
550
+ addUser(content, metadata) {
551
+ this.messages.push({ role: "user", content, metadata });
552
+ return this;
553
+ }
554
+ addAssistant(content, metadata) {
555
+ this.messages.push({ role: "assistant", content, metadata });
556
+ return this;
557
+ }
558
+ /**
559
+ * Add a user message with an image attachment.
560
+ *
561
+ * @param textContent - Text prompt
562
+ * @param imageData - Image data (Buffer, Uint8Array, or base64 string)
563
+ * @param mimeType - Optional MIME type (auto-detected if not provided)
564
+ *
565
+ * @example
566
+ * ```typescript
567
+ * builder.addUserWithImage(
568
+ * "What's in this image?",
569
+ * await fs.readFile("photo.jpg"),
570
+ * "image/jpeg" // Optional - auto-detected
571
+ * );
572
+ * ```
573
+ */
574
+ addUserWithImage(textContent, imageData, mimeType) {
575
+ const imageBuffer = typeof imageData === "string" ? Buffer.from(imageData, "base64") : imageData;
576
+ const detectedMime = mimeType ?? detectImageMimeType(imageBuffer);
577
+ if (!detectedMime) {
578
+ throw new Error(
579
+ "Could not detect image MIME type. Please provide the mimeType parameter explicitly."
580
+ );
581
+ }
582
+ const content = [
583
+ text(textContent),
584
+ {
585
+ type: "image",
586
+ source: {
587
+ type: "base64",
588
+ mediaType: detectedMime,
589
+ data: toBase64(imageBuffer)
590
+ }
591
+ }
592
+ ];
593
+ this.messages.push({ role: "user", content });
594
+ return this;
595
+ }
596
+ /**
597
+ * Add a user message with an image URL (OpenAI only).
598
+ *
599
+ * @param textContent - Text prompt
600
+ * @param imageUrl - URL to the image
601
+ *
602
+ * @example
603
+ * ```typescript
604
+ * builder.addUserWithImageUrl(
605
+ * "What's in this image?",
606
+ * "https://example.com/image.jpg"
607
+ * );
608
+ * ```
609
+ */
610
+ addUserWithImageUrl(textContent, imageUrl) {
611
+ const content = [text(textContent), imageFromUrl(imageUrl)];
612
+ this.messages.push({ role: "user", content });
613
+ return this;
614
+ }
615
+ /**
616
+ * Add a user message with an audio attachment (Gemini only).
617
+ *
618
+ * @param textContent - Text prompt
619
+ * @param audioData - Audio data (Buffer, Uint8Array, or base64 string)
620
+ * @param mimeType - Optional MIME type (auto-detected if not provided)
621
+ *
622
+ * @example
623
+ * ```typescript
624
+ * builder.addUserWithAudio(
625
+ * "Transcribe this audio",
626
+ * await fs.readFile("recording.mp3"),
627
+ * "audio/mp3" // Optional - auto-detected
628
+ * );
629
+ * ```
630
+ */
631
+ addUserWithAudio(textContent, audioData, mimeType) {
632
+ const audioBuffer = typeof audioData === "string" ? Buffer.from(audioData, "base64") : audioData;
633
+ const content = [text(textContent), audioFromBuffer(audioBuffer, mimeType)];
634
+ this.messages.push({ role: "user", content });
635
+ return this;
636
+ }
637
+ /**
638
+ * Add a user message with multiple content parts.
639
+ * Provides full flexibility for complex multimodal messages.
640
+ *
641
+ * @param parts - Array of content parts
642
+ *
643
+ * @example
644
+ * ```typescript
645
+ * builder.addUserMultimodal([
646
+ * text("Compare these images:"),
647
+ * imageFromBuffer(image1),
648
+ * imageFromBuffer(image2),
649
+ * ]);
650
+ * ```
651
+ */
652
+ addUserMultimodal(parts) {
653
+ this.messages.push({ role: "user", content: parts });
654
+ return this;
655
+ }
656
+ addGadgetCall(gadget, parameters, result) {
657
+ const paramStr = this.formatBlockParameters(parameters, "");
658
+ this.messages.push({
659
+ role: "assistant",
660
+ content: `${this.startPrefix}${gadget}
661
+ ${paramStr}
662
+ ${this.endPrefix}`
663
+ });
664
+ this.messages.push({
665
+ role: "user",
666
+ content: `Result: ${result}`
667
+ });
668
+ return this;
669
+ }
670
+ /**
671
+ * Format parameters as Block format with JSON Pointer paths.
672
+ * Uses the configured argPrefix for consistency with system prompt.
673
+ */
674
+ formatBlockParameters(params, prefix) {
675
+ const lines = [];
676
+ for (const [key, value] of Object.entries(params)) {
677
+ const fullPath = prefix ? `${prefix}/${key}` : key;
678
+ if (Array.isArray(value)) {
679
+ value.forEach((item, index) => {
680
+ const itemPath = `${fullPath}/${index}`;
681
+ if (typeof item === "object" && item !== null) {
682
+ lines.push(this.formatBlockParameters(item, itemPath));
683
+ } else {
684
+ lines.push(`${this.argPrefix}${itemPath}`);
685
+ lines.push(String(item));
686
+ }
687
+ });
688
+ } else if (typeof value === "object" && value !== null) {
689
+ lines.push(this.formatBlockParameters(value, fullPath));
690
+ } else {
691
+ lines.push(`${this.argPrefix}${fullPath}`);
692
+ lines.push(String(value));
693
+ }
694
+ }
695
+ return lines.join("\n");
696
+ }
697
+ build() {
698
+ return [...this.messages];
699
+ }
700
+ };
701
+ }
702
+ });
703
+
704
+ // src/core/model-shortcuts.ts
705
+ function isKnownModelPattern(model) {
706
+ const normalized = model.toLowerCase();
707
+ if (MODEL_ALIASES[normalized]) {
708
+ return true;
709
+ }
710
+ return KNOWN_MODEL_PATTERNS.some((pattern) => pattern.test(model));
711
+ }
712
+ function resolveModel(model, options = {}) {
713
+ if (model.includes(":")) {
714
+ return model;
715
+ }
716
+ const normalized = model.toLowerCase();
717
+ if (MODEL_ALIASES[normalized]) {
718
+ return MODEL_ALIASES[normalized];
719
+ }
720
+ const modelLower = model.toLowerCase();
721
+ if (modelLower.startsWith("gpt")) {
722
+ return `openai:${model}`;
723
+ }
724
+ if (modelLower.startsWith("claude")) {
725
+ return `anthropic:${model}`;
726
+ }
727
+ if (modelLower.startsWith("gemini")) {
728
+ return `gemini:${model}`;
729
+ }
730
+ if (modelLower.match(/^o\d/)) {
731
+ return `openai:${model}`;
732
+ }
733
+ if (!isKnownModelPattern(model)) {
734
+ if (options.strict) {
735
+ throw new Error(
736
+ `Unknown model '${model}'. Did you mean one of: gpt4, sonnet, haiku, flash? Use explicit provider prefix like 'openai:${model}' to bypass this check.`
737
+ );
738
+ }
739
+ if (!options.silent) {
740
+ console.warn(
741
+ `\u26A0\uFE0F Unknown model '${model}', falling back to 'openai:${model}'. This might be a typo. Did you mean: gpt4, gpt5, gpt5-nano, sonnet, haiku, flash? Use { strict: true } to error on unknown models, or { silent: true } to suppress this warning.`
742
+ );
743
+ }
744
+ }
745
+ return `openai:${model}`;
746
+ }
747
+ var MODEL_ALIASES, KNOWN_MODEL_PATTERNS;
748
+ var init_model_shortcuts = __esm({
749
+ "src/core/model-shortcuts.ts"() {
750
+ "use strict";
751
+ MODEL_ALIASES = {
752
+ // OpenAI aliases
753
+ gpt4: "openai:gpt-4o",
754
+ gpt4o: "openai:gpt-4o",
755
+ gpt5: "openai:gpt-5",
756
+ "gpt5-mini": "openai:gpt-5-mini",
757
+ "gpt5-nano": "openai:gpt-5-nano",
758
+ // Anthropic aliases
759
+ sonnet: "anthropic:claude-sonnet-4-5",
760
+ "claude-sonnet": "anthropic:claude-sonnet-4-5",
761
+ haiku: "anthropic:claude-haiku-4-5",
762
+ "claude-haiku": "anthropic:claude-haiku-4-5",
763
+ opus: "anthropic:claude-opus-4-5",
764
+ "claude-opus": "anthropic:claude-opus-4-5",
765
+ // Gemini aliases
766
+ flash: "gemini:gemini-2.0-flash",
767
+ "gemini-flash": "gemini:gemini-2.0-flash",
768
+ "gemini-pro": "gemini:gemini-2.5-pro",
769
+ pro: "gemini:gemini-2.5-pro"
770
+ };
771
+ KNOWN_MODEL_PATTERNS = [
772
+ /^gpt-?\d/i,
773
+ // gpt-4, gpt-3.5, gpt4, etc.
774
+ /^claude-?\d/i,
775
+ // claude-3, claude-2, etc.
776
+ /^gemini-?(\d|pro|flash)/i,
777
+ // gemini-2.0, gemini-pro, gemini-flash, etc.
778
+ /^o\d/i
779
+ // OpenAI o1, o3, etc.
780
+ ];
781
+ }
782
+ });
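For reference, how the alias table, provider prefixes, and the fallback interact; the outputs follow directly from resolveModel above.

```typescript
resolveModel("sonnet");            // -> "anthropic:claude-sonnet-4-5"  (MODEL_ALIASES)
resolveModel("gpt-4o");            // -> "openai:gpt-4o"                (gpt* prefix)
resolveModel("o3-mini");           // -> "openai:o3-mini"               (/^o\d/ pattern)
resolveModel("anthropic:custom");  // -> "anthropic:custom"             (explicit prefix bypasses all checks)
resolveModel("my-model");          // warns, then falls back to "openai:my-model"
resolveModel("my-model", { strict: true }); // throws: Unknown model 'my-model'
```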
783
+
784
+ // src/gadgets/schema-validator.ts
785
+ function validateGadgetSchema(schema, gadgetName) {
786
+ let jsonSchema;
787
+ try {
788
+ jsonSchema = z.toJSONSchema(schema, { target: "draft-7" });
789
+ } catch (error) {
790
+ const errorMessage = error instanceof Error ? error.message : String(error);
791
+ throw new Error(
792
+ `Gadget "${gadgetName}" has a schema that cannot be serialized to JSON Schema.
793
+ This usually happens with unsupported patterns like:
794
+ - z.record() - use z.object({}).passthrough() instead
795
+ - Complex transforms or custom refinements
796
+ - Circular references
797
+
798
+ Original error: ${errorMessage}
799
+
800
+ Only use schema patterns that Zod v4's native toJSONSchema() supports.`
801
+ );
802
+ }
803
+ const issues = findUnknownTypes(jsonSchema);
804
+ if (issues.length > 0) {
805
+ const fieldList = issues.join(", ");
806
+ throw new Error(
807
+ `Gadget "${gadgetName}" uses z.unknown() which produces incomplete schemas.
808
+ Problematic fields: ${fieldList}
809
+
810
+ z.unknown() doesn't generate type information in JSON Schema, making it unclear
811
+ to the LLM what data structure to provide.
812
+
813
+ Suggestions:
814
+ - Use z.object({}).passthrough() for flexible objects
815
+ - Use z.record(z.string()) for key-value objects with string values
816
+ - Define specific structure if possible
817
+
818
+ Example fixes:
819
+ // \u274C Bad
820
+ content: z.unknown()
821
+
822
+ // \u2705 Good
823
+ content: z.object({}).passthrough() // for flexible objects
824
+ content: z.record(z.string()) // for key-value objects
825
+ content: z.array(z.string()) // for arrays of strings
826
+ `
827
+ );
828
+ }
829
+ }
830
+ function findUnknownTypes(schema, path = []) {
831
+ const issues = [];
832
+ if (!schema || typeof schema !== "object") {
833
+ return issues;
834
+ }
835
+ if (schema.definitions) {
836
+ for (const defSchema of Object.values(schema.definitions)) {
837
+ issues.push(...findUnknownTypes(defSchema, []));
838
+ }
839
+ }
840
+ if (schema.properties) {
841
+ for (const [propName, propSchema] of Object.entries(schema.properties)) {
842
+ const propPath = [...path, propName];
843
+ if (hasNoType(propSchema)) {
844
+ issues.push(propPath.join(".") || propName);
845
+ }
846
+ issues.push(...findUnknownTypes(propSchema, propPath));
847
+ }
848
+ }
849
+ if (schema.items) {
850
+ const itemPath = [...path, "[]"];
851
+ if (hasNoType(schema.items)) {
852
+ issues.push(itemPath.join("."));
853
+ }
854
+ issues.push(...findUnknownTypes(schema.items, itemPath));
855
+ }
856
+ if (schema.anyOf) {
857
+ schema.anyOf.forEach((subSchema, index) => {
858
+ issues.push(...findUnknownTypes(subSchema, [...path, `anyOf[${index}]`]));
859
+ });
860
+ }
861
+ if (schema.oneOf) {
862
+ schema.oneOf.forEach((subSchema, index) => {
863
+ issues.push(...findUnknownTypes(subSchema, [...path, `oneOf[${index}]`]));
864
+ });
865
+ }
866
+ if (schema.allOf) {
867
+ schema.allOf.forEach((subSchema, index) => {
868
+ issues.push(...findUnknownTypes(subSchema, [...path, `allOf[${index}]`]));
869
+ });
870
+ }
871
+ return issues;
872
+ }
873
+ function hasNoType(prop) {
874
+ if (!prop || typeof prop !== "object") {
875
+ return false;
876
+ }
877
+ const hasType = prop.type !== void 0;
878
+ const hasRef = prop.$ref !== void 0;
879
+ const hasUnion = prop.anyOf !== void 0 || prop.oneOf !== void 0 || prop.allOf !== void 0;
880
+ if (hasType || hasRef || hasUnion) {
881
+ return false;
882
+ }
883
+ const keys = Object.keys(prop);
884
+ const metadataKeys = ["description", "title", "default", "examples"];
885
+ const hasOnlyMetadata = keys.every((key) => metadataKeys.includes(key));
886
+ return hasOnlyMetadata || keys.length === 0;
887
+ }
888
+ var z;
889
+ var init_schema_validator = __esm({
890
+ "src/gadgets/schema-validator.ts"() {
891
+ "use strict";
892
+ z = __toESM(require("zod"), 1);
893
+ }
894
+ });
895
+
896
+ // src/gadgets/registry.ts
897
+ var GadgetRegistry;
898
+ var init_registry = __esm({
899
+ "src/gadgets/registry.ts"() {
900
+ "use strict";
901
+ init_schema_validator();
902
+ GadgetRegistry = class _GadgetRegistry {
903
+ gadgets = /* @__PURE__ */ new Map();
904
+ /**
905
+ * Creates a registry from an array of gadget classes or instances,
906
+ * or an object mapping names to gadgets.
907
+ *
908
+ * @param gadgets - Array of gadgets/classes or object with custom names
909
+ * @returns New GadgetRegistry with all gadgets registered
910
+ *
911
+ * @example
912
+ * ```typescript
913
+ * // From array of classes
914
+ * const registry = GadgetRegistry.from([Calculator, Weather]);
915
+ *
916
+ * // From array of instances
917
+ * const registry = GadgetRegistry.from([new Calculator(), new Weather()]);
918
+ *
919
+ * // From object with custom names
920
+ * const registry = GadgetRegistry.from({
921
+ * calc: Calculator,
922
+ * weather: new Weather({ apiKey: "..." })
923
+ * });
924
+ * ```
925
+ */
926
+ static from(gadgets) {
927
+ const registry = new _GadgetRegistry();
928
+ if (Array.isArray(gadgets)) {
929
+ registry.registerMany(gadgets);
930
+ } else {
931
+ for (const [name, gadget] of Object.entries(gadgets)) {
932
+ const instance = typeof gadget === "function" ? new gadget() : gadget;
933
+ registry.register(name, instance);
934
+ }
666
935
  }
667
- return parts.join("");
936
+ return registry;
668
937
  }
669
- addUser(content, metadata) {
670
- this.messages.push({ role: "user", content, metadata });
938
+ /**
939
+ * Registers multiple gadgets at once from an array.
940
+ *
941
+ * @param gadgets - Array of gadget instances or classes
942
+ * @returns This registry for chaining
943
+ *
944
+ * @example
945
+ * ```typescript
946
+ * registry.registerMany([Calculator, Weather, Email]);
947
+ * registry.registerMany([new Calculator(), new Weather()]);
948
+ * ```
949
+ */
950
+ registerMany(gadgets) {
951
+ for (const gadget of gadgets) {
952
+ const instance = typeof gadget === "function" ? new gadget() : gadget;
953
+ this.registerByClass(instance);
954
+ }
671
955
  return this;
672
956
  }
673
- addAssistant(content, metadata) {
674
- this.messages.push({ role: "assistant", content, metadata });
675
- return this;
957
+ // Register a gadget by name
958
+ register(name, gadget) {
959
+ const normalizedName = name.toLowerCase();
960
+ if (this.gadgets.has(normalizedName)) {
961
+ throw new Error(`Gadget '${name}' is already registered`);
962
+ }
963
+ if (gadget.parameterSchema) {
964
+ validateGadgetSchema(gadget.parameterSchema, name);
965
+ }
966
+ this.gadgets.set(normalizedName, gadget);
676
967
  }
677
- addGadgetCall(gadget, parameters, result) {
678
- const paramStr = this.formatBlockParameters(parameters, "");
679
- this.messages.push({
680
- role: "assistant",
681
- content: `${this.startPrefix}${gadget}
682
- ${paramStr}
683
- ${this.endPrefix}`
684
- });
685
- this.messages.push({
686
- role: "user",
687
- content: `Result: ${result}`
688
- });
689
- return this;
968
+ // Register a gadget using its name property or class name
969
+ registerByClass(gadget) {
970
+ const name = gadget.name ?? gadget.constructor.name;
971
+ this.register(name, gadget);
690
972
  }
691
- /**
692
- * Format parameters as Block format with JSON Pointer paths.
693
- * Uses the configured argPrefix for consistency with system prompt.
694
- */
695
- formatBlockParameters(params, prefix) {
696
- const lines = [];
697
- for (const [key, value] of Object.entries(params)) {
698
- const fullPath = prefix ? `${prefix}/${key}` : key;
699
- if (Array.isArray(value)) {
700
- value.forEach((item, index) => {
701
- const itemPath = `${fullPath}/${index}`;
702
- if (typeof item === "object" && item !== null) {
703
- lines.push(this.formatBlockParameters(item, itemPath));
704
- } else {
705
- lines.push(`${this.argPrefix}${itemPath}`);
706
- lines.push(String(item));
707
- }
708
- });
709
- } else if (typeof value === "object" && value !== null) {
710
- lines.push(this.formatBlockParameters(value, fullPath));
711
- } else {
712
- lines.push(`${this.argPrefix}${fullPath}`);
713
- lines.push(String(value));
714
- }
715
- }
716
- return lines.join("\n");
973
+ // Get gadget by name (case-insensitive)
974
+ get(name) {
975
+ return this.gadgets.get(name.toLowerCase());
717
976
  }
718
- build() {
719
- return [...this.messages];
977
+ // Check if gadget exists (case-insensitive)
978
+ has(name) {
979
+ return this.gadgets.has(name.toLowerCase());
980
+ }
981
+ // Get all registered gadget names
982
+ getNames() {
983
+ return Array.from(this.gadgets.keys());
984
+ }
985
+ // Get all gadgets for instruction generation
986
+ getAll() {
987
+ return Array.from(this.gadgets.values());
988
+ }
989
+ // Unregister gadget (useful for testing, case-insensitive)
990
+ unregister(name) {
991
+ return this.gadgets.delete(name.toLowerCase());
992
+ }
993
+ // Clear all gadgets (useful for testing)
994
+ clear() {
995
+ this.gadgets.clear();
720
996
  }
721
997
  };
722
998
  }
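A short sketch of the registry behaviour documented in the comments above: lookups are case-insensitive and duplicate names are rejected. `Calculator` is the placeholder gadget class from the JSDoc examples, not a real export.

```typescript
// Sketch using the hypothetical Calculator gadget from the JSDoc above.
const registry = GadgetRegistry.from([Calculator]); // registered under its class name

registry.has("calculator");   // true  - names are normalized to lower case
registry.get("CALCULATOR");   // same instance as get("calculator")
registry.register("Calculator", new Calculator()); // throws: Gadget 'Calculator' is already registered
```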
@@ -1913,7 +2189,7 @@ var init_conversation_manager = __esm({
1913
2189
  if (msg.role === "user") {
1914
2190
  this.historyBuilder.addUser(msg.content);
1915
2191
  } else if (msg.role === "assistant") {
1916
- this.historyBuilder.addAssistant(msg.content);
2192
+ this.historyBuilder.addAssistant(extractText(msg.content));
1917
2193
  }
1918
2194
  }
1919
2195
  }
@@ -1934,8 +2210,10 @@ async function runWithHandlers(agentGenerator, handlers) {
1934
2210
  if (handlers.onGadgetCall) {
1935
2211
  await handlers.onGadgetCall({
1936
2212
  gadgetName: event.call.gadgetName,
2213
+ invocationId: event.call.invocationId,
1937
2214
  parameters: event.call.parameters,
1938
- parametersRaw: event.call.parametersRaw
2215
+ parametersRaw: event.call.parametersRaw,
2216
+ dependencies: event.call.dependencies
1939
2217
  });
1940
2218
  }
1941
2219
  break;
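A hedged sketch of a handlers object consuming the two new fields; the handler shape is inferred from the call site above, and the generator argument is illustrative.

```typescript
// Sketch: onGadgetCall now also receives invocationId and dependencies.
const handlers = {
  onGadgetCall: async ({ gadgetName, invocationId, parameters, dependencies }) => {
    console.log(`${invocationId}: ${gadgetName}(${JSON.stringify(parameters)})`);
    if (dependencies.length > 0) {
      console.log(`  waits for: ${dependencies.join(", ")}`);
    }
  },
};
// await runWithHandlers(agent.run(), handlers); // agent.run() stands in for any agent generator
```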
@@ -2497,7 +2775,27 @@ var init_cost_reporting_client = __esm({
2497
2775
  constructor(client, reportCost) {
2498
2776
  this.client = client;
2499
2777
  this.reportCost = reportCost;
2778
+ this.image = {
2779
+ generate: async (options) => {
2780
+ const result = await this.client.image.generate(options);
2781
+ if (result.cost !== void 0 && result.cost > 0) {
2782
+ this.reportCost(result.cost);
2783
+ }
2784
+ return result;
2785
+ }
2786
+ };
2787
+ this.speech = {
2788
+ generate: async (options) => {
2789
+ const result = await this.client.speech.generate(options);
2790
+ if (result.cost !== void 0 && result.cost > 0) {
2791
+ this.reportCost(result.cost);
2792
+ }
2793
+ return result;
2794
+ }
2795
+ };
2500
2796
  }
2797
+ image;
2798
+ speech;
2501
2799
  /**
2502
2800
  * Access to model registry for cost estimation.
2503
2801
  */
@@ -2762,15 +3060,37 @@ var init_parser = __esm({
2762
3060
  return segment.trim().length > 0 ? segment : void 0;
2763
3061
  }
2764
3062
  /**
2765
- * Parse gadget name, handling both old format (name:invocationId) and new format (just name).
2766
- * For new format, generates a unique invocation ID.
3063
+ * Parse gadget name with optional invocation ID and dependencies.
3064
+ *
3065
+ * Supported formats:
3066
+ * - `GadgetName` - Auto-generate ID, no dependencies
3067
+ * - `GadgetName:my_id` - Explicit ID, no dependencies
3068
+ * - `GadgetName:my_id:dep1,dep2` - Explicit ID with dependencies
3069
+ *
3070
+ * Dependencies must be comma-separated invocation IDs.
2767
3071
  */
2768
3072
  parseGadgetName(gadgetName) {
2769
- if (gadgetName.includes(":")) {
2770
- const parts = gadgetName.split(":");
2771
- return { actualName: parts[0], invocationId: parts[1] };
3073
+ const parts = gadgetName.split(":");
3074
+ if (parts.length === 1) {
3075
+ return {
3076
+ actualName: parts[0],
3077
+ invocationId: `gadget_${++globalInvocationCounter}`,
3078
+ dependencies: []
3079
+ };
3080
+ } else if (parts.length === 2) {
3081
+ return {
3082
+ actualName: parts[0],
3083
+ invocationId: parts[1].trim(),
3084
+ dependencies: []
3085
+ };
3086
+ } else {
3087
+ const deps = parts[2].split(",").map((d) => d.trim()).filter((d) => d.length > 0);
3088
+ return {
3089
+ actualName: parts[0],
3090
+ invocationId: parts[1].trim(),
3091
+ dependencies: deps
3092
+ };
2772
3093
  }
2773
- return { actualName: gadgetName, invocationId: `gadget_${++globalInvocationCounter}` };
2774
3094
  }
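Concretely, the three header formats parse as follows (the method is shown without its parser instance for brevity; auto-generated IDs depend on the global counter):

```typescript
parseGadgetName("fetch_data");
// -> { actualName: "fetch_data", invocationId: "gadget_1", dependencies: [] }   (counter-based ID)

parseGadgetName("fetch_data:users");
// -> { actualName: "fetch_data", invocationId: "users", dependencies: [] }

parseGadgetName("merge_data:merge_1:fetch_1, fetch_2");
// -> { actualName: "merge_data", invocationId: "merge_1", dependencies: ["fetch_1", "fetch_2"] }
```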
2775
3095
  /**
2776
3096
  * Extract the error message from a parse error.
@@ -2806,39 +3126,20 @@ var init_parser = __esm({
2806
3126
  const metadataEndIndex = this.buffer.indexOf("\n", metadataStartIndex);
2807
3127
  if (metadataEndIndex === -1) break;
2808
3128
  const gadgetName = this.buffer.substring(metadataStartIndex, metadataEndIndex).trim();
2809
- const { actualName: actualGadgetName, invocationId } = this.parseGadgetName(gadgetName);
3129
+ const { actualName: actualGadgetName, invocationId, dependencies } = this.parseGadgetName(gadgetName);
2810
3130
  const contentStartIndex = metadataEndIndex + 1;
2811
3131
  let partEndIndex;
2812
3132
  let endMarkerLength = 0;
2813
- if (gadgetName.includes(":")) {
2814
- const oldEndMarker = `${this.endPrefix + actualGadgetName}:${invocationId}`;
2815
- partEndIndex = this.buffer.indexOf(oldEndMarker, contentStartIndex);
2816
- if (partEndIndex === -1) break;
2817
- endMarkerLength = oldEndMarker.length;
3133
+ const nextStartPos = this.buffer.indexOf(this.startPrefix, contentStartIndex);
3134
+ const endPos = this.buffer.indexOf(this.endPrefix, contentStartIndex);
3135
+ if (nextStartPos !== -1 && (endPos === -1 || nextStartPos < endPos)) {
3136
+ partEndIndex = nextStartPos;
3137
+ endMarkerLength = 0;
3138
+ } else if (endPos !== -1) {
3139
+ partEndIndex = endPos;
3140
+ endMarkerLength = this.endPrefix.length;
2818
3141
  } else {
2819
- const nextStartPos = this.buffer.indexOf(this.startPrefix, contentStartIndex);
2820
- let validEndPos = -1;
2821
- let searchPos = contentStartIndex;
2822
- while (true) {
2823
- const endPos = this.buffer.indexOf(this.endPrefix, searchPos);
2824
- if (endPos === -1) break;
2825
- const afterEnd = this.buffer.substring(endPos + this.endPrefix.length);
2826
- if (afterEnd.startsWith("\n") || afterEnd.startsWith("\r") || afterEnd.startsWith(this.startPrefix) || afterEnd.length === 0) {
2827
- validEndPos = endPos;
2828
- break;
2829
- } else {
2830
- searchPos = endPos + this.endPrefix.length;
2831
- }
2832
- }
2833
- if (nextStartPos !== -1 && (validEndPos === -1 || nextStartPos < validEndPos)) {
2834
- partEndIndex = nextStartPos;
2835
- endMarkerLength = 0;
2836
- } else if (validEndPos !== -1) {
2837
- partEndIndex = validEndPos;
2838
- endMarkerLength = this.endPrefix.length;
2839
- } else {
2840
- break;
2841
- }
3142
+ break;
2842
3143
  }
2843
3144
  const parametersRaw = this.buffer.substring(contentStartIndex, partEndIndex).trim();
2844
3145
  const { parameters, parseError } = this.parseParameters(parametersRaw);
@@ -2849,7 +3150,8 @@ var init_parser = __esm({
2849
3150
  invocationId,
2850
3151
  parametersRaw,
2851
3152
  parameters,
2852
- parseError
3153
+ parseError,
3154
+ dependencies
2853
3155
  }
2854
3156
  };
2855
3157
  startIndex = partEndIndex + endMarkerLength;
@@ -2872,7 +3174,7 @@ var init_parser = __esm({
2872
3174
  const metadataEndIndex = this.buffer.indexOf("\n", metadataStartIndex);
2873
3175
  if (metadataEndIndex !== -1) {
2874
3176
  const gadgetName = this.buffer.substring(metadataStartIndex, metadataEndIndex).trim();
2875
- const { actualName: actualGadgetName, invocationId } = this.parseGadgetName(gadgetName);
3177
+ const { actualName: actualGadgetName, invocationId, dependencies } = this.parseGadgetName(gadgetName);
2876
3178
  const contentStartIndex = metadataEndIndex + 1;
2877
3179
  const parametersRaw = this.buffer.substring(contentStartIndex).trim();
2878
3180
  const { parameters, parseError } = this.parseParameters(parametersRaw);
@@ -2883,7 +3185,8 @@ var init_parser = __esm({
2883
3185
  invocationId,
2884
3186
  parametersRaw,
2885
3187
  parameters,
2886
- parseError
3188
+ parseError,
3189
+ dependencies
2887
3190
  }
2888
3191
  };
2889
3192
  return;
@@ -3253,6 +3556,13 @@ var init_stream_processor = __esm({
3253
3556
  accumulatedText = "";
3254
3557
  shouldStopExecution = false;
3255
3558
  observerFailureCount = 0;
3559
+ // Dependency tracking for gadget execution DAG
3560
+ /** Gadgets waiting for their dependencies to complete */
3561
+ pendingGadgets = /* @__PURE__ */ new Map();
3562
+ /** Completed gadget results, keyed by invocation ID */
3563
+ completedResults = /* @__PURE__ */ new Map();
3564
+ /** Invocation IDs of gadgets that have failed (error or skipped due to dependency) */
3565
+ failedInvocations = /* @__PURE__ */ new Set();
3256
3566
  constructor(options) {
3257
3567
  this.iteration = options.iteration;
3258
3568
  this.registry = options.registry;
@@ -3353,6 +3663,16 @@ var init_stream_processor = __esm({
3353
3663
  }
3354
3664
  }
3355
3665
  }
3666
+ const finalPendingEvents = await this.processPendingGadgets();
3667
+ outputs.push(...finalPendingEvents);
3668
+ if (finalPendingEvents.some((e) => e.type === "gadget_result")) {
3669
+ didExecuteGadgets = true;
3670
+ }
3671
+ for (const evt of finalPendingEvents) {
3672
+ if (evt.type === "gadget_result" && evt.result.breaksLoop) {
3673
+ shouldBreakLoop = true;
3674
+ }
3675
+ }
3356
3676
  }
3357
3677
  let finalMessage = this.accumulatedText;
3358
3678
  if (this.hooks.interceptors?.interceptAssistantMessage) {
@@ -3404,7 +3724,11 @@ var init_stream_processor = __esm({
3404
3724
  return [{ type: "text", content }];
3405
3725
  }
3406
3726
  /**
3407
- * Process a gadget call through the full lifecycle.
3727
+ * Process a gadget call through the full lifecycle, handling dependencies.
3728
+ *
3729
+ * Gadgets without dependencies (or with all dependencies satisfied) execute immediately.
3730
+ * Gadgets with unsatisfied dependencies are queued for later execution.
3731
+ * After each execution, pending gadgets are checked to see if they can now run.
3408
3732
  */
3409
3733
  async processGadgetCall(call) {
3410
3734
  if (this.shouldStopExecution) {
@@ -3415,6 +3739,53 @@ var init_stream_processor = __esm({
3415
3739
  }
3416
3740
  const events = [];
3417
3741
  events.push({ type: "gadget_call", call });
3742
+ if (call.dependencies.length > 0) {
3743
+ if (call.dependencies.includes(call.invocationId)) {
3744
+ this.logger.warn("Gadget has self-referential dependency (depends on itself)", {
3745
+ gadgetName: call.gadgetName,
3746
+ invocationId: call.invocationId
3747
+ });
3748
+ this.failedInvocations.add(call.invocationId);
3749
+ const skipEvent = {
3750
+ type: "gadget_skipped",
3751
+ gadgetName: call.gadgetName,
3752
+ invocationId: call.invocationId,
3753
+ parameters: call.parameters ?? {},
3754
+ failedDependency: call.invocationId,
3755
+ failedDependencyError: `Gadget "${call.invocationId}" cannot depend on itself (self-referential dependency)`
3756
+ };
3757
+ events.push(skipEvent);
3758
+ return events;
3759
+ }
3760
+ const failedDep = call.dependencies.find((dep) => this.failedInvocations.has(dep));
3761
+ if (failedDep) {
3762
+ const skipEvents = await this.handleFailedDependency(call, failedDep);
3763
+ events.push(...skipEvents);
3764
+ return events;
3765
+ }
3766
+ const unsatisfied = call.dependencies.filter((dep) => !this.completedResults.has(dep));
3767
+ if (unsatisfied.length > 0) {
3768
+ this.logger.debug("Queueing gadget for later - waiting on dependencies", {
3769
+ gadgetName: call.gadgetName,
3770
+ invocationId: call.invocationId,
3771
+ waitingOn: unsatisfied
3772
+ });
3773
+ this.pendingGadgets.set(call.invocationId, call);
3774
+ return events;
3775
+ }
3776
+ }
3777
+ const executeEvents = await this.executeGadgetWithHooks(call);
3778
+ events.push(...executeEvents);
3779
+ const triggeredEvents = await this.processPendingGadgets();
3780
+ events.push(...triggeredEvents);
3781
+ return events;
3782
+ }
3783
+ /**
3784
+ * Execute a gadget through the full hook lifecycle.
3785
+ * This is the core execution logic, extracted from processGadgetCall.
3786
+ */
3787
+ async executeGadgetWithHooks(call) {
3788
+ const events = [];
3418
3789
  if (call.parseError) {
3419
3790
  this.logger.warn("Gadget has parse error", {
3420
3791
  gadgetName: call.gadgetName,
@@ -3545,6 +3916,10 @@ var init_stream_processor = __esm({
3545
3916
  });
3546
3917
  }
3547
3918
  await this.runObserversInParallel(completeObservers);
3919
+ this.completedResults.set(result.invocationId, result);
3920
+ if (result.error) {
3921
+ this.failedInvocations.add(result.invocationId);
3922
+ }
3548
3923
  events.push({ type: "gadget_result", result });
3549
3924
  if (result.error) {
3550
3925
  const errorType = this.determineErrorType(call, result);
@@ -3560,6 +3935,162 @@ var init_stream_processor = __esm({
3560
3935
  }
3561
3936
  return events;
3562
3937
  }
3938
+ /**
3939
+ * Handle a gadget that cannot execute because a dependency failed.
3940
+ * Calls the onDependencySkipped controller to allow customization.
3941
+ */
3942
+ async handleFailedDependency(call, failedDep) {
3943
+ const events = [];
3944
+ const depResult = this.completedResults.get(failedDep);
3945
+ const depError = depResult?.error ?? "Dependency failed";
3946
+ let action = { action: "skip" };
3947
+ if (this.hooks.controllers?.onDependencySkipped) {
3948
+ const context = {
3949
+ iteration: this.iteration,
3950
+ gadgetName: call.gadgetName,
3951
+ invocationId: call.invocationId,
3952
+ parameters: call.parameters ?? {},
3953
+ failedDependency: failedDep,
3954
+ failedDependencyError: depError,
3955
+ logger: this.logger
3956
+ };
3957
+ action = await this.hooks.controllers.onDependencySkipped(context);
3958
+ }
3959
+ if (action.action === "skip") {
3960
+ this.failedInvocations.add(call.invocationId);
3961
+ const skipEvent = {
3962
+ type: "gadget_skipped",
3963
+ gadgetName: call.gadgetName,
3964
+ invocationId: call.invocationId,
3965
+ parameters: call.parameters ?? {},
3966
+ failedDependency: failedDep,
3967
+ failedDependencyError: depError
3968
+ };
3969
+ events.push(skipEvent);
3970
+ if (this.hooks.observers?.onGadgetSkipped) {
3971
+ const observeContext = {
3972
+ iteration: this.iteration,
3973
+ gadgetName: call.gadgetName,
3974
+ invocationId: call.invocationId,
3975
+ parameters: call.parameters ?? {},
3976
+ failedDependency: failedDep,
3977
+ failedDependencyError: depError,
3978
+ logger: this.logger
3979
+ };
3980
+ await this.safeObserve(() => this.hooks.observers.onGadgetSkipped(observeContext));
3981
+ }
3982
+ this.logger.info("Gadget skipped due to failed dependency", {
3983
+ gadgetName: call.gadgetName,
3984
+ invocationId: call.invocationId,
3985
+ failedDependency: failedDep
3986
+ });
3987
+ } else if (action.action === "execute_anyway") {
3988
+ this.logger.info("Executing gadget despite failed dependency (controller override)", {
3989
+ gadgetName: call.gadgetName,
3990
+ invocationId: call.invocationId,
3991
+ failedDependency: failedDep
3992
+ });
3993
+ const executeEvents = await this.executeGadgetWithHooks(call);
3994
+ events.push(...executeEvents);
3995
+ } else if (action.action === "use_fallback") {
3996
+ const fallbackResult = {
3997
+ gadgetName: call.gadgetName,
3998
+ invocationId: call.invocationId,
3999
+ parameters: call.parameters ?? {},
4000
+ result: action.fallbackResult,
4001
+ executionTimeMs: 0
4002
+ };
4003
+ this.completedResults.set(call.invocationId, fallbackResult);
4004
+ events.push({ type: "gadget_result", result: fallbackResult });
4005
+ this.logger.info("Using fallback result for gadget with failed dependency", {
4006
+ gadgetName: call.gadgetName,
4007
+ invocationId: call.invocationId,
4008
+ failedDependency: failedDep
4009
+ });
4010
+ }
4011
+ return events;
4012
+ }
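A minimal controller sketch covering the three actions handled above; only the return-value shapes are taken from this code, and how the controller is registered on the hooks object is an assumption.

```typescript
// Sketch: possible onDependencySkipped controller. Context fields match the object built above.
const onDependencySkipped = async (ctx) => {
  if (ctx.gadgetName === "merge_data") {
    // Substitute a result instead of skipping the dependent gadget.
    return { action: "use_fallback", fallbackResult: "[]" };
  }
  if (ctx.failedDependencyError.includes("timeout")) {
    return { action: "execute_anyway" }; // run despite the failed dependency
  }
  return { action: "skip" }; // default behaviour
};
```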
4013
+ /**
4014
+ * Process pending gadgets whose dependencies are now satisfied.
4015
+ * Executes ready gadgets in parallel and continues until no more can be triggered.
4016
+ */
4017
+ async processPendingGadgets() {
4018
+ const events = [];
4019
+ let progress = true;
4020
+ while (progress && this.pendingGadgets.size > 0) {
4021
+ progress = false;
4022
+ const readyToExecute = [];
4023
+ const readyToSkip = [];
4024
+ for (const [invocationId, call] of this.pendingGadgets) {
4025
+ const failedDep = call.dependencies.find((dep) => this.failedInvocations.has(dep));
4026
+ if (failedDep) {
4027
+ readyToSkip.push({ call, failedDep });
4028
+ continue;
4029
+ }
4030
+ const allSatisfied = call.dependencies.every((dep) => this.completedResults.has(dep));
4031
+ if (allSatisfied) {
4032
+ readyToExecute.push(call);
4033
+ }
4034
+ }
4035
+ for (const { call, failedDep } of readyToSkip) {
4036
+ this.pendingGadgets.delete(call.invocationId);
4037
+ const skipEvents = await this.handleFailedDependency(call, failedDep);
4038
+ events.push(...skipEvents);
4039
+ progress = true;
4040
+ }
4041
+ if (readyToExecute.length > 0) {
4042
+ this.logger.debug("Executing ready gadgets in parallel", {
4043
+ count: readyToExecute.length,
4044
+ invocationIds: readyToExecute.map((c) => c.invocationId)
4045
+ });
4046
+ for (const call of readyToExecute) {
4047
+ this.pendingGadgets.delete(call.invocationId);
4048
+ }
4049
+ const executePromises = readyToExecute.map((call) => this.executeGadgetWithHooks(call));
4050
+ const results = await Promise.all(executePromises);
4051
+ for (const executeEvents of results) {
4052
+ events.push(...executeEvents);
4053
+ }
4054
+ progress = true;
4055
+ }
4056
+ }
4057
+ if (this.pendingGadgets.size > 0) {
4058
+ const pendingIds = new Set(this.pendingGadgets.keys());
4059
+ for (const [invocationId, call] of this.pendingGadgets) {
4060
+ const missingDeps = call.dependencies.filter((dep) => !this.completedResults.has(dep));
4061
+ const circularDeps = missingDeps.filter((dep) => pendingIds.has(dep));
4062
+ const trulyMissingDeps = missingDeps.filter((dep) => !pendingIds.has(dep));
4063
+ let errorMessage;
4064
+ let logLevel = "warn";
4065
+ if (circularDeps.length > 0 && trulyMissingDeps.length > 0) {
4066
+ errorMessage = `Dependencies unresolvable: circular=[${circularDeps.join(", ")}], missing=[${trulyMissingDeps.join(", ")}]`;
4067
+ logLevel = "error";
4068
+ } else if (circularDeps.length > 0) {
4069
+ errorMessage = `Circular dependency detected: "${invocationId}" depends on "${circularDeps[0]}" which also depends on "${invocationId}" (directly or indirectly)`;
4070
+ } else {
4071
+ errorMessage = `Dependency "${missingDeps[0]}" was never executed - check that the invocation ID exists and is spelled correctly`;
4072
+ }
4073
+ this.logger[logLevel]("Gadget has unresolvable dependencies", {
4074
+ gadgetName: call.gadgetName,
4075
+ invocationId,
4076
+ circularDependencies: circularDeps,
4077
+ missingDependencies: trulyMissingDeps
4078
+ });
4079
+ this.failedInvocations.add(invocationId);
4080
+ const skipEvent = {
4081
+ type: "gadget_skipped",
4082
+ gadgetName: call.gadgetName,
4083
+ invocationId,
4084
+ parameters: call.parameters ?? {},
4085
+ failedDependency: missingDeps[0],
4086
+ failedDependencyError: errorMessage
4087
+ };
4088
+ events.push(skipEvent);
4089
+ }
4090
+ this.pendingGadgets.clear();
4091
+ }
4092
+ return events;
4093
+ }
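Downstream, the skip path surfaces as gadget_skipped events alongside gadget_call and gadget_result. A sketch of consuming them; the event shapes are taken from the objects pushed above, while `agent.run()` yielding them is an assumption.

```typescript
// Sketch: reacting to dependency-skip events in the agent event stream.
for await (const event of agent.run()) {
  if (event.type === "gadget_result") {
    console.log(`${event.result.invocationId} finished`, event.result.error ?? "ok");
  } else if (event.type === "gadget_skipped") {
    console.warn(
      `${event.invocationId} skipped: dependency ${event.failedDependency} failed - ${event.failedDependencyError}`
    );
  }
}
```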
3563
4094
  /**
3564
4095
  * Safely execute an observer, catching and logging any errors.
3565
4096
  * Observers are non-critical, so errors are logged but don't crash the system.
@@ -3997,9 +4528,9 @@ var init_agent = __esm({
3997
4528
  if (msg.role === "user") {
3998
4529
  this.conversation.addUserMessage(msg.content);
3999
4530
  } else if (msg.role === "assistant") {
4000
- this.conversation.addAssistantMessage(msg.content);
4531
+ this.conversation.addAssistantMessage(extractText(msg.content));
4001
4532
  } else if (msg.role === "system") {
4002
- this.conversation.addUserMessage(`[System] ${msg.content}`);
4533
+ this.conversation.addUserMessage(`[System] ${extractText(msg.content)}`);
4003
4534
  }
4004
4535
  }
4005
4536
  }
@@ -4219,6 +4750,7 @@ var init_builder = __esm({
4219
4750
  "src/agent/builder.ts"() {
4220
4751
  "use strict";
4221
4752
  init_constants();
4753
+ init_input_content();
4222
4754
  init_model_shortcuts();
4223
4755
  init_registry();
4224
4756
  init_agent();
@@ -4866,13 +5398,17 @@ ${endPrefix}`
4866
5398
  * }
4867
5399
  * ```
4868
5400
  */
4869
- ask(userPrompt) {
5401
+ /**
5402
+ * Build AgentOptions with the given user prompt.
5403
+ * Centralizes options construction for ask(), askWithImage(), and askWithContent().
5404
+ */
5405
+ buildAgentOptions(userPrompt) {
4870
5406
  if (!this.client) {
4871
5407
  const { LLMist: LLMistClass } = (init_client(), __toCommonJS(client_exports));
4872
5408
  this.client = new LLMistClass();
4873
5409
  }
4874
5410
  const registry = GadgetRegistry.from(this.gadgets);
4875
- const options = {
5411
+ return {
4876
5412
  client: this.client,
4877
5413
  model: this.model ?? "openai:gpt-5-nano",
4878
5414
  systemPrompt: this.systemPrompt,
@@ -4898,6 +5434,83 @@ ${endPrefix}`
4898
5434
  compactionConfig: this.compactionConfig,
4899
5435
  signal: this.signal
4900
5436
  };
5437
+ }
5438
+ ask(userPrompt) {
5439
+ const options = this.buildAgentOptions(userPrompt);
5440
+ return new Agent(AGENT_INTERNAL_KEY, options);
5441
+ }
5442
+ /**
5443
+ * Build and create the agent with a multimodal user prompt (text + image).
5444
+ * Returns the Agent instance ready to run.
5445
+ *
5446
+ * @param textPrompt - Text prompt describing what to do with the image
5447
+ * @param imageData - Image data (Buffer, Uint8Array, or base64 string)
5448
+ * @param mimeType - Optional MIME type (auto-detected if not provided)
5449
+ * @returns Configured Agent instance
5450
+ *
5451
+ * @example
5452
+ * ```typescript
5453
+ * const agent = LLMist.createAgent()
5454
+ * .withModel("gpt-4o")
5455
+ * .withSystem("You analyze images")
5456
+ * .askWithImage(
5457
+ * "What's in this image?",
5458
+ * await fs.readFile("photo.jpg")
5459
+ * );
5460
+ *
5461
+ * for await (const event of agent.run()) {
5462
+ * // handle events
5463
+ * }
5464
+ * ```
5465
+ */
5466
+ askWithImage(textPrompt, imageData, mimeType) {
5467
+ const imageBuffer = typeof imageData === "string" ? Buffer.from(imageData, "base64") : imageData;
5468
+ const detectedMime = mimeType ?? detectImageMimeType(imageBuffer);
5469
+ if (!detectedMime) {
5470
+ throw new Error(
5471
+ "Could not detect image MIME type. Please provide the mimeType parameter explicitly."
5472
+ );
5473
+ }
5474
+ const userContent = [
5475
+ text(textPrompt),
5476
+ {
5477
+ type: "image",
5478
+ source: {
5479
+ type: "base64",
5480
+ mediaType: detectedMime,
5481
+ data: toBase64(imageBuffer)
5482
+ }
5483
+ }
5484
+ ];
5485
+ const options = this.buildAgentOptions(userContent);
5486
+ return new Agent(AGENT_INTERNAL_KEY, options);
5487
+ }
5488
+ /**
5489
+ * Build and return an Agent configured with multimodal content.
5490
+ * More flexible than askWithImage - accepts any combination of content parts.
5491
+ *
5492
+ * @param content - Array of content parts (text, images, audio)
5493
+ * @returns A configured Agent ready for execution
5494
+ *
5495
+ * @example
5496
+ * ```typescript
5497
+ * import { text, imageFromBuffer, audioFromBuffer } from "llmist";
5498
+ *
5499
+ * const agent = LLMist.createAgent()
5500
+ * .withModel("gemini:gemini-2.5-flash")
5501
+ * .askWithContent([
5502
+ * text("Describe this image and transcribe the audio:"),
5503
+ * imageFromBuffer(imageData),
5504
+ * audioFromBuffer(audioData),
5505
+ * ]);
5506
+ *
5507
+ * for await (const event of agent.run()) {
5508
+ * // handle events
5509
+ * }
5510
+ * ```
5511
+ */
5512
+ askWithContent(content) {
5513
+ const options = this.buildAgentOptions(content);
4901
5514
  return new Agent(AGENT_INTERNAL_KEY, options);
4902
5515
  }
4903
5516
  /**
@@ -5373,6 +5986,7 @@ var init_anthropic = __esm({
5373
5986
  "src/providers/anthropic.ts"() {
5374
5987
  "use strict";
5375
5988
  import_sdk = __toESM(require("@anthropic-ai/sdk"), 1);
5989
+ init_messages();
5376
5990
  init_anthropic_models();
5377
5991
  init_base_provider();
5378
5992
  init_constants2();
@@ -5385,11 +5999,33 @@ var init_anthropic = __esm({
5385
5999
  getModelSpecs() {
5386
6000
  return ANTHROPIC_MODELS;
5387
6001
  }
6002
+ // =========================================================================
6003
+ // Image Generation (Not Supported)
6004
+ // =========================================================================
6005
+ supportsImageGeneration(_modelId) {
6006
+ return false;
6007
+ }
6008
+ async generateImage() {
6009
+ throw new Error(
6010
+ "Anthropic does not support image generation. Use OpenAI (DALL-E, GPT Image) or Google Gemini (Imagen) instead."
6011
+ );
6012
+ }
6013
+ // =========================================================================
6014
+ // Speech Generation (Not Supported)
6015
+ // =========================================================================
6016
+ supportsSpeechGeneration(_modelId) {
6017
+ return false;
6018
+ }
6019
+ async generateSpeech() {
6020
+ throw new Error(
6021
+ "Anthropic does not support speech generation. Use OpenAI (TTS) or Google Gemini (TTS) instead."
6022
+ );
6023
+ }
5388
6024
  buildRequestPayload(options, descriptor, spec, messages) {
5389
6025
  const systemMessages = messages.filter((message) => message.role === "system");
5390
6026
  const system = systemMessages.length > 0 ? systemMessages.map((m, index) => ({
5391
6027
  type: "text",
5392
- text: m.content,
6028
+ text: extractText(m.content),
5393
6029
  // Add cache_control to the LAST system message block
5394
6030
  ...index === systemMessages.length - 1 ? { cache_control: { type: "ephemeral" } } : {}
5395
6031
  })) : void 0;
@@ -5402,14 +6038,10 @@ var init_anthropic = __esm({
5402
6038
  );
5403
6039
  const conversation = nonSystemMessages.map((message, index) => ({
5404
6040
  role: message.role,
5405
- content: [
5406
- {
5407
- type: "text",
5408
- text: message.content,
5409
- // Add cache_control to the LAST user message
5410
- ...message.role === "user" && index === lastUserIndex ? { cache_control: { type: "ephemeral" } } : {}
5411
- }
5412
- ]
6041
+ content: this.convertToAnthropicContent(
6042
+ message.content,
6043
+ message.role === "user" && index === lastUserIndex
6044
+ )
5413
6045
  }));
5414
6046
  const defaultMaxTokens = spec?.maxOutputTokens ?? ANTHROPIC_DEFAULT_MAX_OUTPUT_TOKENS;
5415
6047
  const payload = {
@@ -5425,6 +6057,52 @@ var init_anthropic = __esm({
5425
6057
  };
5426
6058
  return payload;
5427
6059
  }
6060
+ /**
6061
+ * Convert llmist content to Anthropic's content block format.
6062
+ * Handles text, images (base64 only), and applies cache_control.
6063
+ */
6064
+ convertToAnthropicContent(content, addCacheControl) {
6065
+ const parts = normalizeContent(content);
6066
+ return parts.map((part, index) => {
6067
+ const isLastPart = index === parts.length - 1;
6068
+ const cacheControl = addCacheControl && isLastPart ? { cache_control: { type: "ephemeral" } } : {};
6069
+ if (part.type === "text") {
6070
+ return {
6071
+ type: "text",
6072
+ text: part.text,
6073
+ ...cacheControl
6074
+ };
6075
+ }
6076
+ if (part.type === "image") {
6077
+ return this.convertImagePart(part, cacheControl);
6078
+ }
6079
+ if (part.type === "audio") {
6080
+ throw new Error(
6081
+ "Anthropic does not support audio input. Use Google Gemini for audio processing."
6082
+ );
6083
+ }
6084
+ throw new Error(`Unsupported content type: ${part.type}`);
6085
+ });
6086
+ }
6087
+ /**
6088
+ * Convert an image content part to Anthropic's image block format.
6089
+ */
6090
+ convertImagePart(part, cacheControl) {
6091
+ if (part.source.type === "url") {
6092
+ throw new Error(
6093
+ "Anthropic does not support image URLs. Please provide base64-encoded image data instead."
6094
+ );
6095
+ }
6096
+ return {
6097
+ type: "image",
6098
+ source: {
6099
+ type: "base64",
6100
+ media_type: part.source.mediaType,
6101
+ data: part.source.data
6102
+ },
6103
+ ...cacheControl
6104
+ };
6105
+ }
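
For orientation, a user turn consisting of text plus a base64 image comes out of the two helpers above roughly as the following block array; cache_control is attached only to the last block of the last user message (values below are illustrative and the base64 payload is abbreviated):

```typescript
// Rough shape produced by convertToAnthropicContent(content, true) - illustrative values.
const anthropicContent = [
  { type: "text", text: "What's in this image?" },
  {
    type: "image",
    source: { type: "base64", media_type: "image/png", data: "iVBORw0KGgo..." },
    cache_control: { type: "ephemeral" },
  },
];
```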
5428
6106
  async executeStreamRequest(payload, signal) {
5429
6107
  const client = this.client;
5430
6108
  const stream2 = await client.messages.create(payload, signal ? { signal } : void 0);
@@ -5507,17 +6185,12 @@ var init_anthropic = __esm({
5507
6185
  async countTokens(messages, descriptor, _spec) {
5508
6186
  const client = this.client;
5509
6187
  const systemMessages = messages.filter((message) => message.role === "system");
5510
- const system = systemMessages.length > 0 ? systemMessages.map((m) => m.content).join("\n\n") : void 0;
6188
+ const system = systemMessages.length > 0 ? systemMessages.map((m) => extractText(m.content)).join("\n\n") : void 0;
5511
6189
  const conversation = messages.filter(
5512
6190
  (message) => message.role !== "system"
5513
6191
  ).map((message) => ({
5514
6192
  role: message.role,
5515
- content: [
5516
- {
5517
- type: "text",
5518
- text: message.content
5519
- }
5520
- ]
6193
+ content: this.convertToAnthropicContent(message.content, false)
5521
6194
  }));
5522
6195
  try {
5523
6196
  const response = await client.messages.countTokens({
@@ -5531,14 +6204,201 @@ var init_anthropic = __esm({
5531
6204
  `Token counting failed for ${descriptor.name}, using fallback estimation:`,
5532
6205
  error
5533
6206
  );
5534
- const totalChars = messages.reduce((sum, msg) => sum + (msg.content?.length ?? 0), 0);
5535
- return Math.ceil(totalChars / FALLBACK_CHARS_PER_TOKEN);
6207
+ let totalChars = 0;
6208
+ let imageCount = 0;
6209
+ for (const msg of messages) {
6210
+ const parts = normalizeContent(msg.content);
6211
+ for (const part of parts) {
6212
+ if (part.type === "text") {
6213
+ totalChars += part.text.length;
6214
+ } else if (part.type === "image") {
6215
+ imageCount++;
6216
+ }
6217
+ }
6218
+ }
6219
+ return Math.ceil(totalChars / FALLBACK_CHARS_PER_TOKEN) + imageCount * 1e3;
5536
6220
  }
5537
6221
  }
5538
6222
  };
5539
6223
  }
5540
6224
  });
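
The catch-block fallback above estimates tokens from character counts and adds a flat 1,000-token allowance per image. A worked example, assuming FALLBACK_CHARS_PER_TOKEN is 4 (the constant is defined elsewhere in the bundle and its value is not shown in this diff):

```typescript
// Hypothetical fallback estimate for 8,000 text characters and 2 images,
// assuming FALLBACK_CHARS_PER_TOKEN = 4 (assumption, not shown in this diff).
const totalChars = 8000;
const imageCount = 2;
const estimate = Math.ceil(totalChars / 4) + imageCount * 1000; // 2000 + 2000 = 4000 tokens
```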
5541
6225
 
6226
+ // src/providers/gemini-image-models.ts
6227
+ function getGeminiImageModelSpec(modelId) {
6228
+ return geminiImageModels.find((m) => m.modelId === modelId);
6229
+ }
6230
+ function isGeminiImageModel(modelId) {
6231
+ return geminiImageModels.some((m) => m.modelId === modelId);
6232
+ }
6233
+ function calculateGeminiImageCost(modelId, size = "1:1", n = 1) {
6234
+ const spec = getGeminiImageModelSpec(modelId);
6235
+ if (!spec) return void 0;
6236
+ if (spec.pricing.perImage !== void 0) {
6237
+ return spec.pricing.perImage * n;
6238
+ }
6239
+ if (spec.pricing.bySize) {
6240
+ const sizePrice = spec.pricing.bySize[size];
6241
+ if (typeof sizePrice === "number") {
6242
+ return sizePrice * n;
6243
+ }
6244
+ }
6245
+ return void 0;
6246
+ }
6247
+ var IMAGEN4_ASPECT_RATIOS, GEMINI_IMAGE_ASPECT_RATIOS, geminiImageModels;
6248
+ var init_gemini_image_models = __esm({
6249
+ "src/providers/gemini-image-models.ts"() {
6250
+ "use strict";
6251
+ IMAGEN4_ASPECT_RATIOS = ["1:1", "3:4", "4:3", "9:16", "16:9"];
6252
+ GEMINI_IMAGE_ASPECT_RATIOS = ["1:1", "3:4", "4:3", "9:16", "16:9"];
6253
+ geminiImageModels = [
6254
+ // Imagen 4 Family (standalone image generation)
6255
+ {
6256
+ provider: "gemini",
6257
+ modelId: "imagen-4.0-fast-generate-001",
6258
+ displayName: "Imagen 4 Fast",
6259
+ pricing: {
6260
+ perImage: 0.02
6261
+ },
6262
+ supportedSizes: [...IMAGEN4_ASPECT_RATIOS],
6263
+ maxImages: 4,
6264
+ defaultSize: "1:1",
6265
+ features: {
6266
+ textRendering: true
6267
+ }
6268
+ },
6269
+ {
6270
+ provider: "gemini",
6271
+ modelId: "imagen-4.0-generate-001",
6272
+ displayName: "Imagen 4",
6273
+ pricing: {
6274
+ perImage: 0.04
6275
+ },
6276
+ supportedSizes: [...IMAGEN4_ASPECT_RATIOS],
6277
+ maxImages: 4,
6278
+ defaultSize: "1:1",
6279
+ features: {
6280
+ textRendering: true
6281
+ }
6282
+ },
6283
+ {
6284
+ provider: "gemini",
6285
+ modelId: "imagen-4.0-ultra-generate-001",
6286
+ displayName: "Imagen 4 Ultra",
6287
+ pricing: {
6288
+ perImage: 0.06
6289
+ },
6290
+ supportedSizes: [...IMAGEN4_ASPECT_RATIOS],
6291
+ maxImages: 4,
6292
+ defaultSize: "1:1",
6293
+ features: {
6294
+ textRendering: true
6295
+ }
6296
+ },
6297
+ // Preview versions
6298
+ {
6299
+ provider: "gemini",
6300
+ modelId: "imagen-4.0-generate-preview-06-06",
6301
+ displayName: "Imagen 4 (Preview)",
6302
+ pricing: {
6303
+ perImage: 0.04
6304
+ },
6305
+ supportedSizes: [...IMAGEN4_ASPECT_RATIOS],
6306
+ maxImages: 4,
6307
+ defaultSize: "1:1",
6308
+ features: {
6309
+ textRendering: true
6310
+ }
6311
+ },
6312
+ {
6313
+ provider: "gemini",
6314
+ modelId: "imagen-4.0-ultra-generate-preview-06-06",
6315
+ displayName: "Imagen 4 Ultra (Preview)",
6316
+ pricing: {
6317
+ perImage: 0.06
6318
+ },
6319
+ supportedSizes: [...IMAGEN4_ASPECT_RATIOS],
6320
+ maxImages: 4,
6321
+ defaultSize: "1:1",
6322
+ features: {
6323
+ textRendering: true
6324
+ }
6325
+ },
6326
+ // Gemini Native Image Generation (multimodal models)
6327
+ {
6328
+ provider: "gemini",
6329
+ modelId: "gemini-2.5-flash-image",
6330
+ displayName: "Gemini 2.5 Flash Image",
6331
+ pricing: {
6332
+ perImage: 0.039
6333
+ },
6334
+ supportedSizes: [...GEMINI_IMAGE_ASPECT_RATIOS],
6335
+ maxImages: 1,
6336
+ defaultSize: "1:1",
6337
+ features: {
6338
+ conversational: true,
6339
+ textRendering: true
6340
+ }
6341
+ },
6342
+ {
6343
+ provider: "gemini",
6344
+ modelId: "gemini-2.5-flash-image-preview",
6345
+ displayName: "Gemini 2.5 Flash Image (Preview)",
6346
+ pricing: {
6347
+ perImage: 0.039
6348
+ },
6349
+ supportedSizes: [...GEMINI_IMAGE_ASPECT_RATIOS],
6350
+ maxImages: 1,
6351
+ defaultSize: "1:1",
6352
+ features: {
6353
+ conversational: true,
6354
+ textRendering: true
6355
+ }
6356
+ },
6357
+ {
6358
+ provider: "gemini",
6359
+ modelId: "gemini-3-pro-image-preview",
6360
+ displayName: "Gemini 3 Pro Image (Preview)",
6361
+ pricing: {
6362
+ // Token-based: ~$0.134 per 1K/2K image, $0.24 per 4K
6363
+ // Using 2K as default
6364
+ bySize: {
6365
+ "1K": 0.134,
6366
+ "2K": 0.134,
6367
+ "4K": 0.24
6368
+ }
6369
+ },
6370
+ supportedSizes: ["1K", "2K", "4K"],
6371
+ maxImages: 1,
6372
+ defaultSize: "2K",
6373
+ features: {
6374
+ conversational: true,
6375
+ textRendering: true
6376
+ }
6377
+ },
6378
+ // Alias: nano-banana-pro-preview is gemini-3-pro-image-preview
6379
+ {
6380
+ provider: "gemini",
6381
+ modelId: "nano-banana-pro-preview",
6382
+ displayName: "Nano Banana Pro (Gemini 3 Pro Image)",
6383
+ pricing: {
6384
+ bySize: {
6385
+ "1K": 0.134,
6386
+ "2K": 0.134,
6387
+ "4K": 0.24
6388
+ }
6389
+ },
6390
+ supportedSizes: ["1K", "2K", "4K"],
6391
+ maxImages: 1,
6392
+ defaultSize: "2K",
6393
+ features: {
6394
+ conversational: true,
6395
+ textRendering: true
6396
+ }
6397
+ }
6398
+ ];
6399
+ }
6400
+ });
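
Worked examples against the pricing table above: per-image models ignore the size argument, while bySize models look the size up directly and default n to 1.

```typescript
// Illustrative calls; results follow directly from the specs listed above.
calculateGeminiImageCost("imagen-4.0-generate-001", "1:1", 3);     // 0.04 * 3 = 0.12
calculateGeminiImageCost("imagen-4.0-fast-generate-001", "16:9");  // 0.02 - perImage pricing, size ignored
calculateGeminiImageCost("gemini-3-pro-image-preview", "4K");      // 0.24 - bySize lookup, n defaults to 1
calculateGeminiImageCost("unknown-model", "1:1");                  // undefined - no spec found
```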
6401
+
5542
6402
  // src/providers/gemini-models.ts
5543
6403
  var GEMINI_MODELS;
5544
6404
  var init_gemini_models = __esm({
@@ -5692,20 +6552,159 @@ var init_gemini_models = __esm({
5692
6552
  contextWindow: 1048576,
5693
6553
  maxOutputTokens: 8192,
5694
6554
  pricing: {
5695
- input: 0.075,
5696
- output: 0.3
5697
- // No context caching available for 2.0-flash-lite
6555
+ input: 0.075,
6556
+ output: 0.3
6557
+ // No context caching available for 2.0-flash-lite
6558
+ },
6559
+ knowledgeCutoff: "2024-08",
6560
+ features: {
6561
+ streaming: true,
6562
+ functionCalling: true,
6563
+ vision: true,
6564
+ structuredOutputs: true
6565
+ },
6566
+ metadata: {
6567
+ family: "Gemini 2.0",
6568
+ notes: "Smallest and most cost effective 2.0 model for at scale usage."
6569
+ }
6570
+ }
6571
+ ];
6572
+ }
6573
+ });
6574
+
6575
+ // src/providers/gemini-speech-models.ts
6576
+ function getGeminiSpeechModelSpec(modelId) {
6577
+ return geminiSpeechModels.find((m) => m.modelId === modelId);
6578
+ }
6579
+ function isGeminiSpeechModel(modelId) {
6580
+ return geminiSpeechModels.some((m) => m.modelId === modelId);
6581
+ }
6582
+ function calculateGeminiSpeechCost(modelId, characterCount, estimatedMinutes) {
6583
+ const spec = getGeminiSpeechModelSpec(modelId);
6584
+ if (!spec) return void 0;
6585
+ if (spec.pricing.perMinute !== void 0) {
6586
+ if (estimatedMinutes !== void 0) {
6587
+ return estimatedMinutes * spec.pricing.perMinute;
6588
+ }
6589
+ const approxMinutes = characterCount / 750;
6590
+ return approxMinutes * spec.pricing.perMinute;
6591
+ }
6592
+ return void 0;
6593
+ }
6594
+ var GEMINI_TTS_VOICES, GEMINI_TTS_FORMATS, geminiSpeechModels;
6595
+ var init_gemini_speech_models = __esm({
6596
+ "src/providers/gemini-speech-models.ts"() {
6597
+ "use strict";
6598
+ GEMINI_TTS_VOICES = [
6599
+ "Zephyr",
6600
+ // Bright
6601
+ "Puck",
6602
+ // Upbeat
6603
+ "Charon",
6604
+ // Informative
6605
+ "Kore",
6606
+ // Firm
6607
+ "Fenrir",
6608
+ // Excitable
6609
+ "Leda",
6610
+ // Youthful
6611
+ "Orus",
6612
+ // Firm
6613
+ "Aoede",
6614
+ // Breezy
6615
+ "Callirrhoe",
6616
+ // Easy-going
6617
+ "Autonoe",
6618
+ // Bright
6619
+ "Enceladus",
6620
+ // Breathy
6621
+ "Iapetus",
6622
+ // Clear
6623
+ "Umbriel",
6624
+ // Easy-going
6625
+ "Algieba",
6626
+ // Smooth
6627
+ "Despina",
6628
+ // Smooth
6629
+ "Erinome",
6630
+ // Clear
6631
+ "Algenib",
6632
+ // Gravelly
6633
+ "Rasalgethi",
6634
+ // Informative
6635
+ "Laomedeia",
6636
+ // Upbeat
6637
+ "Achernar",
6638
+ // Soft
6639
+ "Alnilam",
6640
+ // Firm
6641
+ "Schedar",
6642
+ // Even
6643
+ "Gacrux",
6644
+ // Mature
6645
+ "Pulcherrima",
6646
+ // Forward
6647
+ "Achird",
6648
+ // Friendly
6649
+ "Zubenelgenubi",
6650
+ // Casual
6651
+ "Vindemiatrix",
6652
+ // Gentle
6653
+ "Sadachbia",
6654
+ // Lively
6655
+ "Sadaltager",
6656
+ // Knowledgeable
6657
+ "Sulafat"
6658
+ // Warm
6659
+ ];
6660
+ GEMINI_TTS_FORMATS = ["pcm", "wav"];
6661
+ geminiSpeechModels = [
6662
+ {
6663
+ provider: "gemini",
6664
+ modelId: "gemini-2.5-flash-preview-tts",
6665
+ displayName: "Gemini 2.5 Flash TTS (Preview)",
6666
+ pricing: {
6667
+ // $0.50 per 1M input tokens = $0.0000005 per token
6668
+ perInputToken: 5e-7,
6669
+ // $10.00 per 1M audio output tokens = $0.00001 per token
6670
+ perAudioOutputToken: 1e-5,
6671
+ // Rough estimate: ~$0.01 per minute of audio
6672
+ perMinute: 0.01
6673
+ },
6674
+ voices: [...GEMINI_TTS_VOICES],
6675
+ formats: GEMINI_TTS_FORMATS,
6676
+ maxInputLength: 8e3,
6677
+ // bytes (text + prompt combined)
6678
+ defaultVoice: "Zephyr",
6679
+ defaultFormat: "wav",
6680
+ features: {
6681
+ multiSpeaker: true,
6682
+ languages: 24,
6683
+ voiceInstructions: true
6684
+ }
6685
+ },
6686
+ {
6687
+ provider: "gemini",
6688
+ modelId: "gemini-2.5-pro-preview-tts",
6689
+ displayName: "Gemini 2.5 Pro TTS (Preview)",
6690
+ pricing: {
6691
+ // $1.00 per 1M input tokens = $0.000001 per token
6692
+ perInputToken: 1e-6,
6693
+ // $20.00 per 1M audio output tokens = $0.00002 per token
6694
+ perAudioOutputToken: 2e-5,
6695
+ // Rough estimate: ~$0.02 per minute of audio
6696
+ perMinute: 0.02
5698
6697
  },
5699
- knowledgeCutoff: "2024-08",
6698
+ voices: [...GEMINI_TTS_VOICES],
6699
+ formats: GEMINI_TTS_FORMATS,
6700
+ maxInputLength: 8e3,
6701
+ // bytes
6702
+ defaultVoice: "Zephyr",
6703
+ defaultFormat: "wav",
5700
6704
  features: {
5701
- streaming: true,
5702
- functionCalling: true,
5703
- vision: true,
5704
- structuredOutputs: true
5705
- },
5706
- metadata: {
5707
- family: "Gemini 2.0",
5708
- notes: "Smallest and most cost effective 2.0 model for at scale usage."
6705
+ multiSpeaker: true,
6706
+ languages: 24,
6707
+ voiceInstructions: true
5709
6708
  }
5710
6709
  }
5711
6710
  ];
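
Worked examples for calculateGeminiSpeechCost above: when no explicit duration is passed, minutes are approximated as characterCount / 750.

```typescript
// Illustrative calls against the per-minute prices listed above.
calculateGeminiSpeechCost("gemini-2.5-flash-preview-tts", 1500);   // (1500 / 750) * 0.01 = 0.02
calculateGeminiSpeechCost("gemini-2.5-pro-preview-tts", 1500, 3);  // 3 * 0.02 = 0.06 (explicit minutes win)
```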
@@ -5713,6 +6712,31 @@ var init_gemini_models = __esm({
5713
6712
  });
5714
6713
 
5715
6714
  // src/providers/gemini.ts
6715
+ function wrapPcmInWav(pcmData, sampleRate, bitsPerSample, numChannels) {
6716
+ const byteRate = sampleRate * numChannels * bitsPerSample / 8;
6717
+ const blockAlign = numChannels * bitsPerSample / 8;
6718
+ const dataSize = pcmData.length;
6719
+ const headerSize = 44;
6720
+ const fileSize = headerSize + dataSize - 8;
6721
+ const buffer = new ArrayBuffer(headerSize + dataSize);
6722
+ const view = new DataView(buffer);
6723
+ const uint8 = new Uint8Array(buffer);
6724
+ view.setUint32(0, 1380533830, false);
6725
+ view.setUint32(4, fileSize, true);
6726
+ view.setUint32(8, 1463899717, false);
6727
+ view.setUint32(12, 1718449184, false);
6728
+ view.setUint32(16, 16, true);
6729
+ view.setUint16(20, 1, true);
6730
+ view.setUint16(22, numChannels, true);
6731
+ view.setUint32(24, sampleRate, true);
6732
+ view.setUint32(28, byteRate, true);
6733
+ view.setUint16(32, blockAlign, true);
6734
+ view.setUint16(34, bitsPerSample, true);
6735
+ view.setUint32(36, 1684108385, false);
6736
+ view.setUint32(40, dataSize, true);
6737
+ uint8.set(pcmData, headerSize);
6738
+ return buffer;
6739
+ }
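
The four big-endian uint32 literals in wrapPcmInWav are simply the ASCII chunk tags of a RIFF/WAVE header: "RIFF" = 0x52494646 = 1380533830, "WAVE" = 0x57415645 = 1463899717, "fmt " = 0x666d7420 = 1718449184, "data" = 0x64617461 = 1684108385. Further down, Gemini's raw TTS output is wrapped with this header as 24 kHz, 16-bit, mono PCM. A tiny sketch that reproduces the constants:

```typescript
// Pack a 4-character ASCII tag into the big-endian uint32 used above.
const tag = (s: string) =>
  (s.charCodeAt(0) << 24) | (s.charCodeAt(1) << 16) | (s.charCodeAt(2) << 8) | s.charCodeAt(3);
tag("RIFF"); // 1380533830
tag("WAVE"); // 1463899717
tag("fmt "); // 1718449184
tag("data"); // 1684108385
```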
5716
6740
  function createGeminiProviderFromEnv() {
5717
6741
  return createProviderFromEnv("GEMINI_API_KEY", import_genai.GoogleGenAI, GeminiGenerativeProvider);
5718
6742
  }
@@ -5721,9 +6745,12 @@ var init_gemini = __esm({
5721
6745
  "src/providers/gemini.ts"() {
5722
6746
  "use strict";
5723
6747
  import_genai = require("@google/genai");
6748
+ init_messages();
5724
6749
  init_base_provider();
5725
6750
  init_constants2();
6751
+ init_gemini_image_models();
5726
6752
  init_gemini_models();
6753
+ init_gemini_speech_models();
5727
6754
  init_utils();
5728
6755
  GEMINI_ROLE_MAP = {
5729
6756
  system: "user",
@@ -5738,6 +6765,139 @@ var init_gemini = __esm({
5738
6765
  getModelSpecs() {
5739
6766
  return GEMINI_MODELS;
5740
6767
  }
6768
+ // =========================================================================
6769
+ // Image Generation
6770
+ // =========================================================================
6771
+ getImageModelSpecs() {
6772
+ return geminiImageModels;
6773
+ }
6774
+ supportsImageGeneration(modelId) {
6775
+ return isGeminiImageModel(modelId);
6776
+ }
6777
+ async generateImage(options) {
6778
+ const client = this.client;
6779
+ const spec = getGeminiImageModelSpec(options.model);
6780
+ const isImagenModel = options.model.startsWith("imagen");
6781
+ const aspectRatio = options.size ?? spec?.defaultSize ?? "1:1";
6782
+ const n = options.n ?? 1;
6783
+ if (isImagenModel) {
6784
+ const response2 = await client.models.generateImages({
6785
+ model: options.model,
6786
+ prompt: options.prompt,
6787
+ config: {
6788
+ numberOfImages: n,
6789
+ aspectRatio,
6790
+ outputMimeType: options.responseFormat === "b64_json" ? "image/png" : "image/jpeg"
6791
+ }
6792
+ });
6793
+ const images2 = response2.generatedImages ?? [];
6794
+ const cost2 = calculateGeminiImageCost(options.model, aspectRatio, images2.length);
6795
+ return {
6796
+ // Gemini's imageBytes is already base64 encoded, so use it directly
6797
+ images: images2.map((img) => ({
6798
+ b64Json: img.image?.imageBytes ?? void 0
6799
+ })),
6800
+ model: options.model,
6801
+ usage: {
6802
+ imagesGenerated: images2.length,
6803
+ size: aspectRatio,
6804
+ quality: "standard"
6805
+ },
6806
+ cost: cost2
6807
+ };
6808
+ }
6809
+ const response = await client.models.generateContent({
6810
+ model: options.model,
6811
+ contents: [{ role: "user", parts: [{ text: options.prompt }] }],
6812
+ config: {
6813
+ responseModalities: [import_genai.Modality.IMAGE, import_genai.Modality.TEXT]
6814
+ }
6815
+ });
6816
+ const images = [];
6817
+ const candidate = response.candidates?.[0];
6818
+ if (candidate?.content?.parts) {
6819
+ for (const part of candidate.content.parts) {
6820
+ if ("inlineData" in part && part.inlineData) {
6821
+ images.push({
6822
+ b64Json: part.inlineData.data
6823
+ });
6824
+ }
6825
+ }
6826
+ }
6827
+ const cost = calculateGeminiImageCost(options.model, aspectRatio, images.length);
6828
+ return {
6829
+ images,
6830
+ model: options.model,
6831
+ usage: {
6832
+ imagesGenerated: images.length,
6833
+ size: aspectRatio,
6834
+ quality: "standard"
6835
+ },
6836
+ cost
6837
+ };
6838
+ }
6839
+ // =========================================================================
6840
+ // Speech Generation
6841
+ // =========================================================================
6842
+ getSpeechModelSpecs() {
6843
+ return geminiSpeechModels;
6844
+ }
6845
+ supportsSpeechGeneration(modelId) {
6846
+ return isGeminiSpeechModel(modelId);
6847
+ }
6848
+ async generateSpeech(options) {
6849
+ const client = this.client;
6850
+ const spec = getGeminiSpeechModelSpec(options.model);
6851
+ const voice = options.voice ?? spec?.defaultVoice ?? "Zephyr";
6852
+ const response = await client.models.generateContent({
6853
+ model: options.model,
6854
+ contents: [
6855
+ {
6856
+ role: "user",
6857
+ parts: [{ text: options.input }]
6858
+ }
6859
+ ],
6860
+ config: {
6861
+ responseModalities: [import_genai.Modality.AUDIO],
6862
+ speechConfig: {
6863
+ voiceConfig: {
6864
+ prebuiltVoiceConfig: {
6865
+ voiceName: voice
6866
+ }
6867
+ }
6868
+ }
6869
+ }
6870
+ });
6871
+ let pcmData;
6872
+ const candidate = response.candidates?.[0];
6873
+ if (candidate?.content?.parts) {
6874
+ for (const part of candidate.content.parts) {
6875
+ if ("inlineData" in part && part.inlineData?.data) {
6876
+ const base64 = part.inlineData.data;
6877
+ const binary = atob(base64);
6878
+ pcmData = new Uint8Array(binary.length);
6879
+ for (let i = 0; i < binary.length; i++) {
6880
+ pcmData[i] = binary.charCodeAt(i);
6881
+ }
6882
+ break;
6883
+ }
6884
+ }
6885
+ }
6886
+ if (!pcmData) {
6887
+ throw new Error("No audio data in Gemini TTS response");
6888
+ }
6889
+ const audioData = wrapPcmInWav(pcmData, 24e3, 16, 1);
6890
+ const cost = calculateGeminiSpeechCost(options.model, options.input.length);
6891
+ return {
6892
+ audio: audioData,
6893
+ model: options.model,
6894
+ usage: {
6895
+ characterCount: options.input.length
6896
+ },
6897
+ cost,
6898
+ format: spec?.defaultFormat ?? "wav"
6899
+ };
6900
+ }
5741
6901
  buildRequestPayload(options, descriptor, _spec, messages) {
5742
6902
  const contents = this.convertMessagesToContents(messages);
5743
6903
  const generationConfig = this.buildGenerationConfig(options);
@@ -5755,7 +6915,7 @@ var init_gemini = __esm({
5755
6915
  };
5756
6916
  return {
5757
6917
  model: descriptor.name,
5758
- contents: this.convertContentsForNewSDK(contents),
6918
+ contents,
5759
6919
  config
5760
6920
  };
5761
6921
  }
@@ -5790,18 +6950,25 @@ var init_gemini = __esm({
5790
6950
  if (message.role === "system") {
5791
6951
  expandedMessages.push({
5792
6952
  role: "user",
5793
- content: message.content
6953
+ content: extractText(message.content)
5794
6954
  });
5795
6955
  expandedMessages.push({
5796
6956
  role: "assistant",
5797
6957
  content: "Understood."
5798
6958
  });
5799
6959
  } else {
5800
- expandedMessages.push(message);
6960
+ expandedMessages.push({
6961
+ role: message.role,
6962
+ content: message.content
6963
+ });
5801
6964
  }
5802
6965
  }
5803
6966
  return this.mergeConsecutiveMessages(expandedMessages);
5804
6967
  }
6968
+ /**
6969
+ * Merge consecutive messages with the same role (required by Gemini).
6970
+ * Handles multimodal content by converting to Gemini's part format.
6971
+ */
5805
6972
  mergeConsecutiveMessages(messages) {
5806
6973
  if (messages.length === 0) {
5807
6974
  return [];
@@ -5810,15 +6977,16 @@ var init_gemini = __esm({
5810
6977
  let currentGroup = null;
5811
6978
  for (const message of messages) {
5812
6979
  const geminiRole = GEMINI_ROLE_MAP[message.role];
6980
+ const geminiParts = this.convertToGeminiParts(message.content);
5813
6981
  if (currentGroup && currentGroup.role === geminiRole) {
5814
- currentGroup.parts.push({ text: message.content });
6982
+ currentGroup.parts.push(...geminiParts);
5815
6983
  } else {
5816
6984
  if (currentGroup) {
5817
6985
  result.push(currentGroup);
5818
6986
  }
5819
6987
  currentGroup = {
5820
6988
  role: geminiRole,
5821
- parts: [{ text: message.content }]
6989
+ parts: geminiParts
5822
6990
  };
5823
6991
  }
5824
6992
  }
@@ -5827,11 +6995,39 @@ var init_gemini = __esm({
5827
6995
  }
5828
6996
  return result;
5829
6997
  }
5830
- convertContentsForNewSDK(contents) {
5831
- return contents.map((content) => ({
5832
- role: content.role,
5833
- parts: content.parts.map((part) => ({ text: part.text }))
5834
- }));
6998
+ /**
6999
+ * Convert llmist content to Gemini's part format.
7000
+ * Handles text, images, and audio (Gemini supports all three).
7001
+ */
7002
+ convertToGeminiParts(content) {
7003
+ const parts = normalizeContent(content);
7004
+ return parts.map((part) => {
7005
+ if (part.type === "text") {
7006
+ return { text: part.text };
7007
+ }
7008
+ if (part.type === "image") {
7009
+ if (part.source.type === "url") {
7010
+ throw new Error(
7011
+ "Gemini does not support image URLs directly. Please provide base64-encoded image data."
7012
+ );
7013
+ }
7014
+ return {
7015
+ inlineData: {
7016
+ mimeType: part.source.mediaType,
7017
+ data: part.source.data
7018
+ }
7019
+ };
7020
+ }
7021
+ if (part.type === "audio") {
7022
+ return {
7023
+ inlineData: {
7024
+ mimeType: part.source.mediaType,
7025
+ data: part.source.data
7026
+ }
7027
+ };
7028
+ }
7029
+ throw new Error(`Unsupported content type: ${part.type}`);
7030
+ });
5835
7031
  }
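
For comparison with the Anthropic mapping earlier, a text + image + audio user turn comes out of convertToGeminiParts as plain inlineData parts (illustrative values, base64 abbreviated):

```typescript
// Rough shape produced by convertToGeminiParts - illustrative values only.
const geminiParts = [
  { text: "Describe the image and transcribe the audio:" },
  { inlineData: { mimeType: "image/png", data: "iVBORw0KGgo..." } },
  { inlineData: { mimeType: "audio/mpeg", data: "SUQzBAAAAA..." } },
];
```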
5836
7032
  buildGenerationConfig(options) {
5837
7033
  const config = {};
@@ -5852,9 +7048,9 @@ var init_gemini = __esm({
5852
7048
  async *wrapStream(iterable) {
5853
7049
  const stream2 = iterable;
5854
7050
  for await (const chunk of stream2) {
5855
- const text = this.extractText(chunk);
5856
- if (text) {
5857
- yield { text, rawEvent: chunk };
7051
+ const text3 = this.extractText(chunk);
7052
+ if (text3) {
7053
+ yield { text: text3, rawEvent: chunk };
5858
7054
  }
5859
7055
  const finishReason = this.extractFinishReason(chunk);
5860
7056
  const usage = this.extractUsage(chunk);
@@ -5915,7 +7111,7 @@ var init_gemini = __esm({
5915
7111
  try {
5916
7112
  const response = await client.models.countTokens({
5917
7113
  model: descriptor.name,
5918
- contents: this.convertContentsForNewSDK(contents)
7114
+ contents
5919
7115
  // Note: systemInstruction not used - it's not supported by countTokens()
5920
7116
  // and would cause a 2100% token counting error
5921
7117
  });
@@ -5925,14 +7121,140 @@ var init_gemini = __esm({
5925
7121
  `Token counting failed for ${descriptor.name}, using fallback estimation:`,
5926
7122
  error
5927
7123
  );
5928
- const totalChars = messages.reduce((sum, msg) => sum + (msg.content?.length ?? 0), 0);
5929
- return Math.ceil(totalChars / FALLBACK_CHARS_PER_TOKEN);
7124
+ let totalChars = 0;
7125
+ let mediaCount = 0;
7126
+ for (const msg of messages) {
7127
+ const parts = normalizeContent(msg.content);
7128
+ for (const part of parts) {
7129
+ if (part.type === "text") {
7130
+ totalChars += part.text.length;
7131
+ } else if (part.type === "image" || part.type === "audio") {
7132
+ mediaCount++;
7133
+ }
7134
+ }
7135
+ }
7136
+ return Math.ceil(totalChars / FALLBACK_CHARS_PER_TOKEN) + mediaCount * 258;
5930
7137
  }
5931
7138
  }
5932
7139
  };
5933
7140
  }
5934
7141
  });
5935
7142
 
7143
+ // src/providers/openai-image-models.ts
7144
+ function getOpenAIImageModelSpec(modelId) {
7145
+ return openaiImageModels.find((m) => m.modelId === modelId);
7146
+ }
7147
+ function isOpenAIImageModel(modelId) {
7148
+ return openaiImageModels.some((m) => m.modelId === modelId);
7149
+ }
7150
+ function calculateOpenAIImageCost(modelId, size, quality = "standard", n = 1) {
7151
+ const spec = getOpenAIImageModelSpec(modelId);
7152
+ if (!spec) return void 0;
7153
+ const sizePrice = spec.pricing.bySize?.[size];
7154
+ if (sizePrice === void 0) return void 0;
7155
+ let pricePerImage;
7156
+ if (typeof sizePrice === "number") {
7157
+ pricePerImage = sizePrice;
7158
+ } else {
7159
+ pricePerImage = sizePrice[quality];
7160
+ if (pricePerImage === void 0) return void 0;
7161
+ }
7162
+ return pricePerImage * n;
7163
+ }
7164
+ var GPT_IMAGE_SIZES, GPT_IMAGE_QUALITIES, DALLE3_SIZES, DALLE3_QUALITIES, DALLE2_SIZES, openaiImageModels;
7165
+ var init_openai_image_models = __esm({
7166
+ "src/providers/openai-image-models.ts"() {
7167
+ "use strict";
7168
+ GPT_IMAGE_SIZES = ["1024x1024", "1024x1536", "1536x1024"];
7169
+ GPT_IMAGE_QUALITIES = ["low", "medium", "high"];
7170
+ DALLE3_SIZES = ["1024x1024", "1024x1792", "1792x1024"];
7171
+ DALLE3_QUALITIES = ["standard", "hd"];
7172
+ DALLE2_SIZES = ["256x256", "512x512", "1024x1024"];
7173
+ openaiImageModels = [
7174
+ // GPT Image 1 Family (flagship)
7175
+ {
7176
+ provider: "openai",
7177
+ modelId: "gpt-image-1",
7178
+ displayName: "GPT Image 1",
7179
+ pricing: {
7180
+ bySize: {
7181
+ "1024x1024": { low: 0.011, medium: 0.04, high: 0.17 },
7182
+ "1024x1536": { low: 0.016, medium: 0.06, high: 0.25 },
7183
+ "1536x1024": { low: 0.016, medium: 0.06, high: 0.25 }
7184
+ }
7185
+ },
7186
+ supportedSizes: [...GPT_IMAGE_SIZES],
7187
+ supportedQualities: [...GPT_IMAGE_QUALITIES],
7188
+ maxImages: 1,
7189
+ defaultSize: "1024x1024",
7190
+ defaultQuality: "medium",
7191
+ features: {
7192
+ textRendering: true,
7193
+ transparency: true
7194
+ }
7195
+ },
7196
+ {
7197
+ provider: "openai",
7198
+ modelId: "gpt-image-1-mini",
7199
+ displayName: "GPT Image 1 Mini",
7200
+ pricing: {
7201
+ bySize: {
7202
+ "1024x1024": { low: 5e-3, medium: 0.02, high: 0.052 },
7203
+ "1024x1536": { low: 75e-4, medium: 0.03, high: 0.078 },
7204
+ "1536x1024": { low: 75e-4, medium: 0.03, high: 0.078 }
7205
+ }
7206
+ },
7207
+ supportedSizes: [...GPT_IMAGE_SIZES],
7208
+ supportedQualities: [...GPT_IMAGE_QUALITIES],
7209
+ maxImages: 1,
7210
+ defaultSize: "1024x1024",
7211
+ defaultQuality: "medium",
7212
+ features: {
7213
+ textRendering: true,
7214
+ transparency: true
7215
+ }
7216
+ },
7217
+ // DALL-E Family
7218
+ {
7219
+ provider: "openai",
7220
+ modelId: "dall-e-3",
7221
+ displayName: "DALL-E 3",
7222
+ pricing: {
7223
+ bySize: {
7224
+ "1024x1024": { standard: 0.04, hd: 0.08 },
7225
+ "1024x1792": { standard: 0.08, hd: 0.12 },
7226
+ "1792x1024": { standard: 0.08, hd: 0.12 }
7227
+ }
7228
+ },
7229
+ supportedSizes: [...DALLE3_SIZES],
7230
+ supportedQualities: [...DALLE3_QUALITIES],
7231
+ maxImages: 1,
7232
+ // DALL-E 3 only supports n=1
7233
+ defaultSize: "1024x1024",
7234
+ defaultQuality: "standard",
7235
+ features: {
7236
+ textRendering: true
7237
+ }
7238
+ },
7239
+ {
7240
+ provider: "openai",
7241
+ modelId: "dall-e-2",
7242
+ displayName: "DALL-E 2 (Legacy)",
7243
+ pricing: {
7244
+ bySize: {
7245
+ "256x256": 0.016,
7246
+ "512x512": 0.018,
7247
+ "1024x1024": 0.02
7248
+ }
7249
+ },
7250
+ supportedSizes: [...DALLE2_SIZES],
7251
+ maxImages: 10,
7252
+ defaultSize: "1024x1024"
7253
+ }
7254
+ ];
7255
+ }
7256
+ });
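
Worked examples against the OpenAI image pricing above: flat per-size prices (DALL-E 2) ignore the quality argument, tiered entries index by quality.

```typescript
// Illustrative calls; results follow directly from the bySize tables above.
calculateOpenAIImageCost("dall-e-3", "1024x1024", "hd");          // 0.08
calculateOpenAIImageCost("gpt-image-1", "1536x1024", "high", 2);  // 0.25 * 2 = 0.5
calculateOpenAIImageCost("dall-e-2", "512x512");                  // 0.018 - flat per-size price
calculateOpenAIImageCost("dall-e-3", "512x512");                  // undefined - size not in the table
```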
7257
+
5936
7258
  // src/providers/openai-models.ts
5937
7259
  var OPENAI_MODELS;
5938
7260
  var init_openai_models = __esm({
@@ -6297,6 +7619,144 @@ var init_openai_models = __esm({
6297
7619
  }
6298
7620
  });
6299
7621
 
7622
+ // src/providers/openai-speech-models.ts
7623
+ function getOpenAISpeechModelSpec(modelId) {
7624
+ return openaiSpeechModels.find((m) => m.modelId === modelId);
7625
+ }
7626
+ function isOpenAISpeechModel(modelId) {
7627
+ return openaiSpeechModels.some((m) => m.modelId === modelId);
7628
+ }
7629
+ function calculateOpenAISpeechCost(modelId, characterCount, estimatedMinutes) {
7630
+ const spec = getOpenAISpeechModelSpec(modelId);
7631
+ if (!spec) return void 0;
7632
+ if (spec.pricing.perCharacter !== void 0) {
7633
+ return characterCount * spec.pricing.perCharacter;
7634
+ }
7635
+ if (spec.pricing.perMinute !== void 0 && estimatedMinutes !== void 0) {
7636
+ return estimatedMinutes * spec.pricing.perMinute;
7637
+ }
7638
+ if (spec.pricing.perMinute !== void 0) {
7639
+ const approxMinutes = characterCount / 750;
7640
+ return approxMinutes * spec.pricing.perMinute;
7641
+ }
7642
+ return void 0;
7643
+ }
7644
+ var OPENAI_TTS_VOICES, OPENAI_TTS_EXTENDED_VOICES, OPENAI_TTS_FORMATS, openaiSpeechModels;
7645
+ var init_openai_speech_models = __esm({
7646
+ "src/providers/openai-speech-models.ts"() {
7647
+ "use strict";
7648
+ OPENAI_TTS_VOICES = [
7649
+ "alloy",
7650
+ "echo",
7651
+ "fable",
7652
+ "onyx",
7653
+ "nova",
7654
+ "shimmer"
7655
+ ];
7656
+ OPENAI_TTS_EXTENDED_VOICES = [
7657
+ ...OPENAI_TTS_VOICES,
7658
+ "ash",
7659
+ "ballad",
7660
+ "coral",
7661
+ "sage",
7662
+ "verse"
7663
+ ];
7664
+ OPENAI_TTS_FORMATS = ["mp3", "opus", "aac", "flac", "wav", "pcm"];
7665
+ openaiSpeechModels = [
7666
+ // Standard TTS models (character-based pricing)
7667
+ {
7668
+ provider: "openai",
7669
+ modelId: "tts-1",
7670
+ displayName: "TTS-1",
7671
+ pricing: {
7672
+ // $15 per 1M characters = $0.000015 per character
7673
+ perCharacter: 15e-6
7674
+ },
7675
+ voices: [...OPENAI_TTS_VOICES],
7676
+ formats: OPENAI_TTS_FORMATS,
7677
+ maxInputLength: 4096,
7678
+ defaultVoice: "alloy",
7679
+ defaultFormat: "mp3",
7680
+ features: {
7681
+ voiceInstructions: false
7682
+ }
7683
+ },
7684
+ {
7685
+ provider: "openai",
7686
+ modelId: "tts-1-1106",
7687
+ displayName: "TTS-1 (Nov 2023)",
7688
+ pricing: {
7689
+ perCharacter: 15e-6
7690
+ },
7691
+ voices: [...OPENAI_TTS_VOICES],
7692
+ formats: OPENAI_TTS_FORMATS,
7693
+ maxInputLength: 4096,
7694
+ defaultVoice: "alloy",
7695
+ defaultFormat: "mp3",
7696
+ features: {
7697
+ voiceInstructions: false
7698
+ }
7699
+ },
7700
+ {
7701
+ provider: "openai",
7702
+ modelId: "tts-1-hd",
7703
+ displayName: "TTS-1 HD",
7704
+ pricing: {
7705
+ // $30 per 1M characters = $0.00003 per character
7706
+ perCharacter: 3e-5
7707
+ },
7708
+ voices: [...OPENAI_TTS_VOICES],
7709
+ formats: OPENAI_TTS_FORMATS,
7710
+ maxInputLength: 4096,
7711
+ defaultVoice: "alloy",
7712
+ defaultFormat: "mp3",
7713
+ features: {
7714
+ voiceInstructions: false
7715
+ }
7716
+ },
7717
+ {
7718
+ provider: "openai",
7719
+ modelId: "tts-1-hd-1106",
7720
+ displayName: "TTS-1 HD (Nov 2023)",
7721
+ pricing: {
7722
+ perCharacter: 3e-5
7723
+ },
7724
+ voices: [...OPENAI_TTS_VOICES],
7725
+ formats: OPENAI_TTS_FORMATS,
7726
+ maxInputLength: 4096,
7727
+ defaultVoice: "alloy",
7728
+ defaultFormat: "mp3",
7729
+ features: {
7730
+ voiceInstructions: false
7731
+ }
7732
+ },
7733
+ // Token-based TTS model with voice instructions support
7734
+ {
7735
+ provider: "openai",
7736
+ modelId: "gpt-4o-mini-tts",
7737
+ displayName: "GPT-4o Mini TTS",
7738
+ pricing: {
7739
+ // $0.60 per 1M input tokens = $0.0000006 per token
7740
+ perInputToken: 6e-7,
7741
+ // $12 per 1M audio output tokens = $0.000012 per token
7742
+ perAudioOutputToken: 12e-6,
7743
+ // ~$0.015 per minute of audio
7744
+ perMinute: 0.015
7745
+ },
7746
+ voices: [...OPENAI_TTS_EXTENDED_VOICES],
7747
+ formats: OPENAI_TTS_FORMATS,
7748
+ maxInputLength: 2e3,
7749
+ // tokens, not characters
7750
+ defaultVoice: "alloy",
7751
+ defaultFormat: "mp3",
7752
+ features: {
7753
+ voiceInstructions: true
7754
+ }
7755
+ }
7756
+ ];
7757
+ }
7758
+ });
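
Worked examples for calculateOpenAISpeechCost above: character-priced models multiply directly, while gpt-4o-mini-tts falls back to the per-minute estimate (characterCount / 750 minutes) when no duration is supplied.

```typescript
// Illustrative calls against the pricing listed above.
calculateOpenAISpeechCost("tts-1", 2000);            // 2000 * 0.000015 = 0.03
calculateOpenAISpeechCost("tts-1-hd", 2000);         // 2000 * 0.00003  = 0.06
calculateOpenAISpeechCost("gpt-4o-mini-tts", 1500);  // (1500 / 750) * 0.015 = 0.03
```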
7759
+
6300
7760
  // src/providers/openai.ts
6301
7761
  function sanitizeExtra(extra, allowTemperature) {
6302
7762
  if (!extra) {
@@ -6316,9 +7776,12 @@ var init_openai = __esm({
6316
7776
  "use strict";
6317
7777
  import_openai = __toESM(require("openai"), 1);
6318
7778
  import_tiktoken = require("tiktoken");
7779
+ init_messages();
6319
7780
  init_base_provider();
6320
7781
  init_constants2();
7782
+ init_openai_image_models();
6321
7783
  init_openai_models();
7784
+ init_openai_speech_models();
6322
7785
  init_utils();
6323
7786
  ROLE_MAP = {
6324
7787
  system: "system",
@@ -6333,6 +7796,87 @@ var init_openai = __esm({
6333
7796
  getModelSpecs() {
6334
7797
  return OPENAI_MODELS;
6335
7798
  }
7799
+ // =========================================================================
7800
+ // Image Generation
7801
+ // =========================================================================
7802
+ getImageModelSpecs() {
7803
+ return openaiImageModels;
7804
+ }
7805
+ supportsImageGeneration(modelId) {
7806
+ return isOpenAIImageModel(modelId);
7807
+ }
7808
+ async generateImage(options) {
7809
+ const client = this.client;
7810
+ const spec = getOpenAIImageModelSpec(options.model);
7811
+ const size = options.size ?? spec?.defaultSize ?? "1024x1024";
7812
+ const quality = options.quality ?? spec?.defaultQuality ?? "standard";
7813
+ const n = options.n ?? 1;
7814
+ const isDallE2 = options.model === "dall-e-2";
7815
+ const isGptImage = options.model.startsWith("gpt-image");
7816
+ const requestParams = {
7817
+ model: options.model,
7818
+ prompt: options.prompt,
7819
+ size,
7820
+ n
7821
+ };
7822
+ if (!isDallE2 && !isGptImage) {
7823
+ requestParams.quality = quality;
7824
+ }
7825
+ if (isGptImage) {
7826
+ } else if (!isDallE2) {
7827
+ requestParams.response_format = options.responseFormat ?? "url";
7828
+ }
7829
+ const response = await client.images.generate(requestParams);
7830
+ const cost = calculateOpenAIImageCost(options.model, size, quality, n);
7831
+ const images = response.data ?? [];
7832
+ return {
7833
+ images: images.map((img) => ({
7834
+ url: img.url,
7835
+ b64Json: img.b64_json,
7836
+ revisedPrompt: img.revised_prompt
7837
+ })),
7838
+ model: options.model,
7839
+ usage: {
7840
+ imagesGenerated: images.length,
7841
+ size,
7842
+ quality
7843
+ },
7844
+ cost
7845
+ };
7846
+ }
7847
+ // =========================================================================
7848
+ // Speech Generation
7849
+ // =========================================================================
7850
+ getSpeechModelSpecs() {
7851
+ return openaiSpeechModels;
7852
+ }
7853
+ supportsSpeechGeneration(modelId) {
7854
+ return isOpenAISpeechModel(modelId);
7855
+ }
7856
+ async generateSpeech(options) {
7857
+ const client = this.client;
7858
+ const spec = getOpenAISpeechModelSpec(options.model);
7859
+ const format = options.responseFormat ?? spec?.defaultFormat ?? "mp3";
7860
+ const voice = options.voice ?? spec?.defaultVoice ?? "alloy";
7861
+ const response = await client.audio.speech.create({
7862
+ model: options.model,
7863
+ input: options.input,
7864
+ voice,
7865
+ response_format: format,
7866
+ speed: options.speed ?? 1
7867
+ });
7868
+ const audioBuffer = await response.arrayBuffer();
7869
+ const cost = calculateOpenAISpeechCost(options.model, options.input.length);
7870
+ return {
7871
+ audio: audioBuffer,
7872
+ model: options.model,
7873
+ usage: {
7874
+ characterCount: options.input.length
7875
+ },
7876
+ cost,
7877
+ format
7878
+ };
7879
+ }
6336
7880
  buildRequestPayload(options, descriptor, spec, messages) {
6337
7881
  const { maxTokens, temperature, topP, stopSequences, extra } = options;
6338
7882
  const supportsTemperature = spec?.metadata?.supportsTemperature !== false;
@@ -6340,11 +7884,7 @@ var init_openai = __esm({
6340
7884
  const sanitizedExtra = sanitizeExtra(extra, shouldIncludeTemperature);
6341
7885
  return {
6342
7886
  model: descriptor.name,
6343
- messages: messages.map((message) => ({
6344
- role: ROLE_MAP[message.role],
6345
- content: message.content,
6346
- name: message.name
6347
- })),
7887
+ messages: messages.map((message) => this.convertToOpenAIMessage(message)),
6348
7888
  // Only set max_completion_tokens if explicitly provided
6349
7889
  // Otherwise let the API use "as much as fits" in the context window
6350
7890
  ...maxTokens !== void 0 ? { max_completion_tokens: maxTokens } : {},
@@ -6356,6 +7896,77 @@ var init_openai = __esm({
6356
7896
  ...shouldIncludeTemperature ? { temperature } : {}
6357
7897
  };
6358
7898
  }
7899
+ /**
7900
+ * Convert an LLMMessage to OpenAI's ChatCompletionMessageParam.
7901
+ * Handles role-specific content type requirements:
7902
+ * - system/assistant: string content only
7903
+ * - user: string or multimodal array content
7904
+ */
7905
+ convertToOpenAIMessage(message) {
7906
+ const role = ROLE_MAP[message.role];
7907
+ if (role === "user") {
7908
+ const content = this.convertToOpenAIContent(message.content);
7909
+ return {
7910
+ role: "user",
7911
+ content,
7912
+ ...message.name ? { name: message.name } : {}
7913
+ };
7914
+ }
7915
+ const textContent = typeof message.content === "string" ? message.content : extractText(message.content);
7916
+ if (role === "system") {
7917
+ return {
7918
+ role: "system",
7919
+ content: textContent,
7920
+ ...message.name ? { name: message.name } : {}
7921
+ };
7922
+ }
7923
+ return {
7924
+ role: "assistant",
7925
+ content: textContent,
7926
+ ...message.name ? { name: message.name } : {}
7927
+ };
7928
+ }
7929
+ /**
7930
+ * Convert llmist content to OpenAI's content format.
7931
+ * Optimizes by returning string for text-only content, array for multimodal.
7932
+ */
7933
+ convertToOpenAIContent(content) {
7934
+ if (typeof content === "string") {
7935
+ return content;
7936
+ }
7937
+ return content.map((part) => {
7938
+ if (part.type === "text") {
7939
+ return { type: "text", text: part.text };
7940
+ }
7941
+ if (part.type === "image") {
7942
+ return this.convertImagePart(part);
7943
+ }
7944
+ if (part.type === "audio") {
7945
+ throw new Error(
7946
+ "OpenAI chat completions do not support audio input. Use Whisper for transcription or Gemini for audio understanding."
7947
+ );
7948
+ }
7949
+ throw new Error(`Unsupported content type: ${part.type}`);
7950
+ });
7951
+ }
7952
+ /**
7953
+ * Convert an image content part to OpenAI's image_url format.
7954
+ * Supports both URLs and base64 data URLs.
7955
+ */
7956
+ convertImagePart(part) {
7957
+ if (part.source.type === "url") {
7958
+ return {
7959
+ type: "image_url",
7960
+ image_url: { url: part.source.url }
7961
+ };
7962
+ }
7963
+ return {
7964
+ type: "image_url",
7965
+ image_url: {
7966
+ url: `data:${part.source.mediaType};base64,${part.source.data}`
7967
+ }
7968
+ };
7969
+ }
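
Putting the three converters above together, a text + image user turn ends up in OpenAI's image_url form, with base64 data folded into a data: URL (illustrative values, payload abbreviated):

```typescript
// Rough shape produced by convertToOpenAIMessage for a multimodal user message.
const openaiUserMessage = {
  role: "user",
  content: [
    { type: "text", text: "What's in this image?" },
    { type: "image_url", image_url: { url: "data:image/jpeg;base64,/9j/4AAQ..." } },
  ],
};
```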
6359
7970
  async executeStreamRequest(payload, signal) {
6360
7971
  const client = this.client;
6361
7972
  const stream2 = await client.chat.completions.create(payload, signal ? { signal } : void 0);
@@ -6364,9 +7975,9 @@ var init_openai = __esm({
6364
7975
  async *wrapStream(iterable) {
6365
7976
  const stream2 = iterable;
6366
7977
  for await (const chunk of stream2) {
6367
- const text = chunk.choices.map((choice) => choice.delta?.content ?? "").join("");
6368
- if (text) {
6369
- yield { text, rawEvent: chunk };
7978
+ const text3 = chunk.choices.map((choice) => choice.delta?.content ?? "").join("");
7979
+ if (text3) {
7980
+ yield { text: text3, rawEvent: chunk };
6370
7981
  }
6371
7982
  const finishReason = chunk.choices.find((choice) => choice.finish_reason)?.finish_reason;
6372
7983
  const usage = chunk.usage ? {
@@ -6414,17 +8025,26 @@ var init_openai = __esm({
6414
8025
  }
6415
8026
  try {
6416
8027
  let tokenCount = 0;
8028
+ let imageCount = 0;
6417
8029
  for (const message of messages) {
6418
8030
  tokenCount += OPENAI_MESSAGE_OVERHEAD_TOKENS;
6419
8031
  const roleText = ROLE_MAP[message.role];
6420
8032
  tokenCount += encoding.encode(roleText).length;
6421
- tokenCount += encoding.encode(message.content ?? "").length;
8033
+ const textContent = extractText(message.content);
8034
+ tokenCount += encoding.encode(textContent).length;
8035
+ const parts = normalizeContent(message.content);
8036
+ for (const part of parts) {
8037
+ if (part.type === "image") {
8038
+ imageCount++;
8039
+ }
8040
+ }
6422
8041
  if (message.name) {
6423
8042
  tokenCount += encoding.encode(message.name).length;
6424
8043
  tokenCount += OPENAI_NAME_FIELD_OVERHEAD_TOKENS;
6425
8044
  }
6426
8045
  }
6427
8046
  tokenCount += OPENAI_REPLY_PRIMING_TOKENS;
8047
+ tokenCount += imageCount * 765;
6428
8048
  return tokenCount;
6429
8049
  } finally {
6430
8050
  encoding.free();
@@ -6434,8 +8054,19 @@ var init_openai = __esm({
6434
8054
  `Token counting failed for ${descriptor.name}, using fallback estimation:`,
6435
8055
  error
6436
8056
  );
6437
- const totalChars = messages.reduce((sum, msg) => sum + (msg.content?.length ?? 0), 0);
6438
- return Math.ceil(totalChars / FALLBACK_CHARS_PER_TOKEN);
8057
+ let totalChars = 0;
8058
+ let imageCount = 0;
8059
+ for (const msg of messages) {
8060
+ const parts = normalizeContent(msg.content);
8061
+ for (const part of parts) {
8062
+ if (part.type === "text") {
8063
+ totalChars += part.text.length;
8064
+ } else if (part.type === "image") {
8065
+ imageCount++;
8066
+ }
8067
+ }
8068
+ }
8069
+ return Math.ceil(totalChars / FALLBACK_CHARS_PER_TOKEN) + imageCount * 765;
6439
8070
  }
6440
8071
  }
6441
8072
  };
@@ -6673,30 +8304,109 @@ var init_model_registry = __esm({
6673
8304
  }
6674
8305
  });
6675
8306
 
6676
- // src/core/options.ts
6677
- var ModelIdentifierParser;
6678
- var init_options = __esm({
6679
- "src/core/options.ts"() {
8307
+ // src/core/namespaces/image.ts
8308
+ var ImageNamespace;
8309
+ var init_image = __esm({
8310
+ "src/core/namespaces/image.ts"() {
8311
+ "use strict";
8312
+ ImageNamespace = class {
8313
+ constructor(adapters, defaultProvider) {
8314
+ this.adapters = adapters;
8315
+ this.defaultProvider = defaultProvider;
8316
+ }
8317
+ /**
8318
+ * Generate images from a text prompt.
8319
+ *
8320
+ * @param options - Image generation options
8321
+ * @returns Promise resolving to the generation result with images and cost
8322
+ * @throws Error if the provider doesn't support image generation
8323
+ */
8324
+ async generate(options) {
8325
+ const modelId = options.model;
8326
+ const adapter = this.findImageAdapter(modelId);
8327
+ if (!adapter || !adapter.generateImage) {
8328
+ throw new Error(
8329
+ `No provider supports image generation for model "${modelId}". Available image models: ${this.listModels().map((m) => m.modelId).join(", ")}`
8330
+ );
8331
+ }
8332
+ return adapter.generateImage(options);
8333
+ }
8334
+ /**
8335
+ * List all available image generation models.
8336
+ */
8337
+ listModels() {
8338
+ const models = [];
8339
+ for (const adapter of this.adapters) {
8340
+ if (adapter.getImageModelSpecs) {
8341
+ models.push(...adapter.getImageModelSpecs());
8342
+ }
8343
+ }
8344
+ return models;
8345
+ }
8346
+ /**
8347
+ * Check if a model is supported for image generation.
8348
+ */
8349
+ supportsModel(modelId) {
8350
+ return this.findImageAdapter(modelId) !== void 0;
8351
+ }
8352
+ findImageAdapter(modelId) {
8353
+ return this.adapters.find(
8354
+ (adapter) => adapter.supportsImageGeneration?.(modelId) ?? false
8355
+ );
8356
+ }
8357
+ };
8358
+ }
8359
+ });
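
A hedged usage sketch for the namespace above. It assumes the client wiring shown further down in this diff (an LLMist instance exposing `image`) and reuses a model ID from the OpenAI image spec table; the exact surface may differ from what is shown here.

```typescript
import { LLMist } from "llmist"; // import path assumed

const llmist = new LLMist();
const result = await llmist.image.generate({
  model: "gpt-image-1",
  prompt: "A lighthouse at dusk, watercolor",
  size: "1024x1024",
  quality: "medium",
});
console.log(result.usage.imagesGenerated, result.cost);
```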
8360
+
8361
+ // src/core/namespaces/speech.ts
8362
+ var SpeechNamespace;
8363
+ var init_speech = __esm({
8364
+ "src/core/namespaces/speech.ts"() {
6680
8365
  "use strict";
6681
- ModelIdentifierParser = class {
6682
- constructor(defaultProvider = "openai") {
8366
+ SpeechNamespace = class {
8367
+ constructor(adapters, defaultProvider) {
8368
+ this.adapters = adapters;
6683
8369
  this.defaultProvider = defaultProvider;
6684
8370
  }
6685
- parse(identifier) {
6686
- const trimmed = identifier.trim();
6687
- if (!trimmed) {
6688
- throw new Error("Model identifier cannot be empty");
6689
- }
6690
- const [maybeProvider, ...rest] = trimmed.split(":");
6691
- if (rest.length === 0) {
6692
- return { provider: this.defaultProvider, name: maybeProvider };
8371
+ /**
8372
+ * Generate speech audio from text.
8373
+ *
8374
+ * @param options - Speech generation options
8375
+ * @returns Promise resolving to the generation result with audio and cost
8376
+ * @throws Error if the provider doesn't support speech generation
8377
+ */
8378
+ async generate(options) {
8379
+ const modelId = options.model;
8380
+ const adapter = this.findSpeechAdapter(modelId);
8381
+ if (!adapter || !adapter.generateSpeech) {
8382
+ throw new Error(
8383
+ `No provider supports speech generation for model "${modelId}". Available speech models: ${this.listModels().map((m) => m.modelId).join(", ")}`
8384
+ );
6693
8385
  }
6694
- const provider = maybeProvider;
6695
- const name = rest.join(":");
6696
- if (!name) {
6697
- throw new Error("Model name cannot be empty");
8386
+ return adapter.generateSpeech(options);
8387
+ }
8388
+ /**
8389
+ * List all available speech generation models.
8390
+ */
8391
+ listModels() {
8392
+ const models = [];
8393
+ for (const adapter of this.adapters) {
8394
+ if (adapter.getSpeechModelSpecs) {
8395
+ models.push(...adapter.getSpeechModelSpecs());
8396
+ }
6698
8397
  }
6699
- return { provider, name };
8398
+ return models;
8399
+ }
8400
+ /**
8401
+ * Check if a model is supported for speech generation.
8402
+ */
8403
+ supportsModel(modelId) {
8404
+ return this.findSpeechAdapter(modelId) !== void 0;
8405
+ }
8406
+ findSpeechAdapter(modelId) {
8407
+ return this.adapters.find(
8408
+ (adapter) => adapter.supportsSpeechGeneration?.(modelId) ?? false
8409
+ );
6700
8410
  }
6701
8411
  };
6702
8412
  }
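
A matching hedged sketch for speech generation. The `speech` property on the client is an assumption (only `text` and `image` appear later in this excerpt); the option names mirror the provider generateSpeech() code shown earlier.

```typescript
import { LLMist } from "llmist"; // import path assumed

const llmist = new LLMist();
// `speech` as a client property is assumed; options follow the provider code above.
const tts = await llmist.speech.generate({
  model: "gpt-4o-mini-tts",
  input: "Hello from llmist!",
  voice: "nova",
});
console.log(tts.format, tts.usage.characterCount, tts.cost);
```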
@@ -6745,6 +8455,201 @@ var init_quick_methods = __esm({
6745
8455
  }
6746
8456
  });
6747
8457
 
8458
+ // src/core/namespaces/text.ts
8459
+ var TextNamespace;
8460
+ var init_text = __esm({
8461
+ "src/core/namespaces/text.ts"() {
8462
+ "use strict";
8463
+ init_quick_methods();
8464
+ TextNamespace = class {
8465
+ constructor(client) {
8466
+ this.client = client;
8467
+ }
8468
+ /**
8469
+ * Generate a complete text response.
8470
+ *
8471
+ * @param prompt - User prompt
8472
+ * @param options - Optional configuration
8473
+ * @returns Complete text response
8474
+ */
8475
+ async complete(prompt, options) {
8476
+ return complete(this.client, prompt, options);
8477
+ }
8478
+ /**
8479
+ * Stream text chunks.
8480
+ *
8481
+ * @param prompt - User prompt
8482
+ * @param options - Optional configuration
8483
+ * @returns Async generator yielding text chunks
8484
+ */
8485
+ stream(prompt, options) {
8486
+ return stream(this.client, prompt, options);
8487
+ }
8488
+ };
8489
+ }
8490
+ });
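
A short hedged sketch for the text namespace, which simply forwards to the quick complete()/stream() helpers; the options argument is omitted here because its shape is not shown in this diff.

```typescript
import { LLMist } from "llmist"; // import path assumed

const llmist = new LLMist();
const answer = await llmist.text.complete("Summarize the RIFF/WAVE header in one sentence.");
console.log(answer);
```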
8491
+
8492
+ // src/core/namespaces/vision.ts
8493
+ var VisionNamespace;
8494
+ var init_vision = __esm({
8495
+ "src/core/namespaces/vision.ts"() {
8496
+ "use strict";
8497
+ init_input_content();
8498
+ init_messages();
8499
+ VisionNamespace = class {
8500
+ constructor(client) {
8501
+ this.client = client;
8502
+ }
8503
+ /**
8504
+ * Build a message builder with the image content attached.
8505
+ * Handles URLs, data URLs, base64 strings, and binary buffers.
8506
+ */
8507
+ buildImageMessage(options) {
8508
+ const builder = new LLMMessageBuilder();
8509
+ if (options.systemPrompt) {
8510
+ builder.addSystem(options.systemPrompt);
8511
+ }
8512
+ if (typeof options.image === "string") {
8513
+ if (options.image.startsWith("http://") || options.image.startsWith("https://")) {
8514
+ builder.addUserWithImageUrl(options.prompt, options.image);
8515
+ } else if (isDataUrl(options.image)) {
8516
+ const parsed = parseDataUrl(options.image);
8517
+ if (!parsed) {
8518
+ throw new Error("Invalid data URL format");
8519
+ }
8520
+ builder.addUserWithImage(
8521
+ options.prompt,
8522
+ parsed.data,
8523
+ parsed.mimeType
8524
+ );
8525
+ } else {
8526
+ const buffer = Buffer.from(options.image, "base64");
8527
+ builder.addUserWithImage(options.prompt, buffer, options.mimeType);
8528
+ }
8529
+ } else {
8530
+ builder.addUserWithImage(options.prompt, options.image, options.mimeType);
8531
+ }
8532
+ return builder;
8533
+ }
8534
+ /**
8535
+ * Stream the response and collect text and usage information.
8536
+ */
8537
+ async streamAndCollect(options, builder) {
8538
+ let response = "";
8539
+ let finalUsage;
8540
+ for await (const chunk of this.client.stream({
8541
+ model: options.model,
8542
+ messages: builder.build(),
8543
+ maxTokens: options.maxTokens,
8544
+ temperature: options.temperature
8545
+ })) {
8546
+ response += chunk.text;
8547
+ if (chunk.usage) {
8548
+ finalUsage = {
8549
+ inputTokens: chunk.usage.inputTokens,
8550
+ outputTokens: chunk.usage.outputTokens,
8551
+ totalTokens: chunk.usage.totalTokens
8552
+ };
8553
+ }
8554
+ }
8555
+ return { text: response.trim(), usage: finalUsage };
8556
+ }
8557
+ /**
8558
+ * Analyze an image with a vision-capable model.
8559
+ * Returns the analysis as a string.
8560
+ *
8561
+ * @param options - Vision analysis options
8562
+ * @returns Promise resolving to the analysis text
8563
+ * @throws Error if the image format is unsupported or model doesn't support vision
8564
+ *
8565
+ * @example
8566
+ * ```typescript
8567
+ * // From file
8568
+ * const result = await llmist.vision.analyze({
8569
+ * model: "gpt-4o",
8570
+ * image: await fs.readFile("photo.jpg"),
8571
+ * prompt: "What's in this image?",
8572
+ * });
8573
+ *
8574
+ * // From URL (OpenAI only)
8575
+ * const result = await llmist.vision.analyze({
8576
+ * model: "gpt-4o",
8577
+ * image: "https://example.com/image.jpg",
8578
+ * prompt: "Describe this image",
8579
+ * });
8580
+ * ```
8581
+ */
8582
+ async analyze(options) {
8583
+ const builder = this.buildImageMessage(options);
8584
+ const { text: text3 } = await this.streamAndCollect(options, builder);
8585
+ return text3;
8586
+ }
8587
+ /**
8588
+ * Analyze an image and return detailed result with usage info.
8589
+ *
8590
+ * @param options - Vision analysis options
8591
+ * @returns Promise resolving to the analysis result with usage info
8592
+ */
8593
+ async analyzeWithUsage(options) {
8594
+ const builder = this.buildImageMessage(options);
8595
+ const { text: text3, usage } = await this.streamAndCollect(options, builder);
8596
+ return {
8597
+ text: text3,
8598
+ model: options.model,
8599
+ usage
8600
+ };
8601
+ }
8602
+ /**
8603
+ * Check if a model supports vision/image input.
8604
+ *
8605
+ * @param modelId - Model ID to check
8606
+ * @returns True if the model supports vision
8607
+ */
8608
+ supportsModel(modelId) {
8609
+ const spec = this.client.modelRegistry.getModelSpec(modelId);
8610
+ return spec?.features?.vision === true;
8611
+ }
8612
+ /**
8613
+ * List all models that support vision.
8614
+ *
8615
+ * @returns Array of model IDs that support vision
8616
+ */
8617
+ listModels() {
8618
+ return this.client.modelRegistry.listModels().filter((spec) => spec.features?.vision === true).map((spec) => spec.modelId);
8619
+ }
8620
+ };
8621
+ }
8622
+ });
8623
+
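
> Editor's note: `buildImageMessage` above accepts four image shapes: an http(s) URL, a data URL, a raw base64 string (which then needs an explicit `mimeType`), or a binary Buffer. A sketch of the three string/Buffer paths, reusing a configured `client` as in the earlier sketch; file paths are placeholders:

```typescript
import { readFile } from "node:fs/promises";

// URL input: forwarded as an image URL part (provider support permitting).
await client.vision.analyze({
  model: "gpt-4o",
  image: "https://example.com/cat.png",
  prompt: "Describe this image",
});

// Buffer input: MIME type can be sniffed from magic bytes.
const photo = await readFile("photo.jpg");
const result = await client.vision.analyzeWithUsage({
  model: "gpt-4o",
  image: photo,
  prompt: "What's in this image?",
});
console.log(result.text, result.usage?.totalTokens);

// Raw base64 input: name the mimeType explicitly, since detection is skipped on strings.
await client.vision.analyze({
  model: "gpt-4o",
  image: photo.toString("base64"),
  mimeType: "image/jpeg",
  prompt: "Same picture, base64 this time",
});
```
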
8624
+ // src/core/options.ts
8625
+ var ModelIdentifierParser;
8626
+ var init_options = __esm({
8627
+ "src/core/options.ts"() {
8628
+ "use strict";
8629
+ ModelIdentifierParser = class {
8630
+ constructor(defaultProvider = "openai") {
8631
+ this.defaultProvider = defaultProvider;
8632
+ }
8633
+ parse(identifier) {
8634
+ const trimmed = identifier.trim();
8635
+ if (!trimmed) {
8636
+ throw new Error("Model identifier cannot be empty");
8637
+ }
8638
+ const [maybeProvider, ...rest] = trimmed.split(":");
8639
+ if (rest.length === 0) {
8640
+ return { provider: this.defaultProvider, name: maybeProvider };
8641
+ }
8642
+ const provider = maybeProvider;
8643
+ const name = rest.join(":");
8644
+ if (!name) {
8645
+ throw new Error("Model name cannot be empty");
8646
+ }
8647
+ return { provider, name };
8648
+ }
8649
+ };
8650
+ }
8651
+ });
8652
+
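
> Editor's note: `ModelIdentifierParser` splits `provider:model` identifiers and falls back to its default provider for bare names; any colons after the first stay in the model name. Expected behaviour per the code above (whether the class is re-exported from the package entry is not shown here):

```typescript
const parser = new ModelIdentifierParser(); // defaultProvider = "openai"

parser.parse("gpt-4o");                      // { provider: "openai", name: "gpt-4o" }
parser.parse("anthropic:claude-3-5-sonnet"); // { provider: "anthropic", name: "claude-3-5-sonnet" }
parser.parse("openrouter:meta/llama-3:70b"); // { provider: "openrouter", name: "meta/llama-3:70b" }
parser.parse("   ");                         // throws: "Model identifier cannot be empty"
parser.parse("openai:");                     // throws: "Model name cannot be empty"
```
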
6748
8653
  // src/core/client.ts
6749
8654
  var client_exports = {};
6750
8655
  __export(client_exports, {
@@ -6757,12 +8662,22 @@ var init_client = __esm({
6757
8662
  init_builder();
6758
8663
  init_discovery();
6759
8664
  init_model_registry();
8665
+ init_image();
8666
+ init_speech();
8667
+ init_text();
8668
+ init_vision();
6760
8669
  init_options();
6761
8670
  init_quick_methods();
6762
8671
  LLMist = class _LLMist {
6763
8672
  parser;
8673
+ defaultProvider;
6764
8674
  modelRegistry;
6765
8675
  adapters;
8676
+ // Namespaces for different generation types
8677
+ text;
8678
+ image;
8679
+ speech;
8680
+ vision;
6766
8681
  constructor(...args) {
6767
8682
  let adapters = [];
6768
8683
  let defaultProvider;
@@ -6801,6 +8716,7 @@ var init_client = __esm({
6801
8716
  const priorityB = b.priority ?? 0;
6802
8717
  return priorityB - priorityA;
6803
8718
  });
8719
+ this.defaultProvider = resolvedDefaultProvider;
6804
8720
  this.parser = new ModelIdentifierParser(resolvedDefaultProvider);
6805
8721
  this.modelRegistry = new ModelRegistry();
6806
8722
  for (const adapter of this.adapters) {
@@ -6809,6 +8725,10 @@ var init_client = __esm({
6809
8725
  if (customModels.length > 0) {
6810
8726
  this.modelRegistry.registerModels(customModels);
6811
8727
  }
8728
+ this.text = new TextNamespace(this);
8729
+ this.image = new ImageNamespace(this.adapters, this.defaultProvider);
8730
+ this.speech = new SpeechNamespace(this.adapters, this.defaultProvider);
8731
+ this.vision = new VisionNamespace(this);
6812
8732
  }
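
> Editor's note: the constructor now attaches the four namespaces to the client after the adapters and registry are set up. A usage sketch under the assumption that `LLMist` is the exported client class; the constructor overloads are resolved earlier in this file and not repeated here:

```typescript
import { LLMist } from "llmist"; // assumed export

const client = new LLMist();

await client.text.complete("hello");           // TextNamespace, wraps the quick methods
client.vision.listModels();                    // model IDs whose spec sets features.vision
client.speech.supportsModel("tts-1");          // delegates to adapter.supportsSpeechGeneration
// client.image is wired the same way via ImageNamespace (not expanded in this section).
```
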
6813
8733
  stream(options) {
6814
8734
  const descriptor = this.parser.parse(options.model);
@@ -7275,9 +9195,9 @@ function sleep(ms) {
7275
9195
  function generateInvocationId() {
7276
9196
  return `inv-${Date.now()}-${Math.random().toString(36).substring(2, 9)}`;
7277
9197
  }
7278
- function splitIntoChunks(text, minChunkSize = 5, maxChunkSize = 30) {
9198
+ function splitIntoChunks(text3, minChunkSize = 5, maxChunkSize = 30) {
7279
9199
  const chunks = [];
7280
- let remaining = text;
9200
+ let remaining = text3;
7281
9201
  while (remaining.length > 0) {
7282
9202
  const chunkSize = Math.min(
7283
9203
  Math.floor(Math.random() * (maxChunkSize - minChunkSize + 1)) + minChunkSize,
@@ -7336,17 +9256,17 @@ ${String(value)}
7336
9256
  return result;
7337
9257
  }
7338
9258
  function formatGadgetCalls(gadgetCalls) {
7339
- let text = "";
9259
+ let text3 = "";
7340
9260
  const calls = [];
7341
9261
  for (const call of gadgetCalls) {
7342
9262
  const invocationId = call.invocationId ?? generateInvocationId();
7343
9263
  calls.push({ name: call.gadgetName, invocationId });
7344
9264
  const blockParams = serializeToBlockFormat(call.parameters);
7345
- text += `
9265
+ text3 += `
7346
9266
  ${GADGET_START_PREFIX}${call.gadgetName}
7347
9267
  ${blockParams}${GADGET_END_PREFIX}`;
7348
9268
  }
7349
- return { text, calls };
9269
+ return { text: text3, calls };
7350
9270
  }
7351
9271
  async function* createMockStream(response) {
7352
9272
  if (response.delayMs) {
@@ -7386,9 +9306,9 @@ async function* createMockStream(response) {
7386
9306
  };
7387
9307
  }
7388
9308
  }
7389
- function createTextMockStream(text, options) {
9309
+ function createTextMockStream(text3, options) {
7390
9310
  return createMockStream({
7391
- text,
9311
+ text: text3,
7392
9312
  delayMs: options?.delayMs,
7393
9313
  streamDelayMs: options?.streamDelayMs,
7394
9314
  usage: options?.usage,
@@ -7405,10 +9325,10 @@ var MockProviderAdapter = class {
7405
9325
  constructor(options) {
7406
9326
  this.mockManager = getMockManager(options);
7407
9327
  }
7408
- supports(descriptor) {
9328
+ supports(_descriptor) {
7409
9329
  return true;
7410
9330
  }
7411
- stream(options, descriptor, spec) {
9331
+ stream(options, descriptor, _spec) {
7412
9332
  const context = {
7413
9333
  model: options.model,
7414
9334
  provider: descriptor.provider,
@@ -7419,20 +9339,154 @@ var MockProviderAdapter = class {
7419
9339
  return this.createMockStreamFromContext(context);
7420
9340
  }
7421
9341
  async *createMockStreamFromContext(context) {
7422
- try {
7423
- const mockResponse = await this.mockManager.findMatch(context);
7424
- if (!mockResponse) {
7425
- yield {
7426
- text: "",
7427
- finishReason: "stop",
7428
- usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
7429
- };
7430
- return;
7431
- }
7432
- yield* createMockStream(mockResponse);
7433
- } catch (error) {
7434
- throw error;
9342
+ const mockResponse = await this.mockManager.findMatch(context);
9343
+ if (!mockResponse) {
9344
+ yield {
9345
+ text: "",
9346
+ finishReason: "stop",
9347
+ usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
9348
+ };
9349
+ return;
9350
+ }
9351
+ yield* createMockStream(mockResponse);
9352
+ }
9353
+ // ==========================================================================
9354
+ // Image Generation Support
9355
+ // ==========================================================================
9356
+ /**
9357
+ * Check if this adapter supports image generation for a given model.
9358
+ * The mock adapter reports support for every model; whether a matching mock with images exists is only checked when generateImage runs.
9359
+ */
9360
+ supportsImageGeneration(_modelId) {
9361
+ return true;
9362
+ }
9363
+ /**
9364
+ * Generate mock images based on registered mocks.
9365
+ *
9366
+ * @param options - Image generation options
9367
+ * @returns Mock image generation result
9368
+ */
9369
+ async generateImage(options) {
9370
+ const context = {
9371
+ model: options.model,
9372
+ provider: "mock",
9373
+ modelName: options.model,
9374
+ options: {
9375
+ model: options.model,
9376
+ messages: [{ role: "user", content: options.prompt }]
9377
+ },
9378
+ messages: [{ role: "user", content: options.prompt }]
9379
+ };
9380
+ const mockResponse = await this.mockManager.findMatch(context);
9381
+ if (!mockResponse?.images || mockResponse.images.length === 0) {
9382
+ throw new Error(
9383
+ `No mock registered for image generation with model "${options.model}". Use mockLLM().forModel("${options.model}").returnsImage(...).register() to add one.`
9384
+ );
9385
+ }
9386
+ return this.createImageResult(options, mockResponse);
9387
+ }
9388
+ /**
9389
+ * Transform mock response into ImageGenerationResult format.
9390
+ *
9391
+ * @param options - Original image generation options
9392
+ * @param mockResponse - Mock response containing image data
9393
+ * @returns ImageGenerationResult with mock data and zero cost
9394
+ */
9395
+ createImageResult(options, mockResponse) {
9396
+ const images = mockResponse.images ?? [];
9397
+ return {
9398
+ images: images.map((img) => ({
9399
+ b64Json: img.data,
9400
+ revisedPrompt: img.revisedPrompt
9401
+ })),
9402
+ model: options.model,
9403
+ usage: {
9404
+ imagesGenerated: images.length,
9405
+ size: options.size ?? "1024x1024",
9406
+ quality: options.quality ?? "standard"
9407
+ },
9408
+ cost: 0
9409
+ // Mock cost is always 0
9410
+ };
9411
+ }
9412
+ // ==========================================================================
9413
+ // Speech Generation Support
9414
+ // ==========================================================================
9415
+ /**
9416
+ * Check if this adapter supports speech generation for a given model.
9417
+ * The mock adapter reports support for every model; whether a matching mock with audio exists is only checked when generateSpeech runs.
9418
+ */
9419
+ supportsSpeechGeneration(_modelId) {
9420
+ return true;
9421
+ }
9422
+ /**
9423
+ * Generate mock speech based on registered mocks.
9424
+ *
9425
+ * @param options - Speech generation options
9426
+ * @returns Mock speech generation result
9427
+ */
9428
+ async generateSpeech(options) {
9429
+ const context = {
9430
+ model: options.model,
9431
+ provider: "mock",
9432
+ modelName: options.model,
9433
+ options: {
9434
+ model: options.model,
9435
+ messages: [{ role: "user", content: options.input }]
9436
+ },
9437
+ messages: [{ role: "user", content: options.input }]
9438
+ };
9439
+ const mockResponse = await this.mockManager.findMatch(context);
9440
+ if (!mockResponse?.audio) {
9441
+ throw new Error(
9442
+ `No mock registered for speech generation with model "${options.model}". Use mockLLM().forModel("${options.model}").returnsAudio(...).register() to add one.`
9443
+ );
9444
+ }
9445
+ return this.createSpeechResult(options, mockResponse);
9446
+ }
9447
+ /**
9448
+ * Transform mock response into SpeechGenerationResult format.
9449
+ * Converts base64 audio data to ArrayBuffer.
9450
+ *
9451
+ * @param options - Original speech generation options
9452
+ * @param mockResponse - Mock response containing audio data
9453
+ * @returns SpeechGenerationResult with mock data and zero cost
9454
+ */
9455
+ createSpeechResult(options, mockResponse) {
9456
+ const audio = mockResponse.audio;
9457
+ const binaryString = atob(audio.data);
9458
+ const bytes = new Uint8Array(binaryString.length);
9459
+ for (let i = 0; i < binaryString.length; i++) {
9460
+ bytes[i] = binaryString.charCodeAt(i);
7435
9461
  }
9462
+ const format = this.mimeTypeToAudioFormat(audio.mimeType);
9463
+ return {
9464
+ audio: bytes.buffer,
9465
+ model: options.model,
9466
+ usage: {
9467
+ characterCount: options.input.length
9468
+ },
9469
+ cost: 0,
9470
+ // Mock cost is always 0
9471
+ format
9472
+ };
9473
+ }
9474
+ /**
9475
+ * Map MIME type to audio format for SpeechGenerationResult.
9476
+ * Defaults to "mp3" for unknown MIME types.
9477
+ *
9478
+ * @param mimeType - Audio MIME type string
9479
+ * @returns Audio format identifier
9480
+ */
9481
+ mimeTypeToAudioFormat(mimeType) {
9482
+ const mapping = {
9483
+ "audio/mp3": "mp3",
9484
+ "audio/mpeg": "mp3",
9485
+ "audio/wav": "wav",
9486
+ "audio/webm": "opus",
9487
+ "audio/ogg": "opus"
9488
+ };
9489
+ return mapping[mimeType] ?? "mp3";
7436
9490
  }
7437
9491
  };
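
> Editor's note: the mock adapter now answers `generateImage` and `generateSpeech` by replaying registered mocks, returning zero-cost results (`b64Json` images, a decoded `ArrayBuffer` of audio). A test-style sketch; the `llmist/testing` import path and fixture files are assumptions, while the `mockLLM().forModel(...).returns...().register()` chain follows the adapter's own error messages:

```typescript
import { readFile } from "node:fs/promises";
import { mockLLM, createMockAdapter } from "llmist/testing"; // assumed entry point

const pngBuffer = await readFile("fixtures/sunset.png"); // placeholder fixture
const mp3Buffer = await readFile("fixtures/hello.mp3");   // placeholder fixture

// Register mocks keyed by model, as the error messages above describe.
mockLLM().forModel("dall-e-3").returnsImage(pngBuffer).register();
mockLLM().forModel("tts-1").returnsAudio(mp3Buffer, "audio/mpeg").register();

const adapter = createMockAdapter();

const img = await adapter.generateImage({ model: "dall-e-3", prompt: "a sunset" });
// img.images[0].b64Json, img.usage.imagesGenerated === 1, img.cost === 0

const speech = await adapter.generateSpeech({ model: "tts-1", input: "Hello there" });
// speech.audio is an ArrayBuffer, speech.format === "mp3", speech.usage.characterCount === 11
```
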
7438
9492
  function createMockAdapter(options) {
@@ -7440,6 +9494,20 @@ function createMockAdapter(options) {
7440
9494
  }
7441
9495
 
7442
9496
  // src/testing/mock-builder.ts
9497
+ init_input_content();
9498
+ init_messages();
9499
+ function hasImageContent(content) {
9500
+ if (typeof content === "string") return false;
9501
+ return content.some((part) => isImagePart(part));
9502
+ }
9503
+ function hasAudioContent(content) {
9504
+ if (typeof content === "string") return false;
9505
+ return content.some((part) => isAudioPart(part));
9506
+ }
9507
+ function countImages(content) {
9508
+ if (typeof content === "string") return 0;
9509
+ return content.filter((part) => isImagePart(part)).length;
9510
+ }
7443
9511
  var MockBuilder = class {
7444
9512
  matchers = [];
7445
9513
  response = {};
@@ -7502,9 +9570,9 @@ var MockBuilder = class {
7502
9570
  * @example
7503
9571
  * mockLLM().whenMessageContains('hello')
7504
9572
  */
7505
- whenMessageContains(text) {
9573
+ whenMessageContains(text3) {
7506
9574
  this.matchers.push(
7507
- (ctx) => ctx.messages.some((msg) => msg.content?.toLowerCase().includes(text.toLowerCase()))
9575
+ (ctx) => ctx.messages.some((msg) => extractText(msg.content).toLowerCase().includes(text3.toLowerCase()))
7508
9576
  );
7509
9577
  return this;
7510
9578
  }
@@ -7514,10 +9582,11 @@ var MockBuilder = class {
7514
9582
  * @example
7515
9583
  * mockLLM().whenLastMessageContains('goodbye')
7516
9584
  */
7517
- whenLastMessageContains(text) {
9585
+ whenLastMessageContains(text3) {
7518
9586
  this.matchers.push((ctx) => {
7519
9587
  const lastMsg = ctx.messages[ctx.messages.length - 1];
7520
- return lastMsg?.content?.toLowerCase().includes(text.toLowerCase()) ?? false;
9588
+ if (!lastMsg) return false;
9589
+ return extractText(lastMsg.content).toLowerCase().includes(text3.toLowerCase());
7521
9590
  });
7522
9591
  return this;
7523
9592
  }
@@ -7528,7 +9597,7 @@ var MockBuilder = class {
7528
9597
  * mockLLM().whenMessageMatches(/calculate \d+/)
7529
9598
  */
7530
9599
  whenMessageMatches(regex) {
7531
- this.matchers.push((ctx) => ctx.messages.some((msg) => regex.test(msg.content ?? "")));
9600
+ this.matchers.push((ctx) => ctx.messages.some((msg) => regex.test(extractText(msg.content))));
7532
9601
  return this;
7533
9602
  }
7534
9603
  /**
@@ -7537,10 +9606,10 @@ var MockBuilder = class {
7537
9606
  * @example
7538
9607
  * mockLLM().whenRoleContains('system', 'You are a helpful assistant')
7539
9608
  */
7540
- whenRoleContains(role, text) {
9609
+ whenRoleContains(role, text3) {
7541
9610
  this.matchers.push(
7542
9611
  (ctx) => ctx.messages.some(
7543
- (msg) => msg.role === role && msg.content?.toLowerCase().includes(text.toLowerCase())
9612
+ (msg) => msg.role === role && extractText(msg.content).toLowerCase().includes(text3.toLowerCase())
7544
9613
  )
7545
9614
  );
7546
9615
  return this;
@@ -7568,6 +9637,43 @@ var MockBuilder = class {
7568
9637
  this.matchers.push(matcher);
7569
9638
  return this;
7570
9639
  }
9640
+ // ==========================================================================
9641
+ // Multimodal Matchers
9642
+ // ==========================================================================
9643
+ /**
9644
+ * Match when any message contains an image.
9645
+ *
9646
+ * @example
9647
+ * mockLLM().whenMessageHasImage().returns("I see an image of a sunset.")
9648
+ */
9649
+ whenMessageHasImage() {
9650
+ this.matchers.push((ctx) => ctx.messages.some((msg) => hasImageContent(msg.content)));
9651
+ return this;
9652
+ }
9653
+ /**
9654
+ * Match when any message contains audio.
9655
+ *
9656
+ * @example
9657
+ * mockLLM().whenMessageHasAudio().returns("I hear music playing.")
9658
+ */
9659
+ whenMessageHasAudio() {
9660
+ this.matchers.push((ctx) => ctx.messages.some((msg) => hasAudioContent(msg.content)));
9661
+ return this;
9662
+ }
9663
+ /**
9664
+ * Match based on the number of images in the last message.
9665
+ *
9666
+ * @example
9667
+ * mockLLM().whenImageCount((n) => n >= 2).returns("Comparing multiple images...")
9668
+ */
9669
+ whenImageCount(predicate) {
9670
+ this.matchers.push((ctx) => {
9671
+ const lastMsg = ctx.messages[ctx.messages.length - 1];
9672
+ if (!lastMsg) return false;
9673
+ return predicate(countImages(lastMsg.content));
9674
+ });
9675
+ return this;
9676
+ }
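
> Editor's note: the three multimodal matchers above compose with the existing `returns()`/`register()` chain. A sketch mirroring the @example lines, branching a test on image and audio content:

```typescript
// One canned answer whenever any message carries an image.
mockLLM().whenMessageHasImage().returns("I see an image of a sunset.").register();

// A different answer once the last message carries two or more images.
mockLLM().whenImageCount((n) => n >= 2).returns("Comparing multiple images...").register();

// And one for audio input.
mockLLM().whenMessageHasAudio().returns("I hear music playing.").register();
```
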
7571
9677
  /**
7572
9678
  * Set the text response to return.
7573
9679
  * Can be a static string or a function that returns a string dynamically.
@@ -7577,17 +9683,17 @@ var MockBuilder = class {
7577
9683
  * mockLLM().returns(() => `Response at ${Date.now()}`)
7578
9684
  * mockLLM().returns((ctx) => `You said: ${ctx.messages[0]?.content}`)
7579
9685
  */
7580
- returns(text) {
7581
- if (typeof text === "function") {
9686
+ returns(text3) {
9687
+ if (typeof text3 === "function") {
7582
9688
  this.response = async (ctx) => {
7583
- const resolvedText = await Promise.resolve().then(() => text(ctx));
9689
+ const resolvedText = await Promise.resolve().then(() => text3(ctx));
7584
9690
  return { text: resolvedText };
7585
9691
  };
7586
9692
  } else {
7587
9693
  if (typeof this.response === "function") {
7588
9694
  throw new Error("Cannot use returns() after withResponse() with a function");
7589
9695
  }
7590
- this.response.text = text;
9696
+ this.response.text = text3;
7591
9697
  }
7592
9698
  return this;
7593
9699
  }
@@ -7624,6 +9730,112 @@ var MockBuilder = class {
7624
9730
  this.response.gadgetCalls.push({ gadgetName, parameters });
7625
9731
  return this;
7626
9732
  }
9733
+ // ==========================================================================
9734
+ // Multimodal Response Helpers
9735
+ // ==========================================================================
9736
+ /**
9737
+ * Return a single image in the response.
9738
+ * Useful for mocking image generation endpoints.
9739
+ *
9740
+ * @param data - Image data (base64 string or Buffer)
9741
+ * @param mimeType - MIME type (auto-detected if Buffer provided without type)
9742
+ *
9743
+ * @example
9744
+ * mockLLM()
9745
+ * .forModel('dall-e-3')
9746
+ * .returnsImage(pngBuffer)
9747
+ * .register();
9748
+ */
9749
+ returnsImage(data, mimeType) {
9750
+ if (typeof this.response === "function") {
9751
+ throw new Error("Cannot use returnsImage() after withResponse() with a function");
9752
+ }
9753
+ let imageData;
9754
+ let imageMime;
9755
+ if (typeof data === "string") {
9756
+ imageData = data;
9757
+ if (!mimeType) {
9758
+ throw new Error("MIME type is required when providing base64 string data");
9759
+ }
9760
+ imageMime = mimeType;
9761
+ } else {
9762
+ imageData = toBase64(data);
9763
+ const detected = mimeType ?? detectImageMimeType(data);
9764
+ if (!detected) {
9765
+ throw new Error(
9766
+ "Could not detect image MIME type. Please provide the mimeType parameter explicitly."
9767
+ );
9768
+ }
9769
+ imageMime = detected;
9770
+ }
9771
+ if (!this.response.images) {
9772
+ this.response.images = [];
9773
+ }
9774
+ this.response.images.push({ data: imageData, mimeType: imageMime });
9775
+ return this;
9776
+ }
9777
+ /**
9778
+ * Return multiple images in the response.
9779
+ *
9780
+ * @example
9781
+ * mockLLM()
9782
+ * .forModel('dall-e-3')
9783
+ * .returnsImages([
9784
+ * { data: pngBuffer1 },
9785
+ * { data: pngBuffer2 },
9786
+ * ])
9787
+ * .register();
9788
+ */
9789
+ returnsImages(images) {
9790
+ for (const img of images) {
9791
+ this.returnsImage(img.data, img.mimeType);
9792
+ if (img.revisedPrompt && this.response && typeof this.response !== "function") {
9793
+ const lastImage = this.response.images?.[this.response.images.length - 1];
9794
+ if (lastImage) {
9795
+ lastImage.revisedPrompt = img.revisedPrompt;
9796
+ }
9797
+ }
9798
+ }
9799
+ return this;
9800
+ }
9801
+ /**
9802
+ * Return audio data in the response.
9803
+ * Useful for mocking speech synthesis endpoints.
9804
+ *
9805
+ * @param data - Audio data (base64 string or Buffer)
9806
+ * @param mimeType - MIME type (auto-detected if Buffer provided without type)
9807
+ *
9808
+ * @example
9809
+ * mockLLM()
9810
+ * .forModel('tts-1')
9811
+ * .returnsAudio(mp3Buffer)
9812
+ * .register();
9813
+ */
9814
+ returnsAudio(data, mimeType) {
9815
+ if (typeof this.response === "function") {
9816
+ throw new Error("Cannot use returnsAudio() after withResponse() with a function");
9817
+ }
9818
+ let audioData;
9819
+ let audioMime;
9820
+ if (typeof data === "string") {
9821
+ audioData = data;
9822
+ if (!mimeType) {
9823
+ throw new Error("MIME type is required when providing base64 string data");
9824
+ }
9825
+ audioMime = mimeType;
9826
+ } else {
9827
+ audioData = toBase64(data);
9828
+ const detected = mimeType ?? detectAudioMimeType(data);
9829
+ if (!detected) {
9830
+ throw new Error(
9831
+ "Could not detect audio MIME type. Please provide the mimeType parameter explicitly."
9832
+ );
9833
+ }
9834
+ audioMime = detected;
9835
+ }
9836
+ this.response.audio = { data: audioData, mimeType: audioMime };
9837
+ return this;
9838
+ }
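
> Editor's note: the response helpers above attach image/audio payloads to a mock response; Buffers get their MIME type sniffed from magic bytes, while raw base64 strings must name theirs or the builder throws. A sketch, assuming the same `mockLLM` entry as earlier and placeholder fixture files:

```typescript
import { readFile } from "node:fs/promises";

const png = await readFile("fixtures/sunset.png");
const mp3Base64 = (await readFile("fixtures/hello.mp3")).toString("base64");

// Buffer: MIME type auto-detected (image/png here).
mockLLM().forModel("dall-e-3").returnsImage(png).register();

// Several images at once, with an optional revised prompt per image.
mockLLM()
  .forModel("dall-e-3")
  .whenMessageContains("two variants")
  .returnsImages([
    { data: png, revisedPrompt: "A vivid sunset over the ocean" },
    { data: png.toString("base64"), mimeType: "image/png" },
  ])
  .register();

// Base64 string without a mimeType would throw, so name it explicitly.
mockLLM().forModel("tts-1").returnsAudio(mp3Base64, "audio/mpeg").register();
```
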
7627
9839
  /**
7628
9840
  * Set the complete mock response object.
7629
9841
  * This allows full control over all response properties.
@@ -7954,23 +10166,23 @@ function createTestStream(chunks) {
7954
10166
  }
7955
10167
  }();
7956
10168
  }
7957
- function createTextStream(text, options) {
10169
+ function createTextStream(text3, options) {
7958
10170
  return async function* () {
7959
10171
  if (options?.delayMs) {
7960
10172
  await sleep2(options.delayMs);
7961
10173
  }
7962
- const chunkSize = options?.chunkSize ?? text.length;
10174
+ const chunkSize = options?.chunkSize ?? text3.length;
7963
10175
  const chunks = [];
7964
- for (let i = 0; i < text.length; i += chunkSize) {
7965
- chunks.push(text.slice(i, i + chunkSize));
10176
+ for (let i = 0; i < text3.length; i += chunkSize) {
10177
+ chunks.push(text3.slice(i, i + chunkSize));
7966
10178
  }
7967
10179
  for (let i = 0; i < chunks.length; i++) {
7968
10180
  const isLast = i === chunks.length - 1;
7969
10181
  const chunk = { text: chunks[i] };
7970
10182
  if (isLast) {
7971
10183
  chunk.finishReason = options?.finishReason ?? "stop";
7972
- const inputTokens = Math.ceil(text.length / 4);
7973
- const outputTokens = Math.ceil(text.length / 4);
10184
+ const inputTokens = Math.ceil(text3.length / 4);
10185
+ const outputTokens = Math.ceil(text3.length / 4);
7974
10186
  chunk.usage = options?.usage ?? {
7975
10187
  inputTokens,
7976
10188
  outputTokens,
@@ -7992,11 +10204,11 @@ async function collectStream(stream2) {
7992
10204
  return chunks;
7993
10205
  }
7994
10206
  async function collectStreamText(stream2) {
7995
- let text = "";
10207
+ let text3 = "";
7996
10208
  for await (const chunk of stream2) {
7997
- text += chunk.text ?? "";
10209
+ text3 += chunk.text ?? "";
7998
10210
  }
7999
- return text;
10211
+ return text3;
8000
10212
  }
8001
10213
  async function getStreamFinalChunk(stream2) {
8002
10214
  let lastChunk;