llmist 2.3.0 → 2.5.0

This diff reflects the publicly available contents of these package versions as released to a supported registry, and is provided for informational purposes only.
@@ -113,375 +113,229 @@ var init_constants = __esm({
113
113
  }
114
114
  });
115
115
 
116
- // src/core/model-shortcuts.ts
117
- function isKnownModelPattern(model) {
118
- const normalized = model.toLowerCase();
119
- if (MODEL_ALIASES[normalized]) {
120
- return true;
121
- }
122
- return KNOWN_MODEL_PATTERNS.some((pattern) => pattern.test(model));
116
+ // src/core/input-content.ts
117
+ function isTextPart(part) {
118
+ return part.type === "text";
123
119
  }
124
- function resolveModel(model, options = {}) {
125
- if (model.includes(":")) {
126
- return model;
127
- }
128
- const normalized = model.toLowerCase();
129
- if (MODEL_ALIASES[normalized]) {
130
- return MODEL_ALIASES[normalized];
131
- }
132
- const modelLower = model.toLowerCase();
133
- if (modelLower.startsWith("gpt")) {
134
- return `openai:${model}`;
120
+ function isImagePart(part) {
121
+ return part.type === "image";
122
+ }
123
+ function isAudioPart(part) {
124
+ return part.type === "audio";
125
+ }
126
+ function text(content) {
127
+ return { type: "text", text: content };
128
+ }
129
+ function imageFromBase64(data, mediaType) {
130
+ return {
131
+ type: "image",
132
+ source: { type: "base64", mediaType, data }
133
+ };
134
+ }
135
+ function imageFromUrl(url) {
136
+ return {
137
+ type: "image",
138
+ source: { type: "url", url }
139
+ };
140
+ }
141
+ function detectImageMimeType(data) {
142
+ const bytes = data instanceof Buffer ? data : Buffer.from(data);
143
+ for (const { bytes: magic, mimeType } of IMAGE_MAGIC_BYTES) {
144
+ if (bytes.length >= magic.length) {
145
+ let matches = true;
146
+ for (let i = 0; i < magic.length; i++) {
147
+ if (bytes[i] !== magic[i]) {
148
+ matches = false;
149
+ break;
150
+ }
151
+ }
152
+ if (matches) {
153
+ if (mimeType === "image/webp") {
154
+ if (bytes.length >= 12) {
155
+ const webpMarker = bytes[8] === 87 && bytes[9] === 69 && bytes[10] === 66 && bytes[11] === 80;
156
+ if (!webpMarker) continue;
157
+ }
158
+ }
159
+ return mimeType;
160
+ }
161
+ }
135
162
  }
136
- if (modelLower.startsWith("claude")) {
137
- return `anthropic:${model}`;
163
+ return null;
164
+ }
165
+ function detectAudioMimeType(data) {
166
+ const bytes = data instanceof Buffer ? data : Buffer.from(data);
167
+ for (const { bytes: magic, mimeType } of AUDIO_MAGIC_BYTES) {
168
+ if (bytes.length >= magic.length) {
169
+ let matches = true;
170
+ for (let i = 0; i < magic.length; i++) {
171
+ if (bytes[i] !== magic[i]) {
172
+ matches = false;
173
+ break;
174
+ }
175
+ }
176
+ if (matches) {
177
+ if (mimeType === "audio/wav") {
178
+ if (bytes.length >= 12) {
179
+ const waveMarker = bytes[8] === 87 && bytes[9] === 65 && bytes[10] === 86 && bytes[11] === 69;
180
+ if (!waveMarker) continue;
181
+ }
182
+ }
183
+ return mimeType;
184
+ }
185
+ }
138
186
  }
139
- if (modelLower.startsWith("gemini")) {
140
- return `gemini:${model}`;
187
+ return null;
188
+ }
189
+ function toBase64(data) {
190
+ if (typeof data === "string") {
191
+ return data;
141
192
  }
142
- if (modelLower.match(/^o\d/)) {
143
- return `openai:${model}`;
193
+ return Buffer.from(data).toString("base64");
194
+ }
195
+ function imageFromBuffer(buffer, mediaType) {
196
+ const detectedType = mediaType ?? detectImageMimeType(buffer);
197
+ if (!detectedType) {
198
+ throw new Error(
199
+ "Could not detect image MIME type. Please provide the mediaType parameter explicitly."
200
+ );
144
201
  }
145
- if (!isKnownModelPattern(model)) {
146
- if (options.strict) {
147
- throw new Error(
148
- `Unknown model '${model}'. Did you mean one of: gpt4, sonnet, haiku, flash? Use explicit provider prefix like 'openai:${model}' to bypass this check.`
149
- );
150
- }
151
- if (!options.silent) {
152
- console.warn(
153
- `\u26A0\uFE0F Unknown model '${model}', falling back to 'openai:${model}'. This might be a typo. Did you mean: gpt4, gpt5, gpt5-nano, sonnet, haiku, flash? Use { strict: true } to error on unknown models, or { silent: true } to suppress this warning.`
154
- );
202
+ return {
203
+ type: "image",
204
+ source: {
205
+ type: "base64",
206
+ mediaType: detectedType,
207
+ data: toBase64(buffer)
155
208
  }
156
- }
157
- return `openai:${model}`;
209
+ };
158
210
  }
159
- function hasProviderPrefix(model) {
160
- return model.includes(":");
211
+ function audioFromBase64(data, mediaType) {
212
+ return {
213
+ type: "audio",
214
+ source: { type: "base64", mediaType, data }
215
+ };
161
216
  }
162
- function getProvider(model) {
163
- const separatorIndex = model.indexOf(":");
164
- if (separatorIndex === -1) {
165
- return void 0;
217
+ function audioFromBuffer(buffer, mediaType) {
218
+ const detectedType = mediaType ?? detectAudioMimeType(buffer);
219
+ if (!detectedType) {
220
+ throw new Error(
221
+ "Could not detect audio MIME type. Please provide the mediaType parameter explicitly."
222
+ );
166
223
  }
167
- return model.slice(0, separatorIndex);
224
+ return {
225
+ type: "audio",
226
+ source: {
227
+ type: "base64",
228
+ mediaType: detectedType,
229
+ data: toBase64(buffer)
230
+ }
231
+ };
168
232
  }
169
- function getModelId(model) {
170
- const separatorIndex = model.indexOf(":");
171
- if (separatorIndex === -1) {
172
- return model;
173
- }
174
- return model.slice(separatorIndex + 1);
233
+ function isDataUrl(input) {
234
+ return input.startsWith("data:");
175
235
  }
176
- var MODEL_ALIASES, KNOWN_MODEL_PATTERNS;
177
- var init_model_shortcuts = __esm({
178
- "src/core/model-shortcuts.ts"() {
236
+ function parseDataUrl(url) {
237
+ const match = url.match(/^data:([^;]+);base64,(.+)$/);
238
+ if (!match) return null;
239
+ return { mimeType: match[1], data: match[2] };
240
+ }
241
+ var IMAGE_MAGIC_BYTES, AUDIO_MAGIC_BYTES;
242
+ var init_input_content = __esm({
243
+ "src/core/input-content.ts"() {
179
244
  "use strict";
180
- MODEL_ALIASES = {
181
- // OpenAI aliases
182
- gpt4: "openai:gpt-4o",
183
- gpt4o: "openai:gpt-4o",
184
- gpt5: "openai:gpt-5",
185
- "gpt5-mini": "openai:gpt-5-mini",
186
- "gpt5-nano": "openai:gpt-5-nano",
187
- // Anthropic aliases
188
- sonnet: "anthropic:claude-sonnet-4-5",
189
- "claude-sonnet": "anthropic:claude-sonnet-4-5",
190
- haiku: "anthropic:claude-haiku-4-5",
191
- "claude-haiku": "anthropic:claude-haiku-4-5",
192
- opus: "anthropic:claude-opus-4-5",
193
- "claude-opus": "anthropic:claude-opus-4-5",
194
- // Gemini aliases
195
- flash: "gemini:gemini-2.0-flash",
196
- "gemini-flash": "gemini:gemini-2.0-flash",
197
- "gemini-pro": "gemini:gemini-2.5-pro",
198
- pro: "gemini:gemini-2.5-pro"
199
- };
200
- KNOWN_MODEL_PATTERNS = [
201
- /^gpt-?\d/i,
202
- // gpt-4, gpt-3.5, gpt4, etc.
203
- /^claude-?\d/i,
204
- // claude-3, claude-2, etc.
205
- /^gemini-?(\d|pro|flash)/i,
206
- // gemini-2.0, gemini-pro, gemini-flash, etc.
207
- /^o\d/i
208
- // OpenAI o1, o3, etc.
245
+ IMAGE_MAGIC_BYTES = [
246
+ { bytes: [255, 216, 255], mimeType: "image/jpeg" },
247
+ { bytes: [137, 80, 78, 71], mimeType: "image/png" },
248
+ { bytes: [71, 73, 70, 56], mimeType: "image/gif" },
249
+ // WebP starts with RIFF....WEBP
250
+ { bytes: [82, 73, 70, 70], mimeType: "image/webp" }
251
+ ];
252
+ AUDIO_MAGIC_BYTES = [
253
+ // MP3 frame sync
254
+ { bytes: [255, 251], mimeType: "audio/mp3" },
255
+ { bytes: [255, 250], mimeType: "audio/mp3" },
256
+ // ID3 tag (MP3)
257
+ { bytes: [73, 68, 51], mimeType: "audio/mp3" },
258
+ // OGG
259
+ { bytes: [79, 103, 103, 83], mimeType: "audio/ogg" },
260
+ // WAV (RIFF)
261
+ { bytes: [82, 73, 70, 70], mimeType: "audio/wav" },
262
+ // WebM
263
+ { bytes: [26, 69, 223, 163], mimeType: "audio/webm" }
209
264
  ];
210
265
  }
211
266
  });
212
267
 
213
- // src/gadgets/schema-validator.ts
214
- import * as z from "zod";
215
- function validateGadgetSchema(schema, gadgetName) {
216
- let jsonSchema;
217
- try {
218
- jsonSchema = z.toJSONSchema(schema, { target: "draft-7" });
219
- } catch (error) {
220
- const errorMessage = error instanceof Error ? error.message : String(error);
221
- throw new Error(
222
- `Gadget "${gadgetName}" has a schema that cannot be serialized to JSON Schema.
223
- This usually happens with unsupported patterns like:
224
- - z.record() - use z.object({}).passthrough() instead
225
- - Complex transforms or custom refinements
226
- - Circular references
227
-
228
- Original error: ${errorMessage}
229
-
230
- Only use schema patterns that Zod v4's native toJSONSchema() supports.`
231
- );
232
- }
233
- const issues = findUnknownTypes(jsonSchema);
234
- if (issues.length > 0) {
235
- const fieldList = issues.join(", ");
236
- throw new Error(
237
- `Gadget "${gadgetName}" uses z.unknown() which produces incomplete schemas.
238
- Problematic fields: ${fieldList}
239
-
240
- z.unknown() doesn't generate type information in JSON Schema, making it unclear
241
- to the LLM what data structure to provide.
242
-
243
- Suggestions:
244
- - Use z.object({}).passthrough() for flexible objects
245
- - Use z.record(z.string()) for key-value objects with string values
246
- - Define specific structure if possible
247
-
248
- Example fixes:
249
- // \u274C Bad
250
- content: z.unknown()
251
-
252
- // \u2705 Good
253
- content: z.object({}).passthrough() // for flexible objects
254
- content: z.record(z.string()) // for key-value objects
255
- content: z.array(z.string()) // for arrays of strings
256
- `
257
- );
258
- }
268
+ // src/core/prompt-config.ts
269
+ function resolvePromptTemplate(template, defaultValue, context) {
270
+ const resolved = template ?? defaultValue;
271
+ return typeof resolved === "function" ? resolved(context) : resolved;
259
272
  }
260
- function findUnknownTypes(schema, path = []) {
261
- const issues = [];
262
- if (!schema || typeof schema !== "object") {
263
- return issues;
264
- }
265
- if (schema.definitions) {
266
- for (const defSchema of Object.values(schema.definitions)) {
267
- issues.push(...findUnknownTypes(defSchema, []));
268
- }
269
- }
270
- if (schema.properties) {
271
- for (const [propName, propSchema] of Object.entries(schema.properties)) {
272
- const propPath = [...path, propName];
273
- if (hasNoType(propSchema)) {
274
- issues.push(propPath.join(".") || propName);
275
- }
276
- issues.push(...findUnknownTypes(propSchema, propPath));
277
- }
273
+ function resolveRulesTemplate(rules, context) {
274
+ const resolved = rules ?? DEFAULT_PROMPTS.rules;
275
+ if (Array.isArray(resolved)) {
276
+ return resolved;
278
277
  }
279
- if (schema.items) {
280
- const itemPath = [...path, "[]"];
281
- if (hasNoType(schema.items)) {
282
- issues.push(itemPath.join("."));
283
- }
284
- issues.push(...findUnknownTypes(schema.items, itemPath));
278
+ if (typeof resolved === "function") {
279
+ const result = resolved(context);
280
+ return Array.isArray(result) ? result : [result];
285
281
  }
286
- if (schema.anyOf) {
287
- schema.anyOf.forEach((subSchema, index) => {
288
- issues.push(...findUnknownTypes(subSchema, [...path, `anyOf[${index}]`]));
289
- });
282
+ return [resolved];
283
+ }
284
+ function resolveHintTemplate(template, defaultValue, context) {
285
+ const resolved = template ?? defaultValue;
286
+ if (typeof resolved === "function") {
287
+ return resolved(context);
290
288
  }
291
- if (schema.oneOf) {
292
- schema.oneOf.forEach((subSchema, index) => {
293
- issues.push(...findUnknownTypes(subSchema, [...path, `oneOf[${index}]`]));
294
- });
289
+ return resolved.replace(/\{iteration\}/g, String(context.iteration)).replace(/\{maxIterations\}/g, String(context.maxIterations)).replace(/\{remaining\}/g, String(context.remaining));
290
+ }
291
+ var DEFAULT_HINTS, DEFAULT_PROMPTS;
292
+ var init_prompt_config = __esm({
293
+ "src/core/prompt-config.ts"() {
294
+ "use strict";
295
+ DEFAULT_HINTS = {
296
+ parallelGadgetsHint: "Tip: You can call multiple gadgets in a single response for efficiency.",
297
+ iterationProgressHint: "[Iteration {iteration}/{maxIterations}] Plan your actions accordingly."
298
+ };
299
+ DEFAULT_PROMPTS = {
300
+ mainInstruction: [
301
+ "\u26A0\uFE0F CRITICAL: RESPOND ONLY WITH GADGET INVOCATIONS",
302
+ "DO NOT use function calling or tool calling",
303
+ "You must output the exact text markers shown below in plain text.",
304
+ "EACH MARKER MUST START WITH A NEWLINE."
305
+ ].join("\n"),
306
+ criticalUsage: "INVOKE gadgets using the markers - do not describe what you want to do.",
307
+ formatDescription: (ctx) => `Parameters using ${ctx.argPrefix}name markers (value on next line(s), no escaping needed)`,
308
+ rules: () => [
309
+ "Output ONLY plain text with the exact markers - never use function/tool calling",
310
+ "You can invoke multiple gadgets in a single response",
311
+ "Gadgets without dependencies execute immediately (in parallel if multiple)",
312
+ "Use :invocation_id:dep1,dep2 syntax when a gadget needs results from prior gadgets",
313
+ "If any dependency fails, dependent gadgets are automatically skipped"
314
+ ],
315
+ customExamples: null
316
+ };
295
317
  }
296
- if (schema.allOf) {
297
- schema.allOf.forEach((subSchema, index) => {
298
- issues.push(...findUnknownTypes(subSchema, [...path, `allOf[${index}]`]));
299
- });
318
+ });
319
+
320
+ // src/core/messages.ts
321
+ function normalizeContent(content) {
322
+ if (typeof content === "string") {
323
+ return [{ type: "text", text: content }];
300
324
  }
301
- return issues;
325
+ return content;
302
326
  }
303
- function hasNoType(prop) {
304
- if (!prop || typeof prop !== "object") {
305
- return false;
327
+ function extractText(content) {
328
+ if (typeof content === "string") {
329
+ return content;
306
330
  }
307
- const hasType = prop.type !== void 0;
308
- const hasRef = prop.$ref !== void 0;
309
- const hasUnion = prop.anyOf !== void 0 || prop.oneOf !== void 0 || prop.allOf !== void 0;
310
- if (hasType || hasRef || hasUnion) {
311
- return false;
312
- }
313
- const keys = Object.keys(prop);
314
- const metadataKeys = ["description", "title", "default", "examples"];
315
- const hasOnlyMetadata = keys.every((key) => metadataKeys.includes(key));
316
- return hasOnlyMetadata || keys.length === 0;
317
- }
318
- var init_schema_validator = __esm({
319
- "src/gadgets/schema-validator.ts"() {
320
- "use strict";
321
- }
322
- });
323
-
324
- // src/gadgets/registry.ts
325
- var GadgetRegistry;
326
- var init_registry = __esm({
327
- "src/gadgets/registry.ts"() {
328
- "use strict";
329
- init_schema_validator();
330
- GadgetRegistry = class _GadgetRegistry {
331
- gadgets = /* @__PURE__ */ new Map();
332
- /**
333
- * Creates a registry from an array of gadget classes or instances,
334
- * or an object mapping names to gadgets.
335
- *
336
- * @param gadgets - Array of gadgets/classes or object with custom names
337
- * @returns New GadgetRegistry with all gadgets registered
338
- *
339
- * @example
340
- * ```typescript
341
- * // From array of classes
342
- * const registry = GadgetRegistry.from([Calculator, Weather]);
343
- *
344
- * // From array of instances
345
- * const registry = GadgetRegistry.from([new Calculator(), new Weather()]);
346
- *
347
- * // From object with custom names
348
- * const registry = GadgetRegistry.from({
349
- * calc: Calculator,
350
- * weather: new Weather({ apiKey: "..." })
351
- * });
352
- * ```
353
- */
354
- static from(gadgets) {
355
- const registry = new _GadgetRegistry();
356
- if (Array.isArray(gadgets)) {
357
- registry.registerMany(gadgets);
358
- } else {
359
- for (const [name, gadget] of Object.entries(gadgets)) {
360
- const instance = typeof gadget === "function" ? new gadget() : gadget;
361
- registry.register(name, instance);
362
- }
363
- }
364
- return registry;
365
- }
366
- /**
367
- * Registers multiple gadgets at once from an array.
368
- *
369
- * @param gadgets - Array of gadget instances or classes
370
- * @returns This registry for chaining
371
- *
372
- * @example
373
- * ```typescript
374
- * registry.registerMany([Calculator, Weather, Email]);
375
- * registry.registerMany([new Calculator(), new Weather()]);
376
- * ```
377
- */
378
- registerMany(gadgets) {
379
- for (const gadget of gadgets) {
380
- const instance = typeof gadget === "function" ? new gadget() : gadget;
381
- this.registerByClass(instance);
382
- }
383
- return this;
384
- }
385
- // Register a gadget by name
386
- register(name, gadget) {
387
- const normalizedName = name.toLowerCase();
388
- if (this.gadgets.has(normalizedName)) {
389
- throw new Error(`Gadget '${name}' is already registered`);
390
- }
391
- if (gadget.parameterSchema) {
392
- validateGadgetSchema(gadget.parameterSchema, name);
393
- }
394
- this.gadgets.set(normalizedName, gadget);
395
- }
396
- // Register a gadget using its name property or class name
397
- registerByClass(gadget) {
398
- const name = gadget.name ?? gadget.constructor.name;
399
- this.register(name, gadget);
400
- }
401
- // Get gadget by name (case-insensitive)
402
- get(name) {
403
- return this.gadgets.get(name.toLowerCase());
404
- }
405
- // Check if gadget exists (case-insensitive)
406
- has(name) {
407
- return this.gadgets.has(name.toLowerCase());
408
- }
409
- // Get all registered gadget names
410
- getNames() {
411
- return Array.from(this.gadgets.keys());
412
- }
413
- // Get all gadgets for instruction generation
414
- getAll() {
415
- return Array.from(this.gadgets.values());
416
- }
417
- // Unregister gadget (useful for testing, case-insensitive)
418
- unregister(name) {
419
- return this.gadgets.delete(name.toLowerCase());
420
- }
421
- // Clear all gadgets (useful for testing)
422
- clear() {
423
- this.gadgets.clear();
424
- }
425
- };
426
- }
427
- });
428
-
429
- // src/core/prompt-config.ts
430
- function resolvePromptTemplate(template, defaultValue, context) {
431
- const resolved = template ?? defaultValue;
432
- return typeof resolved === "function" ? resolved(context) : resolved;
433
- }
434
- function resolveRulesTemplate(rules, context) {
435
- const resolved = rules ?? DEFAULT_PROMPTS.rules;
436
- if (Array.isArray(resolved)) {
437
- return resolved;
438
- }
439
- if (typeof resolved === "function") {
440
- const result = resolved(context);
441
- return Array.isArray(result) ? result : [result];
442
- }
443
- return [resolved];
331
+ return content.filter((part) => part.type === "text").map((part) => part.text).join("");
444
332
  }
445
- function resolveHintTemplate(template, defaultValue, context) {
446
- const resolved = template ?? defaultValue;
447
- if (typeof resolved === "function") {
448
- return resolved(context);
449
- }
450
- return resolved.replace(/\{iteration\}/g, String(context.iteration)).replace(/\{maxIterations\}/g, String(context.maxIterations)).replace(/\{remaining\}/g, String(context.remaining));
451
- }
452
- var DEFAULT_HINTS, DEFAULT_PROMPTS;
453
- var init_prompt_config = __esm({
454
- "src/core/prompt-config.ts"() {
455
- "use strict";
456
- DEFAULT_HINTS = {
457
- parallelGadgetsHint: "Tip: You can call multiple gadgets in a single response for efficiency.",
458
- iterationProgressHint: "[Iteration {iteration}/{maxIterations}] Plan your actions accordingly."
459
- };
460
- DEFAULT_PROMPTS = {
461
- mainInstruction: [
462
- "\u26A0\uFE0F CRITICAL: RESPOND ONLY WITH GADGET INVOCATIONS",
463
- "DO NOT use function calling or tool calling",
464
- "You must output the exact text markers shown below in plain text.",
465
- "EACH MARKER MUST START WITH A NEWLINE."
466
- ].join("\n"),
467
- criticalUsage: "INVOKE gadgets using the markers - do not describe what you want to do.",
468
- formatDescription: (ctx) => `Parameters using ${ctx.argPrefix}name markers (value on next line(s), no escaping needed)`,
469
- rules: () => [
470
- "Output ONLY plain text with the exact markers - never use function/tool calling",
471
- "You can invoke multiple gadgets in a single response",
472
- "For dependent gadgets, invoke the first one and wait for the result"
473
- ],
474
- customExamples: null
475
- };
476
- }
477
- });
478
-
479
- // src/core/messages.ts
480
333
  var LLMMessageBuilder;
481
334
  var init_messages = __esm({
482
335
  "src/core/messages.ts"() {
483
336
  "use strict";
484
337
  init_constants();
338
+ init_input_content();
485
339
  init_prompt_config();
486
340
  LLMMessageBuilder = class {
487
341
  messages = [];
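
The new src/core/input-content.ts module replaces model-shortcuts at this position in the bundle and provides the content-part constructors the message builder consumes. A minimal sketch of how they combine, assuming these helpers are re-exported from the package root (the diff shows only the bundled module, not the export surface):

```typescript
import { readFile } from "node:fs/promises";
// Assumed import path: the diff defines these in src/core/input-content.ts.
import { text, imageFromBuffer, imageFromUrl, audioFromBase64 } from "llmist";

const photo = await readFile("photo.png");
const clip = (await readFile("note.mp3")).toString("base64");

// imageFromBuffer sniffs magic bytes (JPEG/PNG/GIF/WebP) and throws if the type
// cannot be detected and no mediaType was given explicitly.
const parts = [
  text("Describe this picture and transcribe the attached audio."),
  imageFromBuffer(photo),                      // -> { type: "image", source: { type: "base64", ... } }
  imageFromUrl("https://example.com/cat.jpg"), // URL source, passed through as-is
  audioFromBase64(clip, "audio/mp3"),          // explicit mediaType, no sniffing
];
```

detectImageMimeType and detectAudioMimeType back the buffer variants, so an explicit mediaType is only needed for formats outside the magic-byte tables above.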
@@ -583,6 +437,10 @@ CRITICAL: ${criticalUsage}
583
437
  parts.push(`
584
438
  1. Start marker: ${this.startPrefix}gadget_name`);
585
439
  parts.push(`
440
+ With ID: ${this.startPrefix}gadget_name:my_id`);
441
+ parts.push(`
442
+ With dependencies: ${this.startPrefix}gadget_name:my_id:dep1,dep2`);
443
+ parts.push(`
586
444
  2. ${formatDescription}`);
587
445
  parts.push(`
588
446
  3. End marker: ${this.endPrefix}`);
@@ -632,6 +490,25 @@ ${this.endPrefix}`;
632
490
  EXAMPLE (Multiple Gadgets):
633
491
 
634
492
  ${multipleExample}`);
493
+ const dependencyExample = `${this.startPrefix}fetch_data:fetch_1
494
+ ${this.argPrefix}url
495
+ https://api.example.com/users
496
+ ${this.endPrefix}
497
+ ${this.startPrefix}fetch_data:fetch_2
498
+ ${this.argPrefix}url
499
+ https://api.example.com/orders
500
+ ${this.endPrefix}
501
+ ${this.startPrefix}merge_data:merge_1:fetch_1,fetch_2
502
+ ${this.argPrefix}format
503
+ json
504
+ ${this.endPrefix}`;
505
+ parts.push(`
506
+
507
+ EXAMPLE (With Dependencies):
508
+ merge_1 waits for fetch_1 AND fetch_2 to complete.
509
+ If either fails, merge_1 is automatically skipped.
510
+
511
+ ${dependencyExample}`);
635
512
  parts.push(`
636
513
 
637
514
  BLOCK FORMAT SYNTAX:
@@ -650,89 +527,519 @@ class Calculator {
650
527
  }
651
528
  }
652
529
 
653
- BLOCK FORMAT RULES:
654
- - Each parameter starts with ${this.argPrefix}parameterName on its own line
655
- - The value starts on the NEXT line after the marker
656
- - Value ends when the next ${this.argPrefix} or ${this.endPrefix} appears
657
- - NO escaping needed - write values exactly as they should appear
658
- - Perfect for code, JSON, markdown, or any content with special characters
659
-
660
- NESTED OBJECTS (use / separator):
661
- ${this.argPrefix}config/timeout
662
- 30
663
- ${this.argPrefix}config/retries
664
- 3
665
- Produces: { "config": { "timeout": "30", "retries": "3" } }
666
-
667
- ARRAYS (use numeric indices):
668
- ${this.argPrefix}items/0
669
- first
670
- ${this.argPrefix}items/1
671
- second
672
- Produces: { "items": ["first", "second"] }`);
673
- return parts.join("");
674
- }
675
- buildRulesSection(context) {
676
- const parts = [];
677
- parts.push("\n\nRULES:");
678
- const rules = resolveRulesTemplate(this.promptConfig.rules, context);
679
- for (const rule of rules) {
680
- parts.push(`
681
- - ${rule}`);
530
+ BLOCK FORMAT RULES:
531
+ - Each parameter starts with ${this.argPrefix}parameterName on its own line
532
+ - The value starts on the NEXT line after the marker
533
+ - Value ends when the next ${this.argPrefix} or ${this.endPrefix} appears
534
+ - NO escaping needed - write values exactly as they should appear
535
+ - Perfect for code, JSON, markdown, or any content with special characters
536
+
537
+ NESTED OBJECTS (use / separator):
538
+ ${this.argPrefix}config/timeout
539
+ 30
540
+ ${this.argPrefix}config/retries
541
+ 3
542
+ Produces: { "config": { "timeout": "30", "retries": "3" } }
543
+
544
+ ARRAYS (use numeric indices):
545
+ ${this.argPrefix}items/0
546
+ first
547
+ ${this.argPrefix}items/1
548
+ second
549
+ Produces: { "items": ["first", "second"] }`);
550
+ return parts.join("");
551
+ }
552
+ buildRulesSection(context) {
553
+ const parts = [];
554
+ parts.push("\n\nRULES:");
555
+ const rules = resolveRulesTemplate(this.promptConfig.rules, context);
556
+ for (const rule of rules) {
557
+ parts.push(`
558
+ - ${rule}`);
559
+ }
560
+ return parts.join("");
561
+ }
562
+ /**
563
+ * Add a user message.
564
+ * Content can be a string (text only) or an array of content parts (multimodal).
565
+ *
566
+ * @param content - Message content
567
+ * @param metadata - Optional metadata
568
+ *
569
+ * @example
570
+ * ```typescript
571
+ * // Text only
572
+ * builder.addUser("Hello!");
573
+ *
574
+ * // Multimodal
575
+ * builder.addUser([
576
+ * text("What's in this image?"),
577
+ * imageFromBuffer(imageData),
578
+ * ]);
579
+ * ```
580
+ */
581
+ addUser(content, metadata) {
582
+ this.messages.push({ role: "user", content, metadata });
583
+ return this;
584
+ }
585
+ addAssistant(content, metadata) {
586
+ this.messages.push({ role: "assistant", content, metadata });
587
+ return this;
588
+ }
589
+ /**
590
+ * Add a user message with an image attachment.
591
+ *
592
+ * @param textContent - Text prompt
593
+ * @param imageData - Image data (Buffer, Uint8Array, or base64 string)
594
+ * @param mimeType - Optional MIME type (auto-detected if not provided)
595
+ *
596
+ * @example
597
+ * ```typescript
598
+ * builder.addUserWithImage(
599
+ * "What's in this image?",
600
+ * await fs.readFile("photo.jpg"),
601
+ * "image/jpeg" // Optional - auto-detected
602
+ * );
603
+ * ```
604
+ */
605
+ addUserWithImage(textContent, imageData, mimeType) {
606
+ const imageBuffer = typeof imageData === "string" ? Buffer.from(imageData, "base64") : imageData;
607
+ const detectedMime = mimeType ?? detectImageMimeType(imageBuffer);
608
+ if (!detectedMime) {
609
+ throw new Error(
610
+ "Could not detect image MIME type. Please provide the mimeType parameter explicitly."
611
+ );
612
+ }
613
+ const content = [
614
+ text(textContent),
615
+ {
616
+ type: "image",
617
+ source: {
618
+ type: "base64",
619
+ mediaType: detectedMime,
620
+ data: toBase64(imageBuffer)
621
+ }
622
+ }
623
+ ];
624
+ this.messages.push({ role: "user", content });
625
+ return this;
626
+ }
627
+ /**
628
+ * Add a user message with an image URL (OpenAI only).
629
+ *
630
+ * @param textContent - Text prompt
631
+ * @param imageUrl - URL to the image
632
+ *
633
+ * @example
634
+ * ```typescript
635
+ * builder.addUserWithImageUrl(
636
+ * "What's in this image?",
637
+ * "https://example.com/image.jpg"
638
+ * );
639
+ * ```
640
+ */
641
+ addUserWithImageUrl(textContent, imageUrl) {
642
+ const content = [text(textContent), imageFromUrl(imageUrl)];
643
+ this.messages.push({ role: "user", content });
644
+ return this;
645
+ }
646
+ /**
647
+ * Add a user message with an audio attachment (Gemini only).
648
+ *
649
+ * @param textContent - Text prompt
650
+ * @param audioData - Audio data (Buffer, Uint8Array, or base64 string)
651
+ * @param mimeType - Optional MIME type (auto-detected if not provided)
652
+ *
653
+ * @example
654
+ * ```typescript
655
+ * builder.addUserWithAudio(
656
+ * "Transcribe this audio",
657
+ * await fs.readFile("recording.mp3"),
658
+ * "audio/mp3" // Optional - auto-detected
659
+ * );
660
+ * ```
661
+ */
662
+ addUserWithAudio(textContent, audioData, mimeType) {
663
+ const audioBuffer = typeof audioData === "string" ? Buffer.from(audioData, "base64") : audioData;
664
+ const content = [text(textContent), audioFromBuffer(audioBuffer, mimeType)];
665
+ this.messages.push({ role: "user", content });
666
+ return this;
667
+ }
668
+ /**
669
+ * Add a user message with multiple content parts.
670
+ * Provides full flexibility for complex multimodal messages.
671
+ *
672
+ * @param parts - Array of content parts
673
+ *
674
+ * @example
675
+ * ```typescript
676
+ * builder.addUserMultimodal([
677
+ * text("Compare these images:"),
678
+ * imageFromBuffer(image1),
679
+ * imageFromBuffer(image2),
680
+ * ]);
681
+ * ```
682
+ */
683
+ addUserMultimodal(parts) {
684
+ this.messages.push({ role: "user", content: parts });
685
+ return this;
686
+ }
687
+ addGadgetCall(gadget, parameters, result) {
688
+ const paramStr = this.formatBlockParameters(parameters, "");
689
+ this.messages.push({
690
+ role: "assistant",
691
+ content: `${this.startPrefix}${gadget}
692
+ ${paramStr}
693
+ ${this.endPrefix}`
694
+ });
695
+ this.messages.push({
696
+ role: "user",
697
+ content: `Result: ${result}`
698
+ });
699
+ return this;
700
+ }
701
+ /**
702
+ * Format parameters as Block format with JSON Pointer paths.
703
+ * Uses the configured argPrefix for consistency with system prompt.
704
+ */
705
+ formatBlockParameters(params, prefix) {
706
+ const lines = [];
707
+ for (const [key, value] of Object.entries(params)) {
708
+ const fullPath = prefix ? `${prefix}/${key}` : key;
709
+ if (Array.isArray(value)) {
710
+ value.forEach((item, index) => {
711
+ const itemPath = `${fullPath}/${index}`;
712
+ if (typeof item === "object" && item !== null) {
713
+ lines.push(this.formatBlockParameters(item, itemPath));
714
+ } else {
715
+ lines.push(`${this.argPrefix}${itemPath}`);
716
+ lines.push(String(item));
717
+ }
718
+ });
719
+ } else if (typeof value === "object" && value !== null) {
720
+ lines.push(this.formatBlockParameters(value, fullPath));
721
+ } else {
722
+ lines.push(`${this.argPrefix}${fullPath}`);
723
+ lines.push(String(value));
724
+ }
725
+ }
726
+ return lines.join("\n");
727
+ }
728
+ build() {
729
+ return [...this.messages];
730
+ }
731
+ };
732
+ }
733
+ });
734
+
735
+ // src/core/model-shortcuts.ts
736
+ function isKnownModelPattern(model) {
737
+ const normalized = model.toLowerCase();
738
+ if (MODEL_ALIASES[normalized]) {
739
+ return true;
740
+ }
741
+ return KNOWN_MODEL_PATTERNS.some((pattern) => pattern.test(model));
742
+ }
743
+ function resolveModel(model, options = {}) {
744
+ if (model.includes(":")) {
745
+ return model;
746
+ }
747
+ const normalized = model.toLowerCase();
748
+ if (MODEL_ALIASES[normalized]) {
749
+ return MODEL_ALIASES[normalized];
750
+ }
751
+ const modelLower = model.toLowerCase();
752
+ if (modelLower.startsWith("gpt")) {
753
+ return `openai:${model}`;
754
+ }
755
+ if (modelLower.startsWith("claude")) {
756
+ return `anthropic:${model}`;
757
+ }
758
+ if (modelLower.startsWith("gemini")) {
759
+ return `gemini:${model}`;
760
+ }
761
+ if (modelLower.match(/^o\d/)) {
762
+ return `openai:${model}`;
763
+ }
764
+ if (!isKnownModelPattern(model)) {
765
+ if (options.strict) {
766
+ throw new Error(
767
+ `Unknown model '${model}'. Did you mean one of: gpt4, sonnet, haiku, flash? Use explicit provider prefix like 'openai:${model}' to bypass this check.`
768
+ );
769
+ }
770
+ if (!options.silent) {
771
+ console.warn(
772
+ `\u26A0\uFE0F Unknown model '${model}', falling back to 'openai:${model}'. This might be a typo. Did you mean: gpt4, gpt5, gpt5-nano, sonnet, haiku, flash? Use { strict: true } to error on unknown models, or { silent: true } to suppress this warning.`
773
+ );
774
+ }
775
+ }
776
+ return `openai:${model}`;
777
+ }
778
+ function hasProviderPrefix(model) {
779
+ return model.includes(":");
780
+ }
781
+ function getProvider(model) {
782
+ const separatorIndex = model.indexOf(":");
783
+ if (separatorIndex === -1) {
784
+ return void 0;
785
+ }
786
+ return model.slice(0, separatorIndex);
787
+ }
788
+ function getModelId(model) {
789
+ const separatorIndex = model.indexOf(":");
790
+ if (separatorIndex === -1) {
791
+ return model;
792
+ }
793
+ return model.slice(separatorIndex + 1);
794
+ }
795
+ var MODEL_ALIASES, KNOWN_MODEL_PATTERNS;
796
+ var init_model_shortcuts = __esm({
797
+ "src/core/model-shortcuts.ts"() {
798
+ "use strict";
799
+ MODEL_ALIASES = {
800
+ // OpenAI aliases
801
+ gpt4: "openai:gpt-4o",
802
+ gpt4o: "openai:gpt-4o",
803
+ gpt5: "openai:gpt-5",
804
+ "gpt5-mini": "openai:gpt-5-mini",
805
+ "gpt5-nano": "openai:gpt-5-nano",
806
+ // Anthropic aliases
807
+ sonnet: "anthropic:claude-sonnet-4-5",
808
+ "claude-sonnet": "anthropic:claude-sonnet-4-5",
809
+ haiku: "anthropic:claude-haiku-4-5",
810
+ "claude-haiku": "anthropic:claude-haiku-4-5",
811
+ opus: "anthropic:claude-opus-4-5",
812
+ "claude-opus": "anthropic:claude-opus-4-5",
813
+ // Gemini aliases
814
+ flash: "gemini:gemini-2.0-flash",
815
+ "gemini-flash": "gemini:gemini-2.0-flash",
816
+ "gemini-pro": "gemini:gemini-2.5-pro",
817
+ pro: "gemini:gemini-2.5-pro"
818
+ };
819
+ KNOWN_MODEL_PATTERNS = [
820
+ /^gpt-?\d/i,
821
+ // gpt-4, gpt-3.5, gpt4, etc.
822
+ /^claude-?\d/i,
823
+ // claude-3, claude-2, etc.
824
+ /^gemini-?(\d|pro|flash)/i,
825
+ // gemini-2.0, gemini-pro, gemini-flash, etc.
826
+ /^o\d/i
827
+ // OpenAI o1, o3, etc.
828
+ ];
829
+ }
830
+ });
831
+
832
+ // src/gadgets/schema-validator.ts
833
+ import * as z from "zod";
834
+ function validateGadgetSchema(schema, gadgetName) {
835
+ let jsonSchema;
836
+ try {
837
+ jsonSchema = z.toJSONSchema(schema, { target: "draft-7" });
838
+ } catch (error) {
839
+ const errorMessage = error instanceof Error ? error.message : String(error);
840
+ throw new Error(
841
+ `Gadget "${gadgetName}" has a schema that cannot be serialized to JSON Schema.
842
+ This usually happens with unsupported patterns like:
843
+ - z.record() - use z.object({}).passthrough() instead
844
+ - Complex transforms or custom refinements
845
+ - Circular references
846
+
847
+ Original error: ${errorMessage}
848
+
849
+ Only use schema patterns that Zod v4's native toJSONSchema() supports.`
850
+ );
851
+ }
852
+ const issues = findUnknownTypes(jsonSchema);
853
+ if (issues.length > 0) {
854
+ const fieldList = issues.join(", ");
855
+ throw new Error(
856
+ `Gadget "${gadgetName}" uses z.unknown() which produces incomplete schemas.
857
+ Problematic fields: ${fieldList}
858
+
859
+ z.unknown() doesn't generate type information in JSON Schema, making it unclear
860
+ to the LLM what data structure to provide.
861
+
862
+ Suggestions:
863
+ - Use z.object({}).passthrough() for flexible objects
864
+ - Use z.record(z.string()) for key-value objects with string values
865
+ - Define specific structure if possible
866
+
867
+ Example fixes:
868
+ // \u274C Bad
869
+ content: z.unknown()
870
+
871
+ // \u2705 Good
872
+ content: z.object({}).passthrough() // for flexible objects
873
+ content: z.record(z.string()) // for key-value objects
874
+ content: z.array(z.string()) // for arrays of strings
875
+ `
876
+ );
877
+ }
878
+ }
879
+ function findUnknownTypes(schema, path = []) {
880
+ const issues = [];
881
+ if (!schema || typeof schema !== "object") {
882
+ return issues;
883
+ }
884
+ if (schema.definitions) {
885
+ for (const defSchema of Object.values(schema.definitions)) {
886
+ issues.push(...findUnknownTypes(defSchema, []));
887
+ }
888
+ }
889
+ if (schema.properties) {
890
+ for (const [propName, propSchema] of Object.entries(schema.properties)) {
891
+ const propPath = [...path, propName];
892
+ if (hasNoType(propSchema)) {
893
+ issues.push(propPath.join(".") || propName);
894
+ }
895
+ issues.push(...findUnknownTypes(propSchema, propPath));
896
+ }
897
+ }
898
+ if (schema.items) {
899
+ const itemPath = [...path, "[]"];
900
+ if (hasNoType(schema.items)) {
901
+ issues.push(itemPath.join("."));
902
+ }
903
+ issues.push(...findUnknownTypes(schema.items, itemPath));
904
+ }
905
+ if (schema.anyOf) {
906
+ schema.anyOf.forEach((subSchema, index) => {
907
+ issues.push(...findUnknownTypes(subSchema, [...path, `anyOf[${index}]`]));
908
+ });
909
+ }
910
+ if (schema.oneOf) {
911
+ schema.oneOf.forEach((subSchema, index) => {
912
+ issues.push(...findUnknownTypes(subSchema, [...path, `oneOf[${index}]`]));
913
+ });
914
+ }
915
+ if (schema.allOf) {
916
+ schema.allOf.forEach((subSchema, index) => {
917
+ issues.push(...findUnknownTypes(subSchema, [...path, `allOf[${index}]`]));
918
+ });
919
+ }
920
+ return issues;
921
+ }
922
+ function hasNoType(prop) {
923
+ if (!prop || typeof prop !== "object") {
924
+ return false;
925
+ }
926
+ const hasType = prop.type !== void 0;
927
+ const hasRef = prop.$ref !== void 0;
928
+ const hasUnion = prop.anyOf !== void 0 || prop.oneOf !== void 0 || prop.allOf !== void 0;
929
+ if (hasType || hasRef || hasUnion) {
930
+ return false;
931
+ }
932
+ const keys = Object.keys(prop);
933
+ const metadataKeys = ["description", "title", "default", "examples"];
934
+ const hasOnlyMetadata = keys.every((key) => metadataKeys.includes(key));
935
+ return hasOnlyMetadata || keys.length === 0;
936
+ }
937
+ var init_schema_validator = __esm({
938
+ "src/gadgets/schema-validator.ts"() {
939
+ "use strict";
940
+ }
941
+ });
942
+
943
+ // src/gadgets/registry.ts
944
+ var GadgetRegistry;
945
+ var init_registry = __esm({
946
+ "src/gadgets/registry.ts"() {
947
+ "use strict";
948
+ init_schema_validator();
949
+ GadgetRegistry = class _GadgetRegistry {
950
+ gadgets = /* @__PURE__ */ new Map();
951
+ /**
952
+ * Creates a registry from an array of gadget classes or instances,
953
+ * or an object mapping names to gadgets.
954
+ *
955
+ * @param gadgets - Array of gadgets/classes or object with custom names
956
+ * @returns New GadgetRegistry with all gadgets registered
957
+ *
958
+ * @example
959
+ * ```typescript
960
+ * // From array of classes
961
+ * const registry = GadgetRegistry.from([Calculator, Weather]);
962
+ *
963
+ * // From array of instances
964
+ * const registry = GadgetRegistry.from([new Calculator(), new Weather()]);
965
+ *
966
+ * // From object with custom names
967
+ * const registry = GadgetRegistry.from({
968
+ * calc: Calculator,
969
+ * weather: new Weather({ apiKey: "..." })
970
+ * });
971
+ * ```
972
+ */
973
+ static from(gadgets) {
974
+ const registry = new _GadgetRegistry();
975
+ if (Array.isArray(gadgets)) {
976
+ registry.registerMany(gadgets);
977
+ } else {
978
+ for (const [name, gadget] of Object.entries(gadgets)) {
979
+ const instance = typeof gadget === "function" ? new gadget() : gadget;
980
+ registry.register(name, instance);
981
+ }
682
982
  }
683
- return parts.join("");
983
+ return registry;
684
984
  }
685
- addUser(content, metadata) {
686
- this.messages.push({ role: "user", content, metadata });
985
+ /**
986
+ * Registers multiple gadgets at once from an array.
987
+ *
988
+ * @param gadgets - Array of gadget instances or classes
989
+ * @returns This registry for chaining
990
+ *
991
+ * @example
992
+ * ```typescript
993
+ * registry.registerMany([Calculator, Weather, Email]);
994
+ * registry.registerMany([new Calculator(), new Weather()]);
995
+ * ```
996
+ */
997
+ registerMany(gadgets) {
998
+ for (const gadget of gadgets) {
999
+ const instance = typeof gadget === "function" ? new gadget() : gadget;
1000
+ this.registerByClass(instance);
1001
+ }
687
1002
  return this;
688
1003
  }
689
- addAssistant(content, metadata) {
690
- this.messages.push({ role: "assistant", content, metadata });
691
- return this;
1004
+ // Register a gadget by name
1005
+ register(name, gadget) {
1006
+ const normalizedName = name.toLowerCase();
1007
+ if (this.gadgets.has(normalizedName)) {
1008
+ throw new Error(`Gadget '${name}' is already registered`);
1009
+ }
1010
+ if (gadget.parameterSchema) {
1011
+ validateGadgetSchema(gadget.parameterSchema, name);
1012
+ }
1013
+ this.gadgets.set(normalizedName, gadget);
692
1014
  }
693
- addGadgetCall(gadget, parameters, result) {
694
- const paramStr = this.formatBlockParameters(parameters, "");
695
- this.messages.push({
696
- role: "assistant",
697
- content: `${this.startPrefix}${gadget}
698
- ${paramStr}
699
- ${this.endPrefix}`
700
- });
701
- this.messages.push({
702
- role: "user",
703
- content: `Result: ${result}`
704
- });
705
- return this;
1015
+ // Register a gadget using its name property or class name
1016
+ registerByClass(gadget) {
1017
+ const name = gadget.name ?? gadget.constructor.name;
1018
+ this.register(name, gadget);
706
1019
  }
707
- /**
708
- * Format parameters as Block format with JSON Pointer paths.
709
- * Uses the configured argPrefix for consistency with system prompt.
710
- */
711
- formatBlockParameters(params, prefix) {
712
- const lines = [];
713
- for (const [key, value] of Object.entries(params)) {
714
- const fullPath = prefix ? `${prefix}/${key}` : key;
715
- if (Array.isArray(value)) {
716
- value.forEach((item, index) => {
717
- const itemPath = `${fullPath}/${index}`;
718
- if (typeof item === "object" && item !== null) {
719
- lines.push(this.formatBlockParameters(item, itemPath));
720
- } else {
721
- lines.push(`${this.argPrefix}${itemPath}`);
722
- lines.push(String(item));
723
- }
724
- });
725
- } else if (typeof value === "object" && value !== null) {
726
- lines.push(this.formatBlockParameters(value, fullPath));
727
- } else {
728
- lines.push(`${this.argPrefix}${fullPath}`);
729
- lines.push(String(value));
730
- }
731
- }
732
- return lines.join("\n");
1020
+ // Get gadget by name (case-insensitive)
1021
+ get(name) {
1022
+ return this.gadgets.get(name.toLowerCase());
733
1023
  }
734
- build() {
735
- return [...this.messages];
1024
+ // Check if gadget exists (case-insensitive)
1025
+ has(name) {
1026
+ return this.gadgets.has(name.toLowerCase());
1027
+ }
1028
+ // Get all registered gadget names
1029
+ getNames() {
1030
+ return Array.from(this.gadgets.keys());
1031
+ }
1032
+ // Get all gadgets for instruction generation
1033
+ getAll() {
1034
+ return Array.from(this.gadgets.values());
1035
+ }
1036
+ // Unregister gadget (useful for testing, case-insensitive)
1037
+ unregister(name) {
1038
+ return this.gadgets.delete(name.toLowerCase());
1039
+ }
1040
+ // Clear all gadgets (useful for testing)
1041
+ clear() {
1042
+ this.gadgets.clear();
736
1043
  }
737
1044
  };
738
1045
  }
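
Further down in this hunk, LLMMessageBuilder gains convenience methods for attaching images and audio, and the model shortcuts move to the end of the bundle unchanged. A usage sketch, assuming LLMMessageBuilder, resolveModel and the content helpers are exported under these names and that the builder's default constructor is used:

```typescript
import { readFile } from "node:fs/promises";
// Assumed exports; the bundle defines these in src/core/messages.ts and src/core/model-shortcuts.ts.
import { LLMMessageBuilder, resolveModel, text, imageFromBuffer } from "llmist";

const builder = new LLMMessageBuilder();

builder
  .addUser("Hello!")                                                        // plain text turn
  .addUserWithImage("What's in this image?", await readFile("photo.jpg"))   // MIME auto-detected
  .addUserWithImageUrl("And this one?", "https://example.com/image.jpg")    // URL form (OpenAI only)
  .addUserWithAudio("Transcribe this", await readFile("recording.mp3"))     // audio form (Gemini only)
  .addUserMultimodal([
    text("Compare these:"),
    imageFromBuffer(await readFile("a.png")),
    imageFromBuffer(await readFile("b.png")),
  ]);

const messages = builder.build();

// Model shortcuts resolve aliases and bare names to provider-prefixed IDs:
resolveModel("sonnet");                          // "anthropic:claude-sonnet-4-5" (alias table)
resolveModel("gpt-4o");                          // "openai:gpt-4o" (known "gpt" prefix)
resolveModel("mistral-large");                   // warns, falls back to "openai:mistral-large"
resolveModel("mistral-large", { strict: true }); // throws instead of warning
```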
@@ -1928,7 +2235,7 @@ var init_conversation_manager = __esm({
1928
2235
  if (msg.role === "user") {
1929
2236
  this.historyBuilder.addUser(msg.content);
1930
2237
  } else if (msg.role === "assistant") {
1931
- this.historyBuilder.addAssistant(msg.content);
2238
+ this.historyBuilder.addAssistant(extractText(msg.content));
1932
2239
  }
1933
2240
  }
1934
2241
  }
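
When replaying history, assistant turns are now flattened with extractText because the history builder stores assistant content as plain strings. The function (defined in src/core/messages.ts earlier in this diff) behaves as sketched below; the direct calls are for illustration only, since the diff does not show whether it is exported:

```typescript
// Strings pass through unchanged; multimodal arrays keep only the text parts.
extractText("plain string");
// -> "plain string"

extractText([
  { type: "text", text: "What is " },
  { type: "image", source: { type: "url", url: "https://example.com/x.png" } },
  { type: "text", text: "this?" },
]);
// -> "What is this?"
```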
@@ -1949,8 +2256,10 @@ async function runWithHandlers(agentGenerator, handlers) {
1949
2256
  if (handlers.onGadgetCall) {
1950
2257
  await handlers.onGadgetCall({
1951
2258
  gadgetName: event.call.gadgetName,
2259
+ invocationId: event.call.invocationId,
1952
2260
  parameters: event.call.parameters,
1953
- parametersRaw: event.call.parametersRaw
2261
+ parametersRaw: event.call.parametersRaw,
2262
+ dependencies: event.call.dependencies
1954
2263
  });
1955
2264
  }
1956
2265
  break;
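
The onGadgetCall handler payload now carries the invocation ID and the declared dependencies, so host code can reconstruct the execution graph. A sketch of a handlers object; the structural type is an assumption, only the field names come from the hunk above:

```typescript
// Field names match the event payload built in runWithHandlers; the real handler
// type is not shown in this diff, so a structural stand-in is used here.
type GadgetCallInfo = {
  gadgetName: string;
  invocationId: string;
  parameters: Record<string, unknown>;
  parametersRaw: string;
  dependencies: string[];
};

const handlers = {
  onGadgetCall: async (call: GadgetCallInfo) => {
    console.log(`${call.invocationId} -> ${call.gadgetName}`, call.parameters);
    if (call.dependencies.length > 0) {
      console.log(`  waits for: ${call.dependencies.join(", ")}`);
    }
  },
};
```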
@@ -2542,7 +2851,27 @@ var init_cost_reporting_client = __esm({
2542
2851
  constructor(client, reportCost) {
2543
2852
  this.client = client;
2544
2853
  this.reportCost = reportCost;
2854
+ this.image = {
2855
+ generate: async (options) => {
2856
+ const result = await this.client.image.generate(options);
2857
+ if (result.cost !== void 0 && result.cost > 0) {
2858
+ this.reportCost(result.cost);
2859
+ }
2860
+ return result;
2861
+ }
2862
+ };
2863
+ this.speech = {
2864
+ generate: async (options) => {
2865
+ const result = await this.client.speech.generate(options);
2866
+ if (result.cost !== void 0 && result.cost > 0) {
2867
+ this.reportCost(result.cost);
2868
+ }
2869
+ return result;
2870
+ }
2871
+ };
2545
2872
  }
2873
+ image;
2874
+ speech;
2546
2875
  /**
2547
2876
  * Access to model registry for cost estimation.
2548
2877
  */
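
Image and speech generation are now routed through the same cost-reporting path as text completions: any defined, positive result.cost is forwarded to the wrapper's callback. A reduced, self-contained sketch of that proxying pattern (the stand-in types are assumptions; the real option and result types are not part of this diff):

```typescript
type GenerateOptions = Record<string, unknown>;
type GenerationResult = { cost?: number };

// Both client.image.generate and client.speech.generate are wrapped the same way.
function wrapGenerate(
  generate: (options: GenerateOptions) => Promise<GenerationResult>,
  reportCost: (cost: number) => void,
) {
  return async (options: GenerateOptions) => {
    const result = await generate(options);
    if (result.cost !== undefined && result.cost > 0) {
      reportCost(result.cost); // only defined, positive costs are reported
    }
    return result;
  };
}
```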
@@ -2807,15 +3136,37 @@ var init_parser = __esm({
2807
3136
  return segment.trim().length > 0 ? segment : void 0;
2808
3137
  }
2809
3138
  /**
2810
- * Parse gadget name, handling both old format (name:invocationId) and new format (just name).
2811
- * For new format, generates a unique invocation ID.
3139
+ * Parse gadget name with optional invocation ID and dependencies.
3140
+ *
3141
+ * Supported formats:
3142
+ * - `GadgetName` - Auto-generate ID, no dependencies
3143
+ * - `GadgetName:my_id` - Explicit ID, no dependencies
3144
+ * - `GadgetName:my_id:dep1,dep2` - Explicit ID with dependencies
3145
+ *
3146
+ * Dependencies must be comma-separated invocation IDs.
2812
3147
  */
2813
3148
  parseGadgetName(gadgetName) {
2814
- if (gadgetName.includes(":")) {
2815
- const parts = gadgetName.split(":");
2816
- return { actualName: parts[0], invocationId: parts[1] };
3149
+ const parts = gadgetName.split(":");
3150
+ if (parts.length === 1) {
3151
+ return {
3152
+ actualName: parts[0],
3153
+ invocationId: `gadget_${++globalInvocationCounter}`,
3154
+ dependencies: []
3155
+ };
3156
+ } else if (parts.length === 2) {
3157
+ return {
3158
+ actualName: parts[0],
3159
+ invocationId: parts[1].trim(),
3160
+ dependencies: []
3161
+ };
3162
+ } else {
3163
+ const deps = parts[2].split(",").map((d) => d.trim()).filter((d) => d.length > 0);
3164
+ return {
3165
+ actualName: parts[0],
3166
+ invocationId: parts[1].trim(),
3167
+ dependencies: deps
3168
+ };
2817
3169
  }
2818
- return { actualName: gadgetName, invocationId: `gadget_${++globalInvocationCounter}` };
2819
3170
  }
2820
3171
  /**
2821
3172
  * Extract the error message from a parse error.
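
The three header formats listed in the doc comment parse as follows. parseGadgetName is a method on the streaming parser; it is shown as bare calls here only to illustrate the input/output mapping, and auto-generated IDs come from a process-wide counter, so the exact number varies:

```typescript
parseGadgetName("fetch_data");
// { actualName: "fetch_data", invocationId: "gadget_1", dependencies: [] }   (ID auto-generated)

parseGadgetName("fetch_data:fetch_1");
// { actualName: "fetch_data", invocationId: "fetch_1", dependencies: [] }

parseGadgetName("merge_data:merge_1:fetch_1,fetch_2");
// { actualName: "merge_data", invocationId: "merge_1", dependencies: ["fetch_1", "fetch_2"] }
```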
@@ -2851,39 +3202,20 @@ var init_parser = __esm({
2851
3202
  const metadataEndIndex = this.buffer.indexOf("\n", metadataStartIndex);
2852
3203
  if (metadataEndIndex === -1) break;
2853
3204
  const gadgetName = this.buffer.substring(metadataStartIndex, metadataEndIndex).trim();
2854
- const { actualName: actualGadgetName, invocationId } = this.parseGadgetName(gadgetName);
3205
+ const { actualName: actualGadgetName, invocationId, dependencies } = this.parseGadgetName(gadgetName);
2855
3206
  const contentStartIndex = metadataEndIndex + 1;
2856
3207
  let partEndIndex;
2857
3208
  let endMarkerLength = 0;
2858
- if (gadgetName.includes(":")) {
2859
- const oldEndMarker = `${this.endPrefix + actualGadgetName}:${invocationId}`;
2860
- partEndIndex = this.buffer.indexOf(oldEndMarker, contentStartIndex);
2861
- if (partEndIndex === -1) break;
2862
- endMarkerLength = oldEndMarker.length;
3209
+ const nextStartPos = this.buffer.indexOf(this.startPrefix, contentStartIndex);
3210
+ const endPos = this.buffer.indexOf(this.endPrefix, contentStartIndex);
3211
+ if (nextStartPos !== -1 && (endPos === -1 || nextStartPos < endPos)) {
3212
+ partEndIndex = nextStartPos;
3213
+ endMarkerLength = 0;
3214
+ } else if (endPos !== -1) {
3215
+ partEndIndex = endPos;
3216
+ endMarkerLength = this.endPrefix.length;
2863
3217
  } else {
2864
- const nextStartPos = this.buffer.indexOf(this.startPrefix, contentStartIndex);
2865
- let validEndPos = -1;
2866
- let searchPos = contentStartIndex;
2867
- while (true) {
2868
- const endPos = this.buffer.indexOf(this.endPrefix, searchPos);
2869
- if (endPos === -1) break;
2870
- const afterEnd = this.buffer.substring(endPos + this.endPrefix.length);
2871
- if (afterEnd.startsWith("\n") || afterEnd.startsWith("\r") || afterEnd.startsWith(this.startPrefix) || afterEnd.length === 0) {
2872
- validEndPos = endPos;
2873
- break;
2874
- } else {
2875
- searchPos = endPos + this.endPrefix.length;
2876
- }
2877
- }
2878
- if (nextStartPos !== -1 && (validEndPos === -1 || nextStartPos < validEndPos)) {
2879
- partEndIndex = nextStartPos;
2880
- endMarkerLength = 0;
2881
- } else if (validEndPos !== -1) {
2882
- partEndIndex = validEndPos;
2883
- endMarkerLength = this.endPrefix.length;
2884
- } else {
2885
- break;
2886
- }
3218
+ break;
2887
3219
  }
2888
3220
  const parametersRaw = this.buffer.substring(contentStartIndex, partEndIndex).trim();
2889
3221
  const { parameters, parseError } = this.parseParameters(parametersRaw);
@@ -2894,7 +3226,8 @@ var init_parser = __esm({
2894
3226
  invocationId,
2895
3227
  parametersRaw,
2896
3228
  parameters,
2897
- parseError
3229
+ parseError,
3230
+ dependencies
2898
3231
  }
2899
3232
  };
2900
3233
  startIndex = partEndIndex + endMarkerLength;
@@ -2917,7 +3250,7 @@ var init_parser = __esm({
2917
3250
  const metadataEndIndex = this.buffer.indexOf("\n", metadataStartIndex);
2918
3251
  if (metadataEndIndex !== -1) {
2919
3252
  const gadgetName = this.buffer.substring(metadataStartIndex, metadataEndIndex).trim();
2920
- const { actualName: actualGadgetName, invocationId } = this.parseGadgetName(gadgetName);
3253
+ const { actualName: actualGadgetName, invocationId, dependencies } = this.parseGadgetName(gadgetName);
2921
3254
  const contentStartIndex = metadataEndIndex + 1;
2922
3255
  const parametersRaw = this.buffer.substring(contentStartIndex).trim();
2923
3256
  const { parameters, parseError } = this.parseParameters(parametersRaw);
@@ -2928,7 +3261,8 @@ var init_parser = __esm({
2928
3261
  invocationId,
2929
3262
  parametersRaw,
2930
3263
  parameters,
2931
- parseError
3264
+ parseError,
3265
+ dependencies
2932
3266
  }
2933
3267
  };
2934
3268
  return;
@@ -3298,6 +3632,13 @@ var init_stream_processor = __esm({
3298
3632
  accumulatedText = "";
3299
3633
  shouldStopExecution = false;
3300
3634
  observerFailureCount = 0;
3635
+ // Dependency tracking for gadget execution DAG
3636
+ /** Gadgets waiting for their dependencies to complete */
3637
+ pendingGadgets = /* @__PURE__ */ new Map();
3638
+ /** Completed gadget results, keyed by invocation ID */
3639
+ completedResults = /* @__PURE__ */ new Map();
3640
+ /** Invocation IDs of gadgets that have failed (error or skipped due to dependency) */
3641
+ failedInvocations = /* @__PURE__ */ new Set();
3301
3642
  constructor(options) {
3302
3643
  this.iteration = options.iteration;
3303
3644
  this.registry = options.registry;
@@ -3398,6 +3739,16 @@ var init_stream_processor = __esm({
3398
3739
  }
3399
3740
  }
3400
3741
  }
3742
+ const finalPendingEvents = await this.processPendingGadgets();
3743
+ outputs.push(...finalPendingEvents);
3744
+ if (finalPendingEvents.some((e) => e.type === "gadget_result")) {
3745
+ didExecuteGadgets = true;
3746
+ }
3747
+ for (const evt of finalPendingEvents) {
3748
+ if (evt.type === "gadget_result" && evt.result.breaksLoop) {
3749
+ shouldBreakLoop = true;
3750
+ }
3751
+ }
3401
3752
  }
3402
3753
  let finalMessage = this.accumulatedText;
3403
3754
  if (this.hooks.interceptors?.interceptAssistantMessage) {
@@ -3449,7 +3800,11 @@ var init_stream_processor = __esm({
3449
3800
  return [{ type: "text", content }];
3450
3801
  }
3451
3802
  /**
3452
- * Process a gadget call through the full lifecycle.
3803
+ * Process a gadget call through the full lifecycle, handling dependencies.
3804
+ *
3805
+ * Gadgets without dependencies (or with all dependencies satisfied) execute immediately.
3806
+ * Gadgets with unsatisfied dependencies are queued for later execution.
3807
+ * After each execution, pending gadgets are checked to see if they can now run.
3453
3808
  */
3454
3809
  async processGadgetCall(call) {
3455
3810
  if (this.shouldStopExecution) {
@@ -3459,7 +3814,54 @@ var init_stream_processor = __esm({
3459
3814
  return [];
3460
3815
  }
3461
3816
  const events = [];
3462
- events.push({ type: "gadget_call", call });
3817
+ events.push({ type: "gadget_call", call });
3818
+ if (call.dependencies.length > 0) {
3819
+ if (call.dependencies.includes(call.invocationId)) {
3820
+ this.logger.warn("Gadget has self-referential dependency (depends on itself)", {
3821
+ gadgetName: call.gadgetName,
3822
+ invocationId: call.invocationId
3823
+ });
3824
+ this.failedInvocations.add(call.invocationId);
3825
+ const skipEvent = {
3826
+ type: "gadget_skipped",
3827
+ gadgetName: call.gadgetName,
3828
+ invocationId: call.invocationId,
3829
+ parameters: call.parameters ?? {},
3830
+ failedDependency: call.invocationId,
3831
+ failedDependencyError: `Gadget "${call.invocationId}" cannot depend on itself (self-referential dependency)`
3832
+ };
3833
+ events.push(skipEvent);
3834
+ return events;
3835
+ }
3836
+ const failedDep = call.dependencies.find((dep) => this.failedInvocations.has(dep));
3837
+ if (failedDep) {
3838
+ const skipEvents = await this.handleFailedDependency(call, failedDep);
3839
+ events.push(...skipEvents);
3840
+ return events;
3841
+ }
3842
+ const unsatisfied = call.dependencies.filter((dep) => !this.completedResults.has(dep));
3843
+ if (unsatisfied.length > 0) {
3844
+ this.logger.debug("Queueing gadget for later - waiting on dependencies", {
3845
+ gadgetName: call.gadgetName,
3846
+ invocationId: call.invocationId,
3847
+ waitingOn: unsatisfied
3848
+ });
3849
+ this.pendingGadgets.set(call.invocationId, call);
3850
+ return events;
3851
+ }
3852
+ }
3853
+ const executeEvents = await this.executeGadgetWithHooks(call);
3854
+ events.push(...executeEvents);
3855
+ const triggeredEvents = await this.processPendingGadgets();
3856
+ events.push(...triggeredEvents);
3857
+ return events;
3858
+ }
3859
+ /**
3860
+ * Execute a gadget through the full hook lifecycle.
3861
+ * This is the core execution logic, extracted from processGadgetCall.
3862
+ */
3863
+ async executeGadgetWithHooks(call) {
3864
+ const events = [];
3463
3865
  if (call.parseError) {
3464
3866
  this.logger.warn("Gadget has parse error", {
3465
3867
  gadgetName: call.gadgetName,
@@ -3590,6 +3992,10 @@ var init_stream_processor = __esm({
3590
3992
  });
3591
3993
  }
3592
3994
  await this.runObserversInParallel(completeObservers);
3995
+ this.completedResults.set(result.invocationId, result);
3996
+ if (result.error) {
3997
+ this.failedInvocations.add(result.invocationId);
3998
+ }
3593
3999
  events.push({ type: "gadget_result", result });
3594
4000
  if (result.error) {
3595
4001
  const errorType = this.determineErrorType(call, result);
@@ -3605,6 +4011,162 @@ var init_stream_processor = __esm({
3605
4011
  }
3606
4012
  return events;
3607
4013
  }
4014
+ /**
4015
+ * Handle a gadget that cannot execute because a dependency failed.
4016
+ * Calls the onDependencySkipped controller to allow customization.
4017
+ */
4018
+ async handleFailedDependency(call, failedDep) {
4019
+ const events = [];
4020
+ const depResult = this.completedResults.get(failedDep);
4021
+ const depError = depResult?.error ?? "Dependency failed";
4022
+ let action = { action: "skip" };
4023
+ if (this.hooks.controllers?.onDependencySkipped) {
4024
+ const context = {
4025
+ iteration: this.iteration,
4026
+ gadgetName: call.gadgetName,
4027
+ invocationId: call.invocationId,
4028
+ parameters: call.parameters ?? {},
4029
+ failedDependency: failedDep,
4030
+ failedDependencyError: depError,
4031
+ logger: this.logger
4032
+ };
4033
+ action = await this.hooks.controllers.onDependencySkipped(context);
4034
+ }
4035
+ if (action.action === "skip") {
4036
+ this.failedInvocations.add(call.invocationId);
4037
+ const skipEvent = {
4038
+ type: "gadget_skipped",
4039
+ gadgetName: call.gadgetName,
4040
+ invocationId: call.invocationId,
4041
+ parameters: call.parameters ?? {},
4042
+ failedDependency: failedDep,
4043
+ failedDependencyError: depError
4044
+ };
4045
+ events.push(skipEvent);
4046
+ if (this.hooks.observers?.onGadgetSkipped) {
4047
+ const observeContext = {
4048
+ iteration: this.iteration,
4049
+ gadgetName: call.gadgetName,
4050
+ invocationId: call.invocationId,
4051
+ parameters: call.parameters ?? {},
4052
+ failedDependency: failedDep,
4053
+ failedDependencyError: depError,
4054
+ logger: this.logger
4055
+ };
4056
+ await this.safeObserve(() => this.hooks.observers.onGadgetSkipped(observeContext));
4057
+ }
4058
+ this.logger.info("Gadget skipped due to failed dependency", {
4059
+ gadgetName: call.gadgetName,
4060
+ invocationId: call.invocationId,
4061
+ failedDependency: failedDep
4062
+ });
4063
+ } else if (action.action === "execute_anyway") {
4064
+ this.logger.info("Executing gadget despite failed dependency (controller override)", {
4065
+ gadgetName: call.gadgetName,
4066
+ invocationId: call.invocationId,
4067
+ failedDependency: failedDep
4068
+ });
4069
+ const executeEvents = await this.executeGadgetWithHooks(call);
4070
+ events.push(...executeEvents);
4071
+ } else if (action.action === "use_fallback") {
4072
+ const fallbackResult = {
4073
+ gadgetName: call.gadgetName,
4074
+ invocationId: call.invocationId,
4075
+ parameters: call.parameters ?? {},
4076
+ result: action.fallbackResult,
4077
+ executionTimeMs: 0
4078
+ };
4079
+ this.completedResults.set(call.invocationId, fallbackResult);
4080
+ events.push({ type: "gadget_result", result: fallbackResult });
4081
+ this.logger.info("Using fallback result for gadget with failed dependency", {
4082
+ gadgetName: call.gadgetName,
4083
+ invocationId: call.invocationId,
4084
+ failedDependency: failedDep
4085
+ });
4086
+ }
4087
+ return events;
4088
+ }
4089
+ /**
4090
+ * Process pending gadgets whose dependencies are now satisfied.
4091
+ * Executes ready gadgets in parallel and continues until no more can be triggered.
4092
+ */
4093
+ async processPendingGadgets() {
4094
+ const events = [];
4095
+ let progress = true;
4096
+ while (progress && this.pendingGadgets.size > 0) {
4097
+ progress = false;
4098
+ const readyToExecute = [];
4099
+ const readyToSkip = [];
4100
+ for (const [invocationId, call] of this.pendingGadgets) {
4101
+ const failedDep = call.dependencies.find((dep) => this.failedInvocations.has(dep));
4102
+ if (failedDep) {
4103
+ readyToSkip.push({ call, failedDep });
4104
+ continue;
4105
+ }
4106
+ const allSatisfied = call.dependencies.every((dep) => this.completedResults.has(dep));
4107
+ if (allSatisfied) {
4108
+ readyToExecute.push(call);
4109
+ }
4110
+ }
4111
+ for (const { call, failedDep } of readyToSkip) {
4112
+ this.pendingGadgets.delete(call.invocationId);
4113
+ const skipEvents = await this.handleFailedDependency(call, failedDep);
4114
+ events.push(...skipEvents);
4115
+ progress = true;
4116
+ }
4117
+ if (readyToExecute.length > 0) {
4118
+ this.logger.debug("Executing ready gadgets in parallel", {
4119
+ count: readyToExecute.length,
4120
+ invocationIds: readyToExecute.map((c) => c.invocationId)
4121
+ });
4122
+ for (const call of readyToExecute) {
4123
+ this.pendingGadgets.delete(call.invocationId);
4124
+ }
4125
+ const executePromises = readyToExecute.map((call) => this.executeGadgetWithHooks(call));
4126
+ const results = await Promise.all(executePromises);
4127
+ for (const executeEvents of results) {
4128
+ events.push(...executeEvents);
4129
+ }
4130
+ progress = true;
4131
+ }
4132
+ }
4133
+ if (this.pendingGadgets.size > 0) {
4134
+ const pendingIds = new Set(this.pendingGadgets.keys());
4135
+ for (const [invocationId, call] of this.pendingGadgets) {
4136
+ const missingDeps = call.dependencies.filter((dep) => !this.completedResults.has(dep));
4137
+ const circularDeps = missingDeps.filter((dep) => pendingIds.has(dep));
4138
+ const trulyMissingDeps = missingDeps.filter((dep) => !pendingIds.has(dep));
4139
+ let errorMessage;
4140
+ let logLevel = "warn";
4141
+ if (circularDeps.length > 0 && trulyMissingDeps.length > 0) {
4142
+ errorMessage = `Dependencies unresolvable: circular=[${circularDeps.join(", ")}], missing=[${trulyMissingDeps.join(", ")}]`;
4143
+ logLevel = "error";
4144
+ } else if (circularDeps.length > 0) {
4145
+ errorMessage = `Circular dependency detected: "${invocationId}" depends on "${circularDeps[0]}" which also depends on "${invocationId}" (directly or indirectly)`;
4146
+ } else {
4147
+ errorMessage = `Dependency "${missingDeps[0]}" was never executed - check that the invocation ID exists and is spelled correctly`;
4148
+ }
4149
+ this.logger[logLevel]("Gadget has unresolvable dependencies", {
4150
+ gadgetName: call.gadgetName,
4151
+ invocationId,
4152
+ circularDependencies: circularDeps,
4153
+ missingDependencies: trulyMissingDeps
4154
+ });
4155
+ this.failedInvocations.add(invocationId);
4156
+ const skipEvent = {
4157
+ type: "gadget_skipped",
4158
+ gadgetName: call.gadgetName,
4159
+ invocationId,
4160
+ parameters: call.parameters ?? {},
4161
+ failedDependency: missingDeps[0],
4162
+ failedDependencyError: errorMessage
4163
+ };
4164
+ events.push(skipEvent);
4165
+ }
4166
+ this.pendingGadgets.clear();
4167
+ }
4168
+ return events;
4169
+ }
3608
4170
  /**
3609
4171
  * Safely execute an observer, catching and logging any errors.
3610
4172
  * Observers are non-critical, so errors are logged but don't crash the system.
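A minimal sketch of how the new dependency-skip hooks added above can be used. The action values ("skip", "execute_anyway", "use_fallback") and the context fields come from handleFailedDependency(); how the hooks object is attached to an agent is an assumption for illustration, not shown in this diff.

```typescript
// Sketch only: hook/context shapes follow handleFailedDependency() above;
// attaching the hooks to an agent is assumed, not shown in this diff.
const hooks = {
  controllers: {
    // Decide what happens when a gadget's dependency has already failed.
    onDependencySkipped: async (ctx) => {
      if (ctx.gadgetName === "fetch_weather") {
        // Substitute a cached value instead of skipping the gadget.
        return { action: "use_fallback", fallbackResult: "unknown (dependency failed)" };
      }
      return { action: "skip" }; // default behaviour
    },
  },
  observers: {
    // Non-critical: errors thrown here are swallowed by safeObserve().
    onGadgetSkipped: async (ctx) => {
      console.warn(`skipped ${ctx.gadgetName}: ${ctx.failedDependencyError}`);
    },
  },
};
```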
@@ -4042,9 +4604,9 @@ var init_agent = __esm({
4042
4604
  if (msg.role === "user") {
4043
4605
  this.conversation.addUserMessage(msg.content);
4044
4606
  } else if (msg.role === "assistant") {
4045
- this.conversation.addAssistantMessage(msg.content);
4607
+ this.conversation.addAssistantMessage(extractText(msg.content));
4046
4608
  } else if (msg.role === "system") {
4047
- this.conversation.addUserMessage(`[System] ${msg.content}`);
4609
+ this.conversation.addUserMessage(`[System] ${extractText(msg.content)}`);
4048
4610
  }
4049
4611
  }
4050
4612
  }
@@ -4264,6 +4826,7 @@ var init_builder = __esm({
4264
4826
  "src/agent/builder.ts"() {
4265
4827
  "use strict";
4266
4828
  init_constants();
4829
+ init_input_content();
4267
4830
  init_model_shortcuts();
4268
4831
  init_registry();
4269
4832
  init_agent();
@@ -4911,13 +5474,17 @@ ${endPrefix}`
4911
5474
  * }
4912
5475
  * ```
4913
5476
  */
4914
- ask(userPrompt) {
5477
+ /**
5478
+ * Build AgentOptions with the given user prompt.
5479
+ * Centralizes options construction for ask(), askWithImage(), and askWithContent().
5480
+ */
5481
+ buildAgentOptions(userPrompt) {
4915
5482
  if (!this.client) {
4916
5483
  const { LLMist: LLMistClass } = (init_client(), __toCommonJS(client_exports));
4917
5484
  this.client = new LLMistClass();
4918
5485
  }
4919
5486
  const registry = GadgetRegistry.from(this.gadgets);
4920
- const options = {
5487
+ return {
4921
5488
  client: this.client,
4922
5489
  model: this.model ?? "openai:gpt-5-nano",
4923
5490
  systemPrompt: this.systemPrompt,
@@ -4943,6 +5510,83 @@ ${endPrefix}`
4943
5510
  compactionConfig: this.compactionConfig,
4944
5511
  signal: this.signal
4945
5512
  };
5513
+ }
5514
+ ask(userPrompt) {
5515
+ const options = this.buildAgentOptions(userPrompt);
5516
+ return new Agent(AGENT_INTERNAL_KEY, options);
5517
+ }
5518
+ /**
5519
+ * Build and create the agent with a multimodal user prompt (text + image).
5520
+ * Returns the Agent instance ready to run.
5521
+ *
5522
+ * @param textPrompt - Text prompt describing what to do with the image
5523
+ * @param imageData - Image data (Buffer, Uint8Array, or base64 string)
5524
+ * @param mimeType - Optional MIME type (auto-detected if not provided)
5525
+ * @returns Configured Agent instance
5526
+ *
5527
+ * @example
5528
+ * ```typescript
5529
+ * const agent = LLMist.createAgent()
5530
+ * .withModel("gpt-4o")
5531
+ * .withSystem("You analyze images")
5532
+ * .askWithImage(
5533
+ * "What's in this image?",
5534
+ * await fs.readFile("photo.jpg")
5535
+ * );
5536
+ *
5537
+ * for await (const event of agent.run()) {
5538
+ * // handle events
5539
+ * }
5540
+ * ```
5541
+ */
5542
+ askWithImage(textPrompt, imageData, mimeType) {
5543
+ const imageBuffer = typeof imageData === "string" ? Buffer.from(imageData, "base64") : imageData;
5544
+ const detectedMime = mimeType ?? detectImageMimeType(imageBuffer);
5545
+ if (!detectedMime) {
5546
+ throw new Error(
5547
+ "Could not detect image MIME type. Please provide the mimeType parameter explicitly."
5548
+ );
5549
+ }
5550
+ const userContent = [
5551
+ text(textPrompt),
5552
+ {
5553
+ type: "image",
5554
+ source: {
5555
+ type: "base64",
5556
+ mediaType: detectedMime,
5557
+ data: toBase64(imageBuffer)
5558
+ }
5559
+ }
5560
+ ];
5561
+ const options = this.buildAgentOptions(userContent);
5562
+ return new Agent(AGENT_INTERNAL_KEY, options);
5563
+ }
5564
+ /**
5565
+ * Build and return an Agent configured with multimodal content.
5566
+ * More flexible than askWithImage - accepts any combination of content parts.
5567
+ *
5568
+ * @param content - Array of content parts (text, images, audio)
5569
+ * @returns A configured Agent ready for execution
5570
+ *
5571
+ * @example
5572
+ * ```typescript
5573
+ * import { text, imageFromBuffer, audioFromBuffer } from "llmist";
5574
+ *
5575
+ * const agent = LLMist.createAgent()
5576
+ * .withModel("gemini:gemini-2.5-flash")
5577
+ * .askWithContent([
5578
+ * text("Describe this image and transcribe the audio:"),
5579
+ * imageFromBuffer(imageData),
5580
+ * audioFromBuffer(audioData),
5581
+ * ]);
5582
+ *
5583
+ * for await (const event of agent.run()) {
5584
+ * // handle events
5585
+ * }
5586
+ * ```
5587
+ */
5588
+ askWithContent(content) {
5589
+ const options = this.buildAgentOptions(content);
4946
5590
  return new Agent(AGENT_INTERNAL_KEY, options);
4947
5591
  }
4948
5592
  /**
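A short usage sketch for the explicit mimeType parameter of askWithImage(): supplying a MIME type skips detectImageMimeType() entirely, which also avoids the error thrown when detection fails. File name and model are illustrative.

```typescript
import { readFile } from "node:fs/promises";

// Passing the MIME type explicitly skips magic-byte detection.
const data = await readFile("chart.png");
const agent = LLMist.createAgent()
  .withModel("gpt-4o")
  .askWithImage("Summarize the trend in this chart", data, "image/png");

for await (const event of agent.run()) {
  // handle events as in the examples above
}
```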
@@ -5418,6 +6062,7 @@ var AnthropicMessagesProvider;
5418
6062
  var init_anthropic = __esm({
5419
6063
  "src/providers/anthropic.ts"() {
5420
6064
  "use strict";
6065
+ init_messages();
5421
6066
  init_anthropic_models();
5422
6067
  init_base_provider();
5423
6068
  init_constants2();
@@ -5430,11 +6075,33 @@ var init_anthropic = __esm({
5430
6075
  getModelSpecs() {
5431
6076
  return ANTHROPIC_MODELS;
5432
6077
  }
6078
+ // =========================================================================
6079
+ // Image Generation (Not Supported)
6080
+ // =========================================================================
6081
+ supportsImageGeneration(_modelId) {
6082
+ return false;
6083
+ }
6084
+ async generateImage() {
6085
+ throw new Error(
6086
+ "Anthropic does not support image generation. Use OpenAI (DALL-E, GPT Image) or Google Gemini (Imagen) instead."
6087
+ );
6088
+ }
6089
+ // =========================================================================
6090
+ // Speech Generation (Not Supported)
6091
+ // =========================================================================
6092
+ supportsSpeechGeneration(_modelId) {
6093
+ return false;
6094
+ }
6095
+ async generateSpeech() {
6096
+ throw new Error(
6097
+ "Anthropic does not support speech generation. Use OpenAI (TTS) or Google Gemini (TTS) instead."
6098
+ );
6099
+ }
5433
6100
  buildRequestPayload(options, descriptor, spec, messages) {
5434
6101
  const systemMessages = messages.filter((message) => message.role === "system");
5435
6102
  const system = systemMessages.length > 0 ? systemMessages.map((m, index) => ({
5436
6103
  type: "text",
5437
- text: m.content,
6104
+ text: extractText(m.content),
5438
6105
  // Add cache_control to the LAST system message block
5439
6106
  ...index === systemMessages.length - 1 ? { cache_control: { type: "ephemeral" } } : {}
5440
6107
  })) : void 0;
@@ -5447,14 +6114,10 @@ var init_anthropic = __esm({
5447
6114
  );
5448
6115
  const conversation = nonSystemMessages.map((message, index) => ({
5449
6116
  role: message.role,
5450
- content: [
5451
- {
5452
- type: "text",
5453
- text: message.content,
5454
- // Add cache_control to the LAST user message
5455
- ...message.role === "user" && index === lastUserIndex ? { cache_control: { type: "ephemeral" } } : {}
5456
- }
5457
- ]
6117
+ content: this.convertToAnthropicContent(
6118
+ message.content,
6119
+ message.role === "user" && index === lastUserIndex
6120
+ )
5458
6121
  }));
5459
6122
  const defaultMaxTokens = spec?.maxOutputTokens ?? ANTHROPIC_DEFAULT_MAX_OUTPUT_TOKENS;
5460
6123
  const payload = {
@@ -5470,6 +6133,52 @@ var init_anthropic = __esm({
5470
6133
  };
5471
6134
  return payload;
5472
6135
  }
6136
+ /**
6137
+ * Convert llmist content to Anthropic's content block format.
6138
+ * Handles text, images (base64 only), and applies cache_control.
6139
+ */
6140
+ convertToAnthropicContent(content, addCacheControl) {
6141
+ const parts = normalizeContent(content);
6142
+ return parts.map((part, index) => {
6143
+ const isLastPart = index === parts.length - 1;
6144
+ const cacheControl = addCacheControl && isLastPart ? { cache_control: { type: "ephemeral" } } : {};
6145
+ if (part.type === "text") {
6146
+ return {
6147
+ type: "text",
6148
+ text: part.text,
6149
+ ...cacheControl
6150
+ };
6151
+ }
6152
+ if (part.type === "image") {
6153
+ return this.convertImagePart(part, cacheControl);
6154
+ }
6155
+ if (part.type === "audio") {
6156
+ throw new Error(
6157
+ "Anthropic does not support audio input. Use Google Gemini for audio processing."
6158
+ );
6159
+ }
6160
+ throw new Error(`Unsupported content type: ${part.type}`);
6161
+ });
6162
+ }
6163
+ /**
6164
+ * Convert an image content part to Anthropic's image block format.
6165
+ */
6166
+ convertImagePart(part, cacheControl) {
6167
+ if (part.source.type === "url") {
6168
+ throw new Error(
6169
+ "Anthropic does not support image URLs. Please provide base64-encoded image data instead."
6170
+ );
6171
+ }
6172
+ return {
6173
+ type: "image",
6174
+ source: {
6175
+ type: "base64",
6176
+ media_type: part.source.mediaType,
6177
+ data: part.source.data
6178
+ },
6179
+ ...cacheControl
6180
+ };
6181
+ }
5473
6182
  async executeStreamRequest(payload, signal) {
5474
6183
  const client = this.client;
5475
6184
  const stream2 = await client.messages.create(payload, signal ? { signal } : void 0);
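For reference, the mapping performed by convertToAnthropicContent() above, with illustrative values: the camelCase mediaType becomes Anthropic's snake_case media_type, and cache_control is attached only to the final part when requested.

```typescript
// Input parts (llmist format) ...
const parts = [
  { type: "text", text: "What is in this picture?" },
  { type: "image", source: { type: "base64", mediaType: "image/png", data: "iVBORw0..." } },
];
// ... become Anthropic content blocks:
const blocks = [
  { type: "text", text: "What is in this picture?" },
  {
    type: "image",
    source: { type: "base64", media_type: "image/png", data: "iVBORw0..." },
    cache_control: { type: "ephemeral" }, // only on the last part, and only when addCacheControl is true
  },
];
```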
@@ -5552,17 +6261,12 @@ var init_anthropic = __esm({
5552
6261
  async countTokens(messages, descriptor, _spec) {
5553
6262
  const client = this.client;
5554
6263
  const systemMessages = messages.filter((message) => message.role === "system");
5555
- const system = systemMessages.length > 0 ? systemMessages.map((m) => m.content).join("\n\n") : void 0;
6264
+ const system = systemMessages.length > 0 ? systemMessages.map((m) => extractText(m.content)).join("\n\n") : void 0;
5556
6265
  const conversation = messages.filter(
5557
6266
  (message) => message.role !== "system"
5558
6267
  ).map((message) => ({
5559
6268
  role: message.role,
5560
- content: [
5561
- {
5562
- type: "text",
5563
- text: message.content
5564
- }
5565
- ]
6269
+ content: this.convertToAnthropicContent(message.content, false)
5566
6270
  }));
5567
6271
  try {
5568
6272
  const response = await client.messages.countTokens({
@@ -5576,14 +6280,201 @@ var init_anthropic = __esm({
5576
6280
  `Token counting failed for ${descriptor.name}, using fallback estimation:`,
5577
6281
  error
5578
6282
  );
5579
- const totalChars = messages.reduce((sum, msg) => sum + (msg.content?.length ?? 0), 0);
5580
- return Math.ceil(totalChars / FALLBACK_CHARS_PER_TOKEN);
6283
+ let totalChars = 0;
6284
+ let imageCount = 0;
6285
+ for (const msg of messages) {
6286
+ const parts = normalizeContent(msg.content);
6287
+ for (const part of parts) {
6288
+ if (part.type === "text") {
6289
+ totalChars += part.text.length;
6290
+ } else if (part.type === "image") {
6291
+ imageCount++;
6292
+ }
6293
+ }
6294
+ }
6295
+ return Math.ceil(totalChars / FALLBACK_CHARS_PER_TOKEN) + imageCount * 1e3;
5581
6296
  }
5582
6297
  }
5583
6298
  };
5584
6299
  }
5585
6300
  });
5586
6301
 
6302
+ // src/providers/gemini-image-models.ts
6303
+ function getGeminiImageModelSpec(modelId) {
6304
+ return geminiImageModels.find((m) => m.modelId === modelId);
6305
+ }
6306
+ function isGeminiImageModel(modelId) {
6307
+ return geminiImageModels.some((m) => m.modelId === modelId);
6308
+ }
6309
+ function calculateGeminiImageCost(modelId, size = "1:1", n = 1) {
6310
+ const spec = getGeminiImageModelSpec(modelId);
6311
+ if (!spec) return void 0;
6312
+ if (spec.pricing.perImage !== void 0) {
6313
+ return spec.pricing.perImage * n;
6314
+ }
6315
+ if (spec.pricing.bySize) {
6316
+ const sizePrice = spec.pricing.bySize[size];
6317
+ if (typeof sizePrice === "number") {
6318
+ return sizePrice * n;
6319
+ }
6320
+ }
6321
+ return void 0;
6322
+ }
6323
+ var IMAGEN4_ASPECT_RATIOS, GEMINI_IMAGE_ASPECT_RATIOS, geminiImageModels;
6324
+ var init_gemini_image_models = __esm({
6325
+ "src/providers/gemini-image-models.ts"() {
6326
+ "use strict";
6327
+ IMAGEN4_ASPECT_RATIOS = ["1:1", "3:4", "4:3", "9:16", "16:9"];
6328
+ GEMINI_IMAGE_ASPECT_RATIOS = ["1:1", "3:4", "4:3", "9:16", "16:9"];
6329
+ geminiImageModels = [
6330
+ // Imagen 4 Family (standalone image generation)
6331
+ {
6332
+ provider: "gemini",
6333
+ modelId: "imagen-4.0-fast-generate-001",
6334
+ displayName: "Imagen 4 Fast",
6335
+ pricing: {
6336
+ perImage: 0.02
6337
+ },
6338
+ supportedSizes: [...IMAGEN4_ASPECT_RATIOS],
6339
+ maxImages: 4,
6340
+ defaultSize: "1:1",
6341
+ features: {
6342
+ textRendering: true
6343
+ }
6344
+ },
6345
+ {
6346
+ provider: "gemini",
6347
+ modelId: "imagen-4.0-generate-001",
6348
+ displayName: "Imagen 4",
6349
+ pricing: {
6350
+ perImage: 0.04
6351
+ },
6352
+ supportedSizes: [...IMAGEN4_ASPECT_RATIOS],
6353
+ maxImages: 4,
6354
+ defaultSize: "1:1",
6355
+ features: {
6356
+ textRendering: true
6357
+ }
6358
+ },
6359
+ {
6360
+ provider: "gemini",
6361
+ modelId: "imagen-4.0-ultra-generate-001",
6362
+ displayName: "Imagen 4 Ultra",
6363
+ pricing: {
6364
+ perImage: 0.06
6365
+ },
6366
+ supportedSizes: [...IMAGEN4_ASPECT_RATIOS],
6367
+ maxImages: 4,
6368
+ defaultSize: "1:1",
6369
+ features: {
6370
+ textRendering: true
6371
+ }
6372
+ },
6373
+ // Preview versions
6374
+ {
6375
+ provider: "gemini",
6376
+ modelId: "imagen-4.0-generate-preview-06-06",
6377
+ displayName: "Imagen 4 (Preview)",
6378
+ pricing: {
6379
+ perImage: 0.04
6380
+ },
6381
+ supportedSizes: [...IMAGEN4_ASPECT_RATIOS],
6382
+ maxImages: 4,
6383
+ defaultSize: "1:1",
6384
+ features: {
6385
+ textRendering: true
6386
+ }
6387
+ },
6388
+ {
6389
+ provider: "gemini",
6390
+ modelId: "imagen-4.0-ultra-generate-preview-06-06",
6391
+ displayName: "Imagen 4 Ultra (Preview)",
6392
+ pricing: {
6393
+ perImage: 0.06
6394
+ },
6395
+ supportedSizes: [...IMAGEN4_ASPECT_RATIOS],
6396
+ maxImages: 4,
6397
+ defaultSize: "1:1",
6398
+ features: {
6399
+ textRendering: true
6400
+ }
6401
+ },
6402
+ // Gemini Native Image Generation (multimodal models)
6403
+ {
6404
+ provider: "gemini",
6405
+ modelId: "gemini-2.5-flash-image",
6406
+ displayName: "Gemini 2.5 Flash Image",
6407
+ pricing: {
6408
+ perImage: 0.039
6409
+ },
6410
+ supportedSizes: [...GEMINI_IMAGE_ASPECT_RATIOS],
6411
+ maxImages: 1,
6412
+ defaultSize: "1:1",
6413
+ features: {
6414
+ conversational: true,
6415
+ textRendering: true
6416
+ }
6417
+ },
6418
+ {
6419
+ provider: "gemini",
6420
+ modelId: "gemini-2.5-flash-image-preview",
6421
+ displayName: "Gemini 2.5 Flash Image (Preview)",
6422
+ pricing: {
6423
+ perImage: 0.039
6424
+ },
6425
+ supportedSizes: [...GEMINI_IMAGE_ASPECT_RATIOS],
6426
+ maxImages: 1,
6427
+ defaultSize: "1:1",
6428
+ features: {
6429
+ conversational: true,
6430
+ textRendering: true
6431
+ }
6432
+ },
6433
+ {
6434
+ provider: "gemini",
6435
+ modelId: "gemini-3-pro-image-preview",
6436
+ displayName: "Gemini 3 Pro Image (Preview)",
6437
+ pricing: {
6438
+ // Token-based: ~$0.134 per 1K/2K image, $0.24 per 4K
6439
+ // Using 2K as default
6440
+ bySize: {
6441
+ "1K": 0.134,
6442
+ "2K": 0.134,
6443
+ "4K": 0.24
6444
+ }
6445
+ },
6446
+ supportedSizes: ["1K", "2K", "4K"],
6447
+ maxImages: 1,
6448
+ defaultSize: "2K",
6449
+ features: {
6450
+ conversational: true,
6451
+ textRendering: true
6452
+ }
6453
+ },
6454
+ // Alias: nano-banana-pro-preview is gemini-3-pro-image-preview
6455
+ {
6456
+ provider: "gemini",
6457
+ modelId: "nano-banana-pro-preview",
6458
+ displayName: "Nano Banana Pro (Gemini 3 Pro Image)",
6459
+ pricing: {
6460
+ bySize: {
6461
+ "1K": 0.134,
6462
+ "2K": 0.134,
6463
+ "4K": 0.24
6464
+ }
6465
+ },
6466
+ supportedSizes: ["1K", "2K", "4K"],
6467
+ maxImages: 1,
6468
+ defaultSize: "2K",
6469
+ features: {
6470
+ conversational: true,
6471
+ textRendering: true
6472
+ }
6473
+ }
6474
+ ];
6475
+ }
6476
+ });
6477
+
5587
6478
  // src/providers/gemini-models.ts
5588
6479
  var GEMINI_MODELS;
5589
6480
  var init_gemini_models = __esm({
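Worked examples against the gemini-image-models pricing above (values follow the specs as listed; the calls are illustrative):

```typescript
calculateGeminiImageCost("imagen-4.0-generate-001", "1:1", 2);   // 0.04 * 2  = 0.08 USD
calculateGeminiImageCost("gemini-3-pro-image-preview", "4K");    // bySize["4K"] = 0.24 USD
calculateGeminiImageCost("imagen-4.0-fast-generate-001");        // defaults "1:1", n=1 -> 0.02 USD
calculateGeminiImageCost("not-an-image-model");                  // undefined (no spec found)
```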
@@ -5741,16 +6632,155 @@ var init_gemini_models = __esm({
5741
6632
  output: 0.3
5742
6633
  // No context caching available for 2.0-flash-lite
5743
6634
  },
5744
- knowledgeCutoff: "2024-08",
6635
+ knowledgeCutoff: "2024-08",
6636
+ features: {
6637
+ streaming: true,
6638
+ functionCalling: true,
6639
+ vision: true,
6640
+ structuredOutputs: true
6641
+ },
6642
+ metadata: {
6643
+ family: "Gemini 2.0",
6644
+ notes: "Smallest and most cost effective 2.0 model for at scale usage."
6645
+ }
6646
+ }
6647
+ ];
6648
+ }
6649
+ });
6650
+
6651
+ // src/providers/gemini-speech-models.ts
6652
+ function getGeminiSpeechModelSpec(modelId) {
6653
+ return geminiSpeechModels.find((m) => m.modelId === modelId);
6654
+ }
6655
+ function isGeminiSpeechModel(modelId) {
6656
+ return geminiSpeechModels.some((m) => m.modelId === modelId);
6657
+ }
6658
+ function calculateGeminiSpeechCost(modelId, characterCount, estimatedMinutes) {
6659
+ const spec = getGeminiSpeechModelSpec(modelId);
6660
+ if (!spec) return void 0;
6661
+ if (spec.pricing.perMinute !== void 0) {
6662
+ if (estimatedMinutes !== void 0) {
6663
+ return estimatedMinutes * spec.pricing.perMinute;
6664
+ }
6665
+ const approxMinutes = characterCount / 750;
6666
+ return approxMinutes * spec.pricing.perMinute;
6667
+ }
6668
+ return void 0;
6669
+ }
6670
+ var GEMINI_TTS_VOICES, GEMINI_TTS_FORMATS, geminiSpeechModels;
6671
+ var init_gemini_speech_models = __esm({
6672
+ "src/providers/gemini-speech-models.ts"() {
6673
+ "use strict";
6674
+ GEMINI_TTS_VOICES = [
6675
+ "Zephyr",
6676
+ // Bright
6677
+ "Puck",
6678
+ // Upbeat
6679
+ "Charon",
6680
+ // Informative
6681
+ "Kore",
6682
+ // Firm
6683
+ "Fenrir",
6684
+ // Excitable
6685
+ "Leda",
6686
+ // Youthful
6687
+ "Orus",
6688
+ // Firm
6689
+ "Aoede",
6690
+ // Breezy
6691
+ "Callirrhoe",
6692
+ // Easy-going
6693
+ "Autonoe",
6694
+ // Bright
6695
+ "Enceladus",
6696
+ // Breathy
6697
+ "Iapetus",
6698
+ // Clear
6699
+ "Umbriel",
6700
+ // Easy-going
6701
+ "Algieba",
6702
+ // Smooth
6703
+ "Despina",
6704
+ // Smooth
6705
+ "Erinome",
6706
+ // Clear
6707
+ "Algenib",
6708
+ // Gravelly
6709
+ "Rasalgethi",
6710
+ // Informative
6711
+ "Laomedeia",
6712
+ // Upbeat
6713
+ "Achernar",
6714
+ // Soft
6715
+ "Alnilam",
6716
+ // Firm
6717
+ "Schedar",
6718
+ // Even
6719
+ "Gacrux",
6720
+ // Mature
6721
+ "Pulcherrima",
6722
+ // Forward
6723
+ "Achird",
6724
+ // Friendly
6725
+ "Zubenelgenubi",
6726
+ // Casual
6727
+ "Vindemiatrix",
6728
+ // Gentle
6729
+ "Sadachbia",
6730
+ // Lively
6731
+ "Sadaltager",
6732
+ // Knowledgeable
6733
+ "Sulafat"
6734
+ // Warm
6735
+ ];
6736
+ GEMINI_TTS_FORMATS = ["pcm", "wav"];
6737
+ geminiSpeechModels = [
6738
+ {
6739
+ provider: "gemini",
6740
+ modelId: "gemini-2.5-flash-preview-tts",
6741
+ displayName: "Gemini 2.5 Flash TTS (Preview)",
6742
+ pricing: {
6743
+ // $0.50 per 1M input tokens = $0.0000005 per token
6744
+ perInputToken: 5e-7,
6745
+ // $10.00 per 1M audio output tokens = $0.00001 per token
6746
+ perAudioOutputToken: 1e-5,
6747
+ // Rough estimate: ~$0.01 per minute of audio
6748
+ perMinute: 0.01
6749
+ },
6750
+ voices: [...GEMINI_TTS_VOICES],
6751
+ formats: GEMINI_TTS_FORMATS,
6752
+ maxInputLength: 8e3,
6753
+ // bytes (text + prompt combined)
6754
+ defaultVoice: "Zephyr",
6755
+ defaultFormat: "wav",
6756
+ features: {
6757
+ multiSpeaker: true,
6758
+ languages: 24,
6759
+ voiceInstructions: true
6760
+ }
6761
+ },
6762
+ {
6763
+ provider: "gemini",
6764
+ modelId: "gemini-2.5-pro-preview-tts",
6765
+ displayName: "Gemini 2.5 Pro TTS (Preview)",
6766
+ pricing: {
6767
+ // $1.00 per 1M input tokens = $0.000001 per token
6768
+ perInputToken: 1e-6,
6769
+ // $20.00 per 1M audio output tokens = $0.00002 per token
6770
+ perAudioOutputToken: 2e-5,
6771
+ // Rough estimate: ~$0.02 per minute of audio
6772
+ perMinute: 0.02
6773
+ },
6774
+ voices: [...GEMINI_TTS_VOICES],
6775
+ formats: GEMINI_TTS_FORMATS,
6776
+ maxInputLength: 8e3,
6777
+ // bytes
6778
+ defaultVoice: "Zephyr",
6779
+ defaultFormat: "wav",
5745
6780
  features: {
5746
- streaming: true,
5747
- functionCalling: true,
5748
- vision: true,
5749
- structuredOutputs: true
5750
- },
5751
- metadata: {
5752
- family: "Gemini 2.0",
5753
- notes: "Smallest and most cost effective 2.0 model for at scale usage."
6781
+ multiSpeaker: true,
6782
+ languages: 24,
6783
+ voiceInstructions: true
5754
6784
  }
5755
6785
  }
5756
6786
  ];
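Worked examples for calculateGeminiSpeechCost() above, using its rough ~750 characters-per-minute conversion when no minute estimate is supplied:

```typescript
calculateGeminiSpeechCost("gemini-2.5-flash-preview-tts", 1500); // (1500 / 750) * 0.01 = 0.02 USD
calculateGeminiSpeechCost("gemini-2.5-pro-preview-tts", 0, 3);   // explicit 3 minutes * 0.02 = 0.06 USD
calculateGeminiSpeechCost("unknown-tts-model", 1000);            // undefined (no spec found)
```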
@@ -5758,7 +6788,32 @@ var init_gemini_models = __esm({
5758
6788
  });
5759
6789
 
5760
6790
  // src/providers/gemini.ts
5761
- import { FunctionCallingConfigMode, GoogleGenAI } from "@google/genai";
6791
+ import { FunctionCallingConfigMode, GoogleGenAI, Modality } from "@google/genai";
6792
+ function wrapPcmInWav(pcmData, sampleRate, bitsPerSample, numChannels) {
6793
+ const byteRate = sampleRate * numChannels * bitsPerSample / 8;
6794
+ const blockAlign = numChannels * bitsPerSample / 8;
6795
+ const dataSize = pcmData.length;
6796
+ const headerSize = 44;
6797
+ const fileSize = headerSize + dataSize - 8;
6798
+ const buffer = new ArrayBuffer(headerSize + dataSize);
6799
+ const view = new DataView(buffer);
6800
+ const uint8 = new Uint8Array(buffer);
6801
+ view.setUint32(0, 1380533830, false);
6802
+ view.setUint32(4, fileSize, true);
6803
+ view.setUint32(8, 1463899717, false);
6804
+ view.setUint32(12, 1718449184, false);
6805
+ view.setUint32(16, 16, true);
6806
+ view.setUint16(20, 1, true);
6807
+ view.setUint16(22, numChannels, true);
6808
+ view.setUint32(24, sampleRate, true);
6809
+ view.setUint32(28, byteRate, true);
6810
+ view.setUint16(32, blockAlign, true);
6811
+ view.setUint16(34, bitsPerSample, true);
6812
+ view.setUint32(36, 1684108385, false);
6813
+ view.setUint32(40, dataSize, true);
6814
+ uint8.set(pcmData, headerSize);
6815
+ return buffer;
6816
+ }
5762
6817
  function createGeminiProviderFromEnv() {
5763
6818
  return createProviderFromEnv("GEMINI_API_KEY", GoogleGenAI, GeminiGenerativeProvider);
5764
6819
  }
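For readers decoding wrapPcmInWav() above: the big-endian uint32 literals are the ASCII RIFF/WAVE chunk tags. A quick verification sketch:

```typescript
// 1380533830 === 0x52494646 ("RIFF"), 1463899717 === 0x57415645 ("WAVE"),
// 1718449184 === 0x666d7420 ("fmt "), 1684108385 === 0x64617461 ("data")
const tag = (s: string) => Buffer.from(s, "ascii").readUInt32BE(0);
console.log(tag("RIFF"), tag("WAVE"), tag("fmt "), tag("data"));
// -> 1380533830 1463899717 1718449184 1684108385
```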
@@ -5766,9 +6821,12 @@ var GEMINI_ROLE_MAP, GeminiGenerativeProvider;
5766
6821
  var init_gemini = __esm({
5767
6822
  "src/providers/gemini.ts"() {
5768
6823
  "use strict";
6824
+ init_messages();
5769
6825
  init_base_provider();
5770
6826
  init_constants2();
6827
+ init_gemini_image_models();
5771
6828
  init_gemini_models();
6829
+ init_gemini_speech_models();
5772
6830
  init_utils();
5773
6831
  GEMINI_ROLE_MAP = {
5774
6832
  system: "user",
@@ -5783,6 +6841,139 @@ var init_gemini = __esm({
5783
6841
  getModelSpecs() {
5784
6842
  return GEMINI_MODELS;
5785
6843
  }
6844
+ // =========================================================================
6845
+ // Image Generation
6846
+ // =========================================================================
6847
+ getImageModelSpecs() {
6848
+ return geminiImageModels;
6849
+ }
6850
+ supportsImageGeneration(modelId) {
6851
+ return isGeminiImageModel(modelId);
6852
+ }
6853
+ async generateImage(options) {
6854
+ const client = this.client;
6855
+ const spec = getGeminiImageModelSpec(options.model);
6856
+ const isImagenModel = options.model.startsWith("imagen");
6857
+ const aspectRatio = options.size ?? spec?.defaultSize ?? "1:1";
6858
+ const n = options.n ?? 1;
6859
+ if (isImagenModel) {
6860
+ const response2 = await client.models.generateImages({
6861
+ model: options.model,
6862
+ prompt: options.prompt,
6863
+ config: {
6864
+ numberOfImages: n,
6865
+ aspectRatio,
6866
+ outputMimeType: options.responseFormat === "b64_json" ? "image/png" : "image/jpeg"
6867
+ }
6868
+ });
6869
+ const images2 = response2.generatedImages ?? [];
6870
+ const cost2 = calculateGeminiImageCost(options.model, aspectRatio, images2.length);
6871
+ return {
6872
+ // Gemini's imageBytes is already base64 encoded, so use it directly
6873
+ images: images2.map((img) => ({
6874
+ b64Json: img.image?.imageBytes ?? void 0
6875
+ })),
6876
+ model: options.model,
6877
+ usage: {
6878
+ imagesGenerated: images2.length,
6879
+ size: aspectRatio,
6880
+ quality: "standard"
6881
+ },
6882
+ cost: cost2
6883
+ };
6884
+ }
6885
+ const response = await client.models.generateContent({
6886
+ model: options.model,
6887
+ contents: [{ role: "user", parts: [{ text: options.prompt }] }],
6888
+ config: {
6889
+ responseModalities: [Modality.IMAGE, Modality.TEXT]
6890
+ }
6891
+ });
6892
+ const images = [];
6893
+ const candidate = response.candidates?.[0];
6894
+ if (candidate?.content?.parts) {
6895
+ for (const part of candidate.content.parts) {
6896
+ if ("inlineData" in part && part.inlineData) {
6897
+ images.push({
6898
+ b64Json: part.inlineData.data
6899
+ });
6900
+ }
6901
+ }
6902
+ }
6903
+ const cost = calculateGeminiImageCost(options.model, aspectRatio, images.length);
6904
+ return {
6905
+ images,
6906
+ model: options.model,
6907
+ usage: {
6908
+ imagesGenerated: images.length,
6909
+ size: aspectRatio,
6910
+ quality: "standard"
6911
+ },
6912
+ cost
6913
+ };
6914
+ }
6915
+ // =========================================================================
6916
+ // Speech Generation
6917
+ // =========================================================================
6918
+ getSpeechModelSpecs() {
6919
+ return geminiSpeechModels;
6920
+ }
6921
+ supportsSpeechGeneration(modelId) {
6922
+ return isGeminiSpeechModel(modelId);
6923
+ }
6924
+ async generateSpeech(options) {
6925
+ const client = this.client;
6926
+ const spec = getGeminiSpeechModelSpec(options.model);
6927
+ const voice = options.voice ?? spec?.defaultVoice ?? "Zephyr";
6928
+ const response = await client.models.generateContent({
6929
+ model: options.model,
6930
+ contents: [
6931
+ {
6932
+ role: "user",
6933
+ parts: [{ text: options.input }]
6934
+ }
6935
+ ],
6936
+ config: {
6937
+ responseModalities: [Modality.AUDIO],
6938
+ speechConfig: {
6939
+ voiceConfig: {
6940
+ prebuiltVoiceConfig: {
6941
+ voiceName: voice
6942
+ }
6943
+ }
6944
+ }
6945
+ }
6946
+ });
6947
+ let pcmData;
6948
+ const candidate = response.candidates?.[0];
6949
+ if (candidate?.content?.parts) {
6950
+ for (const part of candidate.content.parts) {
6951
+ if ("inlineData" in part && part.inlineData?.data) {
6952
+ const base64 = part.inlineData.data;
6953
+ const binary = atob(base64);
6954
+ pcmData = new Uint8Array(binary.length);
6955
+ for (let i = 0; i < binary.length; i++) {
6956
+ pcmData[i] = binary.charCodeAt(i);
6957
+ }
6958
+ break;
6959
+ }
6960
+ }
6961
+ }
6962
+ if (!pcmData) {
6963
+ throw new Error("No audio data in Gemini TTS response");
6964
+ }
6965
+ const audioData = wrapPcmInWav(pcmData, 24e3, 16, 1);
6966
+ const cost = calculateGeminiSpeechCost(options.model, options.input.length);
6967
+ return {
6968
+ audio: audioData,
6969
+ model: options.model,
6970
+ usage: {
6971
+ characterCount: options.input.length
6972
+ },
6973
+ cost,
6974
+ format: spec?.defaultFormat ?? "wav"
6975
+ };
6976
+ }
5786
6977
  buildRequestPayload(options, descriptor, _spec, messages) {
5787
6978
  const contents = this.convertMessagesToContents(messages);
5788
6979
  const generationConfig = this.buildGenerationConfig(options);
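A hedged usage sketch for the Gemini speech path above, assuming createGeminiProviderFromEnv() is callable by consumers and GEMINI_API_KEY is set; writing the WAV to disk is illustrative.

```typescript
import { writeFile } from "node:fs/promises";

const provider = createGeminiProviderFromEnv(); // assumes GEMINI_API_KEY is set
const speech = await provider.generateSpeech({
  model: "gemini-2.5-flash-preview-tts",
  input: "Hello from llmist.",
  voice: "Kore",
});
// PCM from the API is wrapped into a 24 kHz, 16-bit mono WAV by wrapPcmInWav()
await writeFile("hello.wav", Buffer.from(speech.audio));
console.log(speech.format, speech.usage.characterCount, speech.cost);
```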
@@ -5800,7 +6991,7 @@ var init_gemini = __esm({
5800
6991
  };
5801
6992
  return {
5802
6993
  model: descriptor.name,
5803
- contents: this.convertContentsForNewSDK(contents),
6994
+ contents,
5804
6995
  config
5805
6996
  };
5806
6997
  }
@@ -5835,18 +7026,25 @@ var init_gemini = __esm({
5835
7026
  if (message.role === "system") {
5836
7027
  expandedMessages.push({
5837
7028
  role: "user",
5838
- content: message.content
7029
+ content: extractText(message.content)
5839
7030
  });
5840
7031
  expandedMessages.push({
5841
7032
  role: "assistant",
5842
7033
  content: "Understood."
5843
7034
  });
5844
7035
  } else {
5845
- expandedMessages.push(message);
7036
+ expandedMessages.push({
7037
+ role: message.role,
7038
+ content: message.content
7039
+ });
5846
7040
  }
5847
7041
  }
5848
7042
  return this.mergeConsecutiveMessages(expandedMessages);
5849
7043
  }
7044
+ /**
7045
+ * Merge consecutive messages with the same role (required by Gemini).
7046
+ * Handles multimodal content by converting to Gemini's part format.
7047
+ */
5850
7048
  mergeConsecutiveMessages(messages) {
5851
7049
  if (messages.length === 0) {
5852
7050
  return [];
@@ -5855,15 +7053,16 @@ var init_gemini = __esm({
5855
7053
  let currentGroup = null;
5856
7054
  for (const message of messages) {
5857
7055
  const geminiRole = GEMINI_ROLE_MAP[message.role];
7056
+ const geminiParts = this.convertToGeminiParts(message.content);
5858
7057
  if (currentGroup && currentGroup.role === geminiRole) {
5859
- currentGroup.parts.push({ text: message.content });
7058
+ currentGroup.parts.push(...geminiParts);
5860
7059
  } else {
5861
7060
  if (currentGroup) {
5862
7061
  result.push(currentGroup);
5863
7062
  }
5864
7063
  currentGroup = {
5865
7064
  role: geminiRole,
5866
- parts: [{ text: message.content }]
7065
+ parts: geminiParts
5867
7066
  };
5868
7067
  }
5869
7068
  }
@@ -5872,11 +7071,39 @@ var init_gemini = __esm({
5872
7071
  }
5873
7072
  return result;
5874
7073
  }
5875
- convertContentsForNewSDK(contents) {
5876
- return contents.map((content) => ({
5877
- role: content.role,
5878
- parts: content.parts.map((part) => ({ text: part.text }))
5879
- }));
7074
+ /**
7075
+ * Convert llmist content to Gemini's part format.
7076
+ * Handles text, images, and audio (Gemini supports all three).
7077
+ */
7078
+ convertToGeminiParts(content) {
7079
+ const parts = normalizeContent(content);
7080
+ return parts.map((part) => {
7081
+ if (part.type === "text") {
7082
+ return { text: part.text };
7083
+ }
7084
+ if (part.type === "image") {
7085
+ if (part.source.type === "url") {
7086
+ throw new Error(
7087
+ "Gemini does not support image URLs directly. Please provide base64-encoded image data."
7088
+ );
7089
+ }
7090
+ return {
7091
+ inlineData: {
7092
+ mimeType: part.source.mediaType,
7093
+ data: part.source.data
7094
+ }
7095
+ };
7096
+ }
7097
+ if (part.type === "audio") {
7098
+ return {
7099
+ inlineData: {
7100
+ mimeType: part.source.mediaType,
7101
+ data: part.source.data
7102
+ }
7103
+ };
7104
+ }
7105
+ throw new Error(`Unsupported content type: ${part.type}`);
7106
+ });
5880
7107
  }
5881
7108
  buildGenerationConfig(options) {
5882
7109
  const config = {};
@@ -5897,9 +7124,9 @@ var init_gemini = __esm({
5897
7124
  async *wrapStream(iterable) {
5898
7125
  const stream2 = iterable;
5899
7126
  for await (const chunk of stream2) {
5900
- const text = this.extractText(chunk);
5901
- if (text) {
5902
- yield { text, rawEvent: chunk };
7127
+ const text3 = this.extractText(chunk);
7128
+ if (text3) {
7129
+ yield { text: text3, rawEvent: chunk };
5903
7130
  }
5904
7131
  const finishReason = this.extractFinishReason(chunk);
5905
7132
  const usage = this.extractUsage(chunk);
@@ -5960,7 +7187,7 @@ var init_gemini = __esm({
5960
7187
  try {
5961
7188
  const response = await client.models.countTokens({
5962
7189
  model: descriptor.name,
5963
- contents: this.convertContentsForNewSDK(contents)
7190
+ contents
5964
7191
  // Note: systemInstruction not used - it's not supported by countTokens()
5965
7192
  // and would cause a 2100% token counting error
5966
7193
  });
@@ -5970,14 +7197,140 @@ var init_gemini = __esm({
5970
7197
  `Token counting failed for ${descriptor.name}, using fallback estimation:`,
5971
7198
  error
5972
7199
  );
5973
- const totalChars = messages.reduce((sum, msg) => sum + (msg.content?.length ?? 0), 0);
5974
- return Math.ceil(totalChars / FALLBACK_CHARS_PER_TOKEN);
7200
+ let totalChars = 0;
7201
+ let mediaCount = 0;
7202
+ for (const msg of messages) {
7203
+ const parts = normalizeContent(msg.content);
7204
+ for (const part of parts) {
7205
+ if (part.type === "text") {
7206
+ totalChars += part.text.length;
7207
+ } else if (part.type === "image" || part.type === "audio") {
7208
+ mediaCount++;
7209
+ }
7210
+ }
7211
+ }
7212
+ return Math.ceil(totalChars / FALLBACK_CHARS_PER_TOKEN) + mediaCount * 258;
5975
7213
  }
5976
7214
  }
5977
7215
  };
5978
7216
  }
5979
7217
  });
5980
7218
 
7219
+ // src/providers/openai-image-models.ts
7220
+ function getOpenAIImageModelSpec(modelId) {
7221
+ return openaiImageModels.find((m) => m.modelId === modelId);
7222
+ }
7223
+ function isOpenAIImageModel(modelId) {
7224
+ return openaiImageModels.some((m) => m.modelId === modelId);
7225
+ }
7226
+ function calculateOpenAIImageCost(modelId, size, quality = "standard", n = 1) {
7227
+ const spec = getOpenAIImageModelSpec(modelId);
7228
+ if (!spec) return void 0;
7229
+ const sizePrice = spec.pricing.bySize?.[size];
7230
+ if (sizePrice === void 0) return void 0;
7231
+ let pricePerImage;
7232
+ if (typeof sizePrice === "number") {
7233
+ pricePerImage = sizePrice;
7234
+ } else {
7235
+ pricePerImage = sizePrice[quality];
7236
+ if (pricePerImage === void 0) return void 0;
7237
+ }
7238
+ return pricePerImage * n;
7239
+ }
7240
+ var GPT_IMAGE_SIZES, GPT_IMAGE_QUALITIES, DALLE3_SIZES, DALLE3_QUALITIES, DALLE2_SIZES, openaiImageModels;
7241
+ var init_openai_image_models = __esm({
7242
+ "src/providers/openai-image-models.ts"() {
7243
+ "use strict";
7244
+ GPT_IMAGE_SIZES = ["1024x1024", "1024x1536", "1536x1024"];
7245
+ GPT_IMAGE_QUALITIES = ["low", "medium", "high"];
7246
+ DALLE3_SIZES = ["1024x1024", "1024x1792", "1792x1024"];
7247
+ DALLE3_QUALITIES = ["standard", "hd"];
7248
+ DALLE2_SIZES = ["256x256", "512x512", "1024x1024"];
7249
+ openaiImageModels = [
7250
+ // GPT Image 1 Family (flagship)
7251
+ {
7252
+ provider: "openai",
7253
+ modelId: "gpt-image-1",
7254
+ displayName: "GPT Image 1",
7255
+ pricing: {
7256
+ bySize: {
7257
+ "1024x1024": { low: 0.011, medium: 0.04, high: 0.17 },
7258
+ "1024x1536": { low: 0.016, medium: 0.06, high: 0.25 },
7259
+ "1536x1024": { low: 0.016, medium: 0.06, high: 0.25 }
7260
+ }
7261
+ },
7262
+ supportedSizes: [...GPT_IMAGE_SIZES],
7263
+ supportedQualities: [...GPT_IMAGE_QUALITIES],
7264
+ maxImages: 1,
7265
+ defaultSize: "1024x1024",
7266
+ defaultQuality: "medium",
7267
+ features: {
7268
+ textRendering: true,
7269
+ transparency: true
7270
+ }
7271
+ },
7272
+ {
7273
+ provider: "openai",
7274
+ modelId: "gpt-image-1-mini",
7275
+ displayName: "GPT Image 1 Mini",
7276
+ pricing: {
7277
+ bySize: {
7278
+ "1024x1024": { low: 5e-3, medium: 0.02, high: 0.052 },
7279
+ "1024x1536": { low: 75e-4, medium: 0.03, high: 0.078 },
7280
+ "1536x1024": { low: 75e-4, medium: 0.03, high: 0.078 }
7281
+ }
7282
+ },
7283
+ supportedSizes: [...GPT_IMAGE_SIZES],
7284
+ supportedQualities: [...GPT_IMAGE_QUALITIES],
7285
+ maxImages: 1,
7286
+ defaultSize: "1024x1024",
7287
+ defaultQuality: "medium",
7288
+ features: {
7289
+ textRendering: true,
7290
+ transparency: true
7291
+ }
7292
+ },
7293
+ // DALL-E Family
7294
+ {
7295
+ provider: "openai",
7296
+ modelId: "dall-e-3",
7297
+ displayName: "DALL-E 3",
7298
+ pricing: {
7299
+ bySize: {
7300
+ "1024x1024": { standard: 0.04, hd: 0.08 },
7301
+ "1024x1792": { standard: 0.08, hd: 0.12 },
7302
+ "1792x1024": { standard: 0.08, hd: 0.12 }
7303
+ }
7304
+ },
7305
+ supportedSizes: [...DALLE3_SIZES],
7306
+ supportedQualities: [...DALLE3_QUALITIES],
7307
+ maxImages: 1,
7308
+ // DALL-E 3 only supports n=1
7309
+ defaultSize: "1024x1024",
7310
+ defaultQuality: "standard",
7311
+ features: {
7312
+ textRendering: true
7313
+ }
7314
+ },
7315
+ {
7316
+ provider: "openai",
7317
+ modelId: "dall-e-2",
7318
+ displayName: "DALL-E 2 (Legacy)",
7319
+ pricing: {
7320
+ bySize: {
7321
+ "256x256": 0.016,
7322
+ "512x512": 0.018,
7323
+ "1024x1024": 0.02
7324
+ }
7325
+ },
7326
+ supportedSizes: [...DALLE2_SIZES],
7327
+ maxImages: 10,
7328
+ defaultSize: "1024x1024"
7329
+ }
7330
+ ];
7331
+ }
7332
+ });
7333
+
5981
7334
  // src/providers/openai-models.ts
5982
7335
  var OPENAI_MODELS;
5983
7336
  var init_openai_models = __esm({
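Worked examples against the openai-image-models pricing above (quality defaults to "standard" and n to 1 when omitted):

```typescript
calculateOpenAIImageCost("dall-e-3", "1024x1024", "hd");        // 0.08 USD
calculateOpenAIImageCost("gpt-image-1", "1024x1536", "high");   // 0.25 USD
calculateOpenAIImageCost("dall-e-2", "512x512", "standard", 3); // 0.018 * 3 = 0.054 USD
calculateOpenAIImageCost("dall-e-3", "4096x4096");              // undefined (size not priced)
```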
@@ -6342,6 +7695,144 @@ var init_openai_models = __esm({
6342
7695
  }
6343
7696
  });
6344
7697
 
7698
+ // src/providers/openai-speech-models.ts
7699
+ function getOpenAISpeechModelSpec(modelId) {
7700
+ return openaiSpeechModels.find((m) => m.modelId === modelId);
7701
+ }
7702
+ function isOpenAISpeechModel(modelId) {
7703
+ return openaiSpeechModels.some((m) => m.modelId === modelId);
7704
+ }
7705
+ function calculateOpenAISpeechCost(modelId, characterCount, estimatedMinutes) {
7706
+ const spec = getOpenAISpeechModelSpec(modelId);
7707
+ if (!spec) return void 0;
7708
+ if (spec.pricing.perCharacter !== void 0) {
7709
+ return characterCount * spec.pricing.perCharacter;
7710
+ }
7711
+ if (spec.pricing.perMinute !== void 0 && estimatedMinutes !== void 0) {
7712
+ return estimatedMinutes * spec.pricing.perMinute;
7713
+ }
7714
+ if (spec.pricing.perMinute !== void 0) {
7715
+ const approxMinutes = characterCount / 750;
7716
+ return approxMinutes * spec.pricing.perMinute;
7717
+ }
7718
+ return void 0;
7719
+ }
7720
+ var OPENAI_TTS_VOICES, OPENAI_TTS_EXTENDED_VOICES, OPENAI_TTS_FORMATS, openaiSpeechModels;
7721
+ var init_openai_speech_models = __esm({
7722
+ "src/providers/openai-speech-models.ts"() {
7723
+ "use strict";
7724
+ OPENAI_TTS_VOICES = [
7725
+ "alloy",
7726
+ "echo",
7727
+ "fable",
7728
+ "onyx",
7729
+ "nova",
7730
+ "shimmer"
7731
+ ];
7732
+ OPENAI_TTS_EXTENDED_VOICES = [
7733
+ ...OPENAI_TTS_VOICES,
7734
+ "ash",
7735
+ "ballad",
7736
+ "coral",
7737
+ "sage",
7738
+ "verse"
7739
+ ];
7740
+ OPENAI_TTS_FORMATS = ["mp3", "opus", "aac", "flac", "wav", "pcm"];
7741
+ openaiSpeechModels = [
7742
+ // Standard TTS models (character-based pricing)
7743
+ {
7744
+ provider: "openai",
7745
+ modelId: "tts-1",
7746
+ displayName: "TTS-1",
7747
+ pricing: {
7748
+ // $15 per 1M characters = $0.000015 per character
7749
+ perCharacter: 15e-6
7750
+ },
7751
+ voices: [...OPENAI_TTS_VOICES],
7752
+ formats: OPENAI_TTS_FORMATS,
7753
+ maxInputLength: 4096,
7754
+ defaultVoice: "alloy",
7755
+ defaultFormat: "mp3",
7756
+ features: {
7757
+ voiceInstructions: false
7758
+ }
7759
+ },
7760
+ {
7761
+ provider: "openai",
7762
+ modelId: "tts-1-1106",
7763
+ displayName: "TTS-1 (Nov 2023)",
7764
+ pricing: {
7765
+ perCharacter: 15e-6
7766
+ },
7767
+ voices: [...OPENAI_TTS_VOICES],
7768
+ formats: OPENAI_TTS_FORMATS,
7769
+ maxInputLength: 4096,
7770
+ defaultVoice: "alloy",
7771
+ defaultFormat: "mp3",
7772
+ features: {
7773
+ voiceInstructions: false
7774
+ }
7775
+ },
7776
+ {
7777
+ provider: "openai",
7778
+ modelId: "tts-1-hd",
7779
+ displayName: "TTS-1 HD",
7780
+ pricing: {
7781
+ // $30 per 1M characters = $0.00003 per character
7782
+ perCharacter: 3e-5
7783
+ },
7784
+ voices: [...OPENAI_TTS_VOICES],
7785
+ formats: OPENAI_TTS_FORMATS,
7786
+ maxInputLength: 4096,
7787
+ defaultVoice: "alloy",
7788
+ defaultFormat: "mp3",
7789
+ features: {
7790
+ voiceInstructions: false
7791
+ }
7792
+ },
7793
+ {
7794
+ provider: "openai",
7795
+ modelId: "tts-1-hd-1106",
7796
+ displayName: "TTS-1 HD (Nov 2023)",
7797
+ pricing: {
7798
+ perCharacter: 3e-5
7799
+ },
7800
+ voices: [...OPENAI_TTS_VOICES],
7801
+ formats: OPENAI_TTS_FORMATS,
7802
+ maxInputLength: 4096,
7803
+ defaultVoice: "alloy",
7804
+ defaultFormat: "mp3",
7805
+ features: {
7806
+ voiceInstructions: false
7807
+ }
7808
+ },
7809
+ // Token-based TTS model with voice instructions support
7810
+ {
7811
+ provider: "openai",
7812
+ modelId: "gpt-4o-mini-tts",
7813
+ displayName: "GPT-4o Mini TTS",
7814
+ pricing: {
7815
+ // $0.60 per 1M input tokens = $0.0000006 per token
7816
+ perInputToken: 6e-7,
7817
+ // $12 per 1M audio output tokens = $0.000012 per token
7818
+ perAudioOutputToken: 12e-6,
7819
+ // ~$0.015 per minute of audio
7820
+ perMinute: 0.015
7821
+ },
7822
+ voices: [...OPENAI_TTS_EXTENDED_VOICES],
7823
+ formats: OPENAI_TTS_FORMATS,
7824
+ maxInputLength: 2e3,
7825
+ // tokens, not characters
7826
+ defaultVoice: "alloy",
7827
+ defaultFormat: "mp3",
7828
+ features: {
7829
+ voiceInstructions: true
7830
+ }
7831
+ }
7832
+ ];
7833
+ }
7834
+ });
7835
+
6345
7836
  // src/providers/openai.ts
6346
7837
  import OpenAI from "openai";
6347
7838
  import { encoding_for_model } from "tiktoken";
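Worked examples for calculateOpenAISpeechCost() above; character-priced models ignore the minutes argument, while gpt-4o-mini-tts falls back to the ~750 characters-per-minute estimate:

```typescript
calculateOpenAISpeechCost("tts-1", 1000);           // 1000 * 0.000015 = 0.015 USD
calculateOpenAISpeechCost("tts-1-hd", 1000);        // 1000 * 0.00003  = 0.03 USD
calculateOpenAISpeechCost("gpt-4o-mini-tts", 1500); // (1500 / 750) * 0.015 = 0.03 USD
calculateOpenAISpeechCost("gpt-4o-mini-tts", 0, 5); // explicit 5 minutes * 0.015 = 0.075 USD
```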
@@ -6361,9 +7852,12 @@ var ROLE_MAP, OpenAIChatProvider;
6361
7852
  var init_openai = __esm({
6362
7853
  "src/providers/openai.ts"() {
6363
7854
  "use strict";
7855
+ init_messages();
6364
7856
  init_base_provider();
6365
7857
  init_constants2();
7858
+ init_openai_image_models();
6366
7859
  init_openai_models();
7860
+ init_openai_speech_models();
6367
7861
  init_utils();
6368
7862
  ROLE_MAP = {
6369
7863
  system: "system",
@@ -6378,6 +7872,87 @@ var init_openai = __esm({
6378
7872
  getModelSpecs() {
6379
7873
  return OPENAI_MODELS;
6380
7874
  }
7875
+ // =========================================================================
7876
+ // Image Generation
7877
+ // =========================================================================
7878
+ getImageModelSpecs() {
7879
+ return openaiImageModels;
7880
+ }
7881
+ supportsImageGeneration(modelId) {
7882
+ return isOpenAIImageModel(modelId);
7883
+ }
7884
+ async generateImage(options) {
7885
+ const client = this.client;
7886
+ const spec = getOpenAIImageModelSpec(options.model);
7887
+ const size = options.size ?? spec?.defaultSize ?? "1024x1024";
7888
+ const quality = options.quality ?? spec?.defaultQuality ?? "standard";
7889
+ const n = options.n ?? 1;
7890
+ const isDallE2 = options.model === "dall-e-2";
7891
+ const isGptImage = options.model.startsWith("gpt-image");
7892
+ const requestParams = {
7893
+ model: options.model,
7894
+ prompt: options.prompt,
7895
+ size,
7896
+ n
7897
+ };
7898
+ if (!isDallE2 && !isGptImage) {
7899
+ requestParams.quality = quality;
7900
+ }
7901
+ if (isGptImage) {
7902
+ } else if (!isDallE2) {
7903
+ requestParams.response_format = options.responseFormat ?? "url";
7904
+ }
7905
+ const response = await client.images.generate(requestParams);
7906
+ const cost = calculateOpenAIImageCost(options.model, size, quality, n);
7907
+ const images = response.data ?? [];
7908
+ return {
7909
+ images: images.map((img) => ({
7910
+ url: img.url,
7911
+ b64Json: img.b64_json,
7912
+ revisedPrompt: img.revised_prompt
7913
+ })),
7914
+ model: options.model,
7915
+ usage: {
7916
+ imagesGenerated: images.length,
7917
+ size,
7918
+ quality
7919
+ },
7920
+ cost
7921
+ };
7922
+ }
7923
+ // =========================================================================
7924
+ // Speech Generation
7925
+ // =========================================================================
7926
+ getSpeechModelSpecs() {
7927
+ return openaiSpeechModels;
7928
+ }
7929
+ supportsSpeechGeneration(modelId) {
7930
+ return isOpenAISpeechModel(modelId);
7931
+ }
7932
+ async generateSpeech(options) {
7933
+ const client = this.client;
7934
+ const spec = getOpenAISpeechModelSpec(options.model);
7935
+ const format = options.responseFormat ?? spec?.defaultFormat ?? "mp3";
7936
+ const voice = options.voice ?? spec?.defaultVoice ?? "alloy";
7937
+ const response = await client.audio.speech.create({
7938
+ model: options.model,
7939
+ input: options.input,
7940
+ voice,
7941
+ response_format: format,
7942
+ speed: options.speed ?? 1
7943
+ });
7944
+ const audioBuffer = await response.arrayBuffer();
7945
+ const cost = calculateOpenAISpeechCost(options.model, options.input.length);
7946
+ return {
7947
+ audio: audioBuffer,
7948
+ model: options.model,
7949
+ usage: {
7950
+ characterCount: options.input.length
7951
+ },
7952
+ cost,
7953
+ format
7954
+ };
7955
+ }
6381
7956
  buildRequestPayload(options, descriptor, spec, messages) {
6382
7957
  const { maxTokens, temperature, topP, stopSequences, extra } = options;
6383
7958
  const supportsTemperature = spec?.metadata?.supportsTemperature !== false;
@@ -6385,11 +7960,7 @@ var init_openai = __esm({
6385
7960
  const sanitizedExtra = sanitizeExtra(extra, shouldIncludeTemperature);
6386
7961
  return {
6387
7962
  model: descriptor.name,
6388
- messages: messages.map((message) => ({
6389
- role: ROLE_MAP[message.role],
6390
- content: message.content,
6391
- name: message.name
6392
- })),
7963
+ messages: messages.map((message) => this.convertToOpenAIMessage(message)),
6393
7964
  // Only set max_completion_tokens if explicitly provided
6394
7965
  // Otherwise let the API use "as much as fits" in the context window
6395
7966
  ...maxTokens !== void 0 ? { max_completion_tokens: maxTokens } : {},
@@ -6401,6 +7972,77 @@ var init_openai = __esm({
6401
7972
  ...shouldIncludeTemperature ? { temperature } : {}
6402
7973
  };
6403
7974
  }
7975
+ /**
7976
+ * Convert an LLMMessage to OpenAI's ChatCompletionMessageParam.
7977
+ * Handles role-specific content type requirements:
7978
+ * - system/assistant: string content only
7979
+ * - user: string or multimodal array content
7980
+ */
7981
+ convertToOpenAIMessage(message) {
7982
+ const role = ROLE_MAP[message.role];
7983
+ if (role === "user") {
7984
+ const content = this.convertToOpenAIContent(message.content);
7985
+ return {
7986
+ role: "user",
7987
+ content,
7988
+ ...message.name ? { name: message.name } : {}
7989
+ };
7990
+ }
7991
+ const textContent = typeof message.content === "string" ? message.content : extractText(message.content);
7992
+ if (role === "system") {
7993
+ return {
7994
+ role: "system",
7995
+ content: textContent,
7996
+ ...message.name ? { name: message.name } : {}
7997
+ };
7998
+ }
7999
+ return {
8000
+ role: "assistant",
8001
+ content: textContent,
8002
+ ...message.name ? { name: message.name } : {}
8003
+ };
8004
+ }
8005
+ /**
8006
+ * Convert llmist content to OpenAI's content format.
8007
+ * Optimizes by returning string for text-only content, array for multimodal.
8008
+ */
8009
+ convertToOpenAIContent(content) {
8010
+ if (typeof content === "string") {
8011
+ return content;
8012
+ }
8013
+ return content.map((part) => {
8014
+ if (part.type === "text") {
8015
+ return { type: "text", text: part.text };
8016
+ }
8017
+ if (part.type === "image") {
8018
+ return this.convertImagePart(part);
8019
+ }
8020
+ if (part.type === "audio") {
8021
+ throw new Error(
8022
+ "OpenAI chat completions do not support audio input. Use Whisper for transcription or Gemini for audio understanding."
8023
+ );
8024
+ }
8025
+ throw new Error(`Unsupported content type: ${part.type}`);
8026
+ });
8027
+ }
8028
+ /**
8029
+ * Convert an image content part to OpenAI's image_url format.
8030
+ * Supports both URLs and base64 data URLs.
8031
+ */
8032
+ convertImagePart(part) {
8033
+ if (part.source.type === "url") {
8034
+ return {
8035
+ type: "image_url",
8036
+ image_url: { url: part.source.url }
8037
+ };
8038
+ }
8039
+ return {
8040
+ type: "image_url",
8041
+ image_url: {
8042
+ url: `data:${part.source.mediaType};base64,${part.source.data}`
8043
+ }
8044
+ };
8045
+ }
6404
8046
  async executeStreamRequest(payload, signal) {
6405
8047
  const client = this.client;
6406
8048
  const stream2 = await client.chat.completions.create(payload, signal ? { signal } : void 0);
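For reference, the mapping performed by convertToOpenAIContent() and convertImagePart() above, with illustrative values: URL sources pass through, base64 sources become data URLs.

```typescript
// llmist parts ...
const parts = [
  { type: "text", text: "Compare these" },
  { type: "image", source: { type: "url", url: "https://example.com/a.png" } },
  { type: "image", source: { type: "base64", mediaType: "image/jpeg", data: "/9j/4AAQ..." } },
];
// ... become OpenAI chat content:
const openaiContent = [
  { type: "text", text: "Compare these" },
  { type: "image_url", image_url: { url: "https://example.com/a.png" } },
  { type: "image_url", image_url: { url: "data:image/jpeg;base64,/9j/4AAQ..." } },
];
```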
@@ -6409,9 +8051,9 @@ var init_openai = __esm({
6409
8051
  async *wrapStream(iterable) {
6410
8052
  const stream2 = iterable;
6411
8053
  for await (const chunk of stream2) {
6412
- const text = chunk.choices.map((choice) => choice.delta?.content ?? "").join("");
6413
- if (text) {
6414
- yield { text, rawEvent: chunk };
8054
+ const text3 = chunk.choices.map((choice) => choice.delta?.content ?? "").join("");
8055
+ if (text3) {
8056
+ yield { text: text3, rawEvent: chunk };
6415
8057
  }
6416
8058
  const finishReason = chunk.choices.find((choice) => choice.finish_reason)?.finish_reason;
6417
8059
  const usage = chunk.usage ? {
@@ -6459,17 +8101,26 @@ var init_openai = __esm({
6459
8101
  }
6460
8102
  try {
6461
8103
  let tokenCount = 0;
8104
+ let imageCount = 0;
6462
8105
  for (const message of messages) {
6463
8106
  tokenCount += OPENAI_MESSAGE_OVERHEAD_TOKENS;
6464
8107
  const roleText = ROLE_MAP[message.role];
6465
8108
  tokenCount += encoding.encode(roleText).length;
6466
- tokenCount += encoding.encode(message.content ?? "").length;
8109
+ const textContent = extractText(message.content);
8110
+ tokenCount += encoding.encode(textContent).length;
8111
+ const parts = normalizeContent(message.content);
8112
+ for (const part of parts) {
8113
+ if (part.type === "image") {
8114
+ imageCount++;
8115
+ }
8116
+ }
6467
8117
  if (message.name) {
6468
8118
  tokenCount += encoding.encode(message.name).length;
6469
8119
  tokenCount += OPENAI_NAME_FIELD_OVERHEAD_TOKENS;
6470
8120
  }
6471
8121
  }
6472
8122
  tokenCount += OPENAI_REPLY_PRIMING_TOKENS;
8123
+ tokenCount += imageCount * 765;
6473
8124
  return tokenCount;
6474
8125
  } finally {
6475
8126
  encoding.free();
@@ -6479,8 +8130,19 @@ var init_openai = __esm({
6479
8130
  `Token counting failed for ${descriptor.name}, using fallback estimation:`,
6480
8131
  error
6481
8132
  );
6482
- const totalChars = messages.reduce((sum, msg) => sum + (msg.content?.length ?? 0), 0);
6483
- return Math.ceil(totalChars / FALLBACK_CHARS_PER_TOKEN);
8133
+ let totalChars = 0;
8134
+ let imageCount = 0;
8135
+ for (const msg of messages) {
8136
+ const parts = normalizeContent(msg.content);
8137
+ for (const part of parts) {
8138
+ if (part.type === "text") {
8139
+ totalChars += part.text.length;
8140
+ } else if (part.type === "image") {
8141
+ imageCount++;
8142
+ }
8143
+ }
8144
+ }
8145
+ return Math.ceil(totalChars / FALLBACK_CHARS_PER_TOKEN) + imageCount * 765;
6484
8146
  }
6485
8147
  }
6486
8148
  };
@@ -6718,30 +8380,109 @@ var init_model_registry = __esm({
6718
8380
  }
6719
8381
  });
6720
8382
 
6721
- // src/core/options.ts
6722
- var ModelIdentifierParser;
6723
- var init_options = __esm({
6724
- "src/core/options.ts"() {
8383
+ // src/core/namespaces/image.ts
8384
+ var ImageNamespace;
8385
+ var init_image = __esm({
8386
+ "src/core/namespaces/image.ts"() {
6725
8387
  "use strict";
6726
- ModelIdentifierParser = class {
6727
- constructor(defaultProvider = "openai") {
8388
+ ImageNamespace = class {
8389
+ constructor(adapters, defaultProvider) {
8390
+ this.adapters = adapters;
6728
8391
  this.defaultProvider = defaultProvider;
6729
8392
  }
6730
- parse(identifier) {
6731
- const trimmed = identifier.trim();
6732
- if (!trimmed) {
6733
- throw new Error("Model identifier cannot be empty");
8393
+ /**
8394
+ * Generate images from a text prompt.
8395
+ *
8396
+ * @param options - Image generation options
8397
+ * @returns Promise resolving to the generation result with images and cost
8398
+ * @throws Error if the provider doesn't support image generation
8399
+ */
8400
+ async generate(options) {
8401
+ const modelId = options.model;
8402
+ const adapter = this.findImageAdapter(modelId);
8403
+ if (!adapter || !adapter.generateImage) {
8404
+ throw new Error(
8405
+ `No provider supports image generation for model "${modelId}". Available image models: ${this.listModels().map((m) => m.modelId).join(", ")}`
8406
+ );
6734
8407
  }
6735
- const [maybeProvider, ...rest] = trimmed.split(":");
6736
- if (rest.length === 0) {
6737
- return { provider: this.defaultProvider, name: maybeProvider };
8408
+ return adapter.generateImage(options);
8409
+ }
8410
+ /**
8411
+ * List all available image generation models.
8412
+ */
8413
+ listModels() {
8414
+ const models = [];
8415
+ for (const adapter of this.adapters) {
8416
+ if (adapter.getImageModelSpecs) {
8417
+ models.push(...adapter.getImageModelSpecs());
8418
+ }
8419
+ }
8420
+ return models;
8421
+ }
8422
+ /**
8423
+ * Check if a model is supported for image generation.
8424
+ */
8425
+ supportsModel(modelId) {
8426
+ return this.findImageAdapter(modelId) !== void 0;
8427
+ }
8428
+ findImageAdapter(modelId) {
8429
+ return this.adapters.find(
8430
+ (adapter) => adapter.supportsImageGeneration?.(modelId) ?? false
8431
+ );
8432
+ }
8433
+ };
8434
+ }
8435
+ });
8436
+
8437
+ // src/core/namespaces/speech.ts
8438
+ var SpeechNamespace;
8439
+ var init_speech = __esm({
8440
+ "src/core/namespaces/speech.ts"() {
8441
+ "use strict";
8442
+ SpeechNamespace = class {
8443
+ constructor(adapters, defaultProvider) {
8444
+ this.adapters = adapters;
8445
+ this.defaultProvider = defaultProvider;
8446
+ }
8447
+ /**
8448
+ * Generate speech audio from text.
8449
+ *
8450
+ * @param options - Speech generation options
8451
+ * @returns Promise resolving to the generation result with audio and cost
8452
+ * @throws Error if the provider doesn't support speech generation
8453
+ */
8454
+ async generate(options) {
8455
+ const modelId = options.model;
8456
+ const adapter = this.findSpeechAdapter(modelId);
8457
+ if (!adapter || !adapter.generateSpeech) {
8458
+ throw new Error(
8459
+ `No provider supports speech generation for model "${modelId}". Available speech models: ${this.listModels().map((m) => m.modelId).join(", ")}`
8460
+ );
6738
8461
  }
6739
- const provider = maybeProvider;
6740
- const name = rest.join(":");
6741
- if (!name) {
6742
- throw new Error("Model name cannot be empty");
8462
+ return adapter.generateSpeech(options);
8463
+ }
8464
+ /**
8465
+ * List all available speech generation models.
8466
+ */
8467
+ listModels() {
8468
+ const models = [];
8469
+ for (const adapter of this.adapters) {
8470
+ if (adapter.getSpeechModelSpecs) {
8471
+ models.push(...adapter.getSpeechModelSpecs());
8472
+ }
6743
8473
  }
6744
- return { provider, name };
8474
+ return models;
8475
+ }
8476
+ /**
8477
+ * Check if a model is supported for speech generation.
8478
+ */
8479
+ supportsModel(modelId) {
8480
+ return this.findSpeechAdapter(modelId) !== void 0;
8481
+ }
8482
+ findSpeechAdapter(modelId) {
8483
+ return this.adapters.find(
8484
+ (adapter) => adapter.supportsSpeechGeneration?.(modelId) ?? false
8485
+ );
6745
8486
  }
6746
8487
  };
6747
8488
  }
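A hedged sketch of the new namespaces above: the ImageNamespace and SpeechNamespace classes appear in this diff, but the property names they are exposed under on the client (`image` and `speech` here, with `llmist` as an already-constructed client) are assumptions for illustration.

```typescript
// Assumed client wiring for illustration only.
const img = await llmist.image.generate({
  model: "gpt-image-1",
  prompt: "A lighthouse at dawn, watercolor",
  size: "1024x1024",
  quality: "medium",
});
console.log(img.usage.imagesGenerated, img.cost);

const speech = await llmist.speech.generate({
  model: "tts-1",
  input: "Welcome to llmist.",
  voice: "nova",
});
console.log(speech.format, speech.cost);
```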
@@ -6790,6 +8531,201 @@ var init_quick_methods = __esm({
6790
8531
  }
6791
8532
  });
6792
8533
 
8534
+ // src/core/namespaces/text.ts
8535
+ var TextNamespace;
8536
+ var init_text = __esm({
8537
+ "src/core/namespaces/text.ts"() {
8538
+ "use strict";
8539
+ init_quick_methods();
8540
+ TextNamespace = class {
8541
+ constructor(client) {
8542
+ this.client = client;
8543
+ }
8544
+ /**
8545
+ * Generate a complete text response.
8546
+ *
8547
+ * @param prompt - User prompt
8548
+ * @param options - Optional configuration
8549
+ * @returns Complete text response
8550
+ */
8551
+ async complete(prompt, options) {
8552
+ return complete(this.client, prompt, options);
8553
+ }
8554
+ /**
8555
+ * Stream text chunks.
8556
+ *
8557
+ * @param prompt - User prompt
8558
+ * @param options - Optional configuration
8559
+ * @returns Async generator yielding text chunks
8560
+ */
8561
+ stream(prompt, options) {
8562
+ return stream(this.client, prompt, options);
8563
+ }
8564
+ };
8565
+ }
8566
+ });
8567
+
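TextNamespace is a thin wrapper over the existing quick methods, so the calling convention is prompt-first rather than options-first. A short sketch; `client` is assumed, and the chunk shape yielded by stream() is an assumption because the quick helper itself is not part of this hunk.

```typescript
// Sketch only: `client` is an assumed LLMist instance.
const answer = await client.text.complete("Summarize the release notes in one sentence");

// Assumption: the quick stream() helper yields plain text chunks.
for await (const piece of client.text.stream("Write a haiku about build tools")) {
  process.stdout.write(piece);
}
```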
8568
+ // src/core/namespaces/vision.ts
8569
+ var VisionNamespace;
8570
+ var init_vision = __esm({
8571
+ "src/core/namespaces/vision.ts"() {
8572
+ "use strict";
8573
+ init_input_content();
8574
+ init_messages();
8575
+ VisionNamespace = class {
8576
+ constructor(client) {
8577
+ this.client = client;
8578
+ }
8579
+ /**
8580
+ * Build a message builder with the image content attached.
8581
+ * Handles URLs, data URLs, base64 strings, and binary buffers.
8582
+ */
8583
+ buildImageMessage(options) {
8584
+ const builder = new LLMMessageBuilder();
8585
+ if (options.systemPrompt) {
8586
+ builder.addSystem(options.systemPrompt);
8587
+ }
8588
+ if (typeof options.image === "string") {
8589
+ if (options.image.startsWith("http://") || options.image.startsWith("https://")) {
8590
+ builder.addUserWithImageUrl(options.prompt, options.image);
8591
+ } else if (isDataUrl(options.image)) {
8592
+ const parsed = parseDataUrl(options.image);
8593
+ if (!parsed) {
8594
+ throw new Error("Invalid data URL format");
8595
+ }
8596
+ builder.addUserWithImage(
8597
+ options.prompt,
8598
+ parsed.data,
8599
+ parsed.mimeType
8600
+ );
8601
+ } else {
8602
+ const buffer = Buffer.from(options.image, "base64");
8603
+ builder.addUserWithImage(options.prompt, buffer, options.mimeType);
8604
+ }
8605
+ } else {
8606
+ builder.addUserWithImage(options.prompt, options.image, options.mimeType);
8607
+ }
8608
+ return builder;
8609
+ }
8610
+ /**
8611
+ * Stream the response and collect text and usage information.
8612
+ */
8613
+ async streamAndCollect(options, builder) {
8614
+ let response = "";
8615
+ let finalUsage;
8616
+ for await (const chunk of this.client.stream({
8617
+ model: options.model,
8618
+ messages: builder.build(),
8619
+ maxTokens: options.maxTokens,
8620
+ temperature: options.temperature
8621
+ })) {
8622
+ response += chunk.text;
8623
+ if (chunk.usage) {
8624
+ finalUsage = {
8625
+ inputTokens: chunk.usage.inputTokens,
8626
+ outputTokens: chunk.usage.outputTokens,
8627
+ totalTokens: chunk.usage.totalTokens
8628
+ };
8629
+ }
8630
+ }
8631
+ return { text: response.trim(), usage: finalUsage };
8632
+ }
8633
+ /**
8634
+ * Analyze an image with a vision-capable model.
8635
+ * Returns the analysis as a string.
8636
+ *
8637
+ * @param options - Vision analysis options
8638
+ * @returns Promise resolving to the analysis text
8639
+ * @throws Error if the image format is unsupported or the model doesn't support vision
8640
+ *
8641
+ * @example
8642
+ * ```typescript
8643
+ * // From file
8644
+ * const result = await llmist.vision.analyze({
8645
+ * model: "gpt-4o",
8646
+ * image: await fs.readFile("photo.jpg"),
8647
+ * prompt: "What's in this image?",
8648
+ * });
8649
+ *
8650
+ * // From URL (OpenAI only)
8651
+ * const result = await llmist.vision.analyze({
8652
+ * model: "gpt-4o",
8653
+ * image: "https://example.com/image.jpg",
8654
+ * prompt: "Describe this image",
8655
+ * });
8656
+ * ```
8657
+ */
8658
+ async analyze(options) {
8659
+ const builder = this.buildImageMessage(options);
8660
+ const { text: text3 } = await this.streamAndCollect(options, builder);
8661
+ return text3;
8662
+ }
8663
+ /**
8664
+ * Analyze an image and return detailed result with usage info.
8665
+ *
8666
+ * @param options - Vision analysis options
8667
+ * @returns Promise resolving to the analysis result with usage info
8668
+ */
8669
+ async analyzeWithUsage(options) {
8670
+ const builder = this.buildImageMessage(options);
8671
+ const { text: text3, usage } = await this.streamAndCollect(options, builder);
8672
+ return {
8673
+ text: text3,
8674
+ model: options.model,
8675
+ usage
8676
+ };
8677
+ }
8678
+ /**
8679
+ * Check if a model supports vision/image input.
8680
+ *
8681
+ * @param modelId - Model ID to check
8682
+ * @returns True if the model supports vision
8683
+ */
8684
+ supportsModel(modelId) {
8685
+ const spec = this.client.modelRegistry.getModelSpec(modelId);
8686
+ return spec?.features?.vision === true;
8687
+ }
8688
+ /**
8689
+ * List all models that support vision.
8690
+ *
8691
+ * @returns Array of model IDs that support vision
8692
+ */
8693
+ listModels() {
8694
+ return this.client.modelRegistry.listModels().filter((spec) => spec.features?.vision === true).map((spec) => spec.modelId);
8695
+ }
8696
+ };
8697
+ }
8698
+ });
8699
+
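The @example above covers analyze(); analyzeWithUsage() adds the model id and token usage on top of the same streaming path. A sketch with the supporting checks: `client` and the file path are assumptions, the option and result field names come from the code above.

```typescript
import { promises as fs } from "node:fs";

// Sketch only: `client` is an assumed LLMist instance.
if (!client.vision.supportsModel("gpt-4o")) {
  // supportsModel() reads the registry's features.vision flag
  throw new Error(`Pick a vision model: ${client.vision.listModels().join(", ")}`);
}

const { text, model, usage } = await client.vision.analyzeWithUsage({
  model: "gpt-4o",
  image: await fs.readFile("photo.jpg"),   // Buffer; http(s) URLs, data URLs and base64 strings also work
  prompt: "What's in this image?",
  maxTokens: 300,                          // optional, forwarded to the underlying stream() call
});
console.log(model, usage?.totalTokens, text);
```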
8700
+ // src/core/options.ts
8701
+ var ModelIdentifierParser;
8702
+ var init_options = __esm({
8703
+ "src/core/options.ts"() {
8704
+ "use strict";
8705
+ ModelIdentifierParser = class {
8706
+ constructor(defaultProvider = "openai") {
8707
+ this.defaultProvider = defaultProvider;
8708
+ }
8709
+ parse(identifier) {
8710
+ const trimmed = identifier.trim();
8711
+ if (!trimmed) {
8712
+ throw new Error("Model identifier cannot be empty");
8713
+ }
8714
+ const [maybeProvider, ...rest] = trimmed.split(":");
8715
+ if (rest.length === 0) {
8716
+ return { provider: this.defaultProvider, name: maybeProvider };
8717
+ }
8718
+ const provider = maybeProvider;
8719
+ const name = rest.join(":");
8720
+ if (!name) {
8721
+ throw new Error("Model name cannot be empty");
8722
+ }
8723
+ return { provider, name };
8724
+ }
8725
+ };
8726
+ }
8727
+ });
8728
+
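ModelIdentifierParser splits on the first ":" and keeps any further colons in the model name, falling back to the default provider when no prefix is given. The sketch below restates that behaviour; the specific model ids are placeholders.

```typescript
const parser = new ModelIdentifierParser();        // defaultProvider defaults to "openai"

parser.parse("anthropic:claude-sonnet");           // { provider: "anthropic", name: "claude-sonnet" }
parser.parse("gpt-4o");                            // { provider: "openai", name: "gpt-4o" }, no prefix so the default is used
parser.parse("openrouter:meta/llama:70b");         // { provider: "openrouter", name: "meta/llama:70b" }, extra ":" kept in the name
parser.parse("   ");                               // throws "Model identifier cannot be empty"
parser.parse("anthropic:");                        // throws "Model name cannot be empty"
```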
6793
8729
  // src/core/client.ts
6794
8730
  var client_exports = {};
6795
8731
  __export(client_exports, {
@@ -6802,12 +8738,22 @@ var init_client = __esm({
6802
8738
  init_builder();
6803
8739
  init_discovery();
6804
8740
  init_model_registry();
8741
+ init_image();
8742
+ init_speech();
8743
+ init_text();
8744
+ init_vision();
6805
8745
  init_options();
6806
8746
  init_quick_methods();
6807
8747
  LLMist = class _LLMist {
6808
8748
  parser;
8749
+ defaultProvider;
6809
8750
  modelRegistry;
6810
8751
  adapters;
8752
+ // Namespaces for different generation types
8753
+ text;
8754
+ image;
8755
+ speech;
8756
+ vision;
6811
8757
  constructor(...args) {
6812
8758
  let adapters = [];
6813
8759
  let defaultProvider;
@@ -6846,6 +8792,7 @@ var init_client = __esm({
6846
8792
  const priorityB = b.priority ?? 0;
6847
8793
  return priorityB - priorityA;
6848
8794
  });
8795
+ this.defaultProvider = resolvedDefaultProvider;
6849
8796
  this.parser = new ModelIdentifierParser(resolvedDefaultProvider);
6850
8797
  this.modelRegistry = new ModelRegistry();
6851
8798
  for (const adapter of this.adapters) {
@@ -6854,6 +8801,10 @@ var init_client = __esm({
6854
8801
  if (customModels.length > 0) {
6855
8802
  this.modelRegistry.registerModels(customModels);
6856
8803
  }
8804
+ this.text = new TextNamespace(this);
8805
+ this.image = new ImageNamespace(this.adapters, this.defaultProvider);
8806
+ this.speech = new SpeechNamespace(this.adapters, this.defaultProvider);
8807
+ this.vision = new VisionNamespace(this);
6857
8808
  }
6858
8809
  stream(options) {
6859
8810
  const descriptor = this.parser.parse(options.model);
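With this wiring the namespaces become plain properties that share the client's adapter list and default provider; constructing the client itself appears unchanged. A sketch of what is reachable on an instance (constructor arguments shown only as a placeholder):

```typescript
const client = new LLMist(/* adapters / options exactly as before */);

client.text;    // TextNamespace:   complete(), stream()
client.image;   // ImageNamespace:  generate(), listModels(), supportsModel()
client.speech;  // SpeechNamespace: generate(), listModels(), supportsModel()
client.vision;  // VisionNamespace: analyze(), analyzeWithUsage(), listModels(), supportsModel()
```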
@@ -7275,9 +9226,9 @@ function sleep(ms) {
7275
9226
  function generateInvocationId() {
7276
9227
  return `inv-${Date.now()}-${Math.random().toString(36).substring(2, 9)}`;
7277
9228
  }
7278
- function splitIntoChunks(text, minChunkSize = 5, maxChunkSize = 30) {
9229
+ function splitIntoChunks(text3, minChunkSize = 5, maxChunkSize = 30) {
7279
9230
  const chunks = [];
7280
- let remaining = text;
9231
+ let remaining = text3;
7281
9232
  while (remaining.length > 0) {
7282
9233
  const chunkSize = Math.min(
7283
9234
  Math.floor(Math.random() * (maxChunkSize - minChunkSize + 1)) + minChunkSize,
@@ -7336,17 +9287,17 @@ ${String(value)}
7336
9287
  return result;
7337
9288
  }
7338
9289
  function formatGadgetCalls(gadgetCalls) {
7339
- let text = "";
9290
+ let text3 = "";
7340
9291
  const calls = [];
7341
9292
  for (const call of gadgetCalls) {
7342
9293
  const invocationId = call.invocationId ?? generateInvocationId();
7343
9294
  calls.push({ name: call.gadgetName, invocationId });
7344
9295
  const blockParams = serializeToBlockFormat(call.parameters);
7345
- text += `
9296
+ text3 += `
7346
9297
  ${GADGET_START_PREFIX}${call.gadgetName}
7347
9298
  ${blockParams}${GADGET_END_PREFIX}`;
7348
9299
  }
7349
- return { text, calls };
9300
+ return { text: text3, calls };
7350
9301
  }
7351
9302
  async function* createMockStream(response) {
7352
9303
  if (response.delayMs) {
@@ -7386,9 +9337,9 @@ async function* createMockStream(response) {
7386
9337
  };
7387
9338
  }
7388
9339
  }
7389
- function createTextMockStream(text, options) {
9340
+ function createTextMockStream(text3, options) {
7390
9341
  return createMockStream({
7391
- text,
9342
+ text: text3,
7392
9343
  delayMs: options?.delayMs,
7393
9344
  streamDelayMs: options?.streamDelayMs,
7394
9345
  usage: options?.usage,
@@ -7405,10 +9356,10 @@ var MockProviderAdapter = class {
7405
9356
  constructor(options) {
7406
9357
  this.mockManager = getMockManager(options);
7407
9358
  }
7408
- supports(descriptor) {
9359
+ supports(_descriptor) {
7409
9360
  return true;
7410
9361
  }
7411
- stream(options, descriptor, spec) {
9362
+ stream(options, descriptor, _spec) {
7412
9363
  const context = {
7413
9364
  model: options.model,
7414
9365
  provider: descriptor.provider,
@@ -7419,20 +9370,154 @@ var MockProviderAdapter = class {
7419
9370
  return this.createMockStreamFromContext(context);
7420
9371
  }
7421
9372
  async *createMockStreamFromContext(context) {
7422
- try {
7423
- const mockResponse = await this.mockManager.findMatch(context);
7424
- if (!mockResponse) {
7425
- yield {
7426
- text: "",
7427
- finishReason: "stop",
7428
- usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
7429
- };
7430
- return;
7431
- }
7432
- yield* createMockStream(mockResponse);
7433
- } catch (error) {
7434
- throw error;
9373
+ const mockResponse = await this.mockManager.findMatch(context);
9374
+ if (!mockResponse) {
9375
+ yield {
9376
+ text: "",
9377
+ finishReason: "stop",
9378
+ usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
9379
+ };
9380
+ return;
9381
+ }
9382
+ yield* createMockStream(mockResponse);
9383
+ }
9384
+ // ==========================================================================
9385
+ // Image Generation Support
9386
+ // ==========================================================================
9387
+ /**
9388
+ * Check if this adapter supports image generation for a given model.
9389
+ * Returns true if there's a registered mock with images for this model.
9390
+ */
9391
+ supportsImageGeneration(_modelId) {
9392
+ return true;
9393
+ }
9394
+ /**
9395
+ * Generate mock images based on registered mocks.
9396
+ *
9397
+ * @param options - Image generation options
9398
+ * @returns Mock image generation result
9399
+ */
9400
+ async generateImage(options) {
9401
+ const context = {
9402
+ model: options.model,
9403
+ provider: "mock",
9404
+ modelName: options.model,
9405
+ options: {
9406
+ model: options.model,
9407
+ messages: [{ role: "user", content: options.prompt }]
9408
+ },
9409
+ messages: [{ role: "user", content: options.prompt }]
9410
+ };
9411
+ const mockResponse = await this.mockManager.findMatch(context);
9412
+ if (!mockResponse?.images || mockResponse.images.length === 0) {
9413
+ throw new Error(
9414
+ `No mock registered for image generation with model "${options.model}". Use mockLLM().forModel("${options.model}").returnsImage(...).register() to add one.`
9415
+ );
9416
+ }
9417
+ return this.createImageResult(options, mockResponse);
9418
+ }
9419
+ /**
9420
+ * Transform mock response into ImageGenerationResult format.
9421
+ *
9422
+ * @param options - Original image generation options
9423
+ * @param mockResponse - Mock response containing image data
9424
+ * @returns ImageGenerationResult with mock data and zero cost
9425
+ */
9426
+ createImageResult(options, mockResponse) {
9427
+ const images = mockResponse.images ?? [];
9428
+ return {
9429
+ images: images.map((img) => ({
9430
+ b64Json: img.data,
9431
+ revisedPrompt: img.revisedPrompt
9432
+ })),
9433
+ model: options.model,
9434
+ usage: {
9435
+ imagesGenerated: images.length,
9436
+ size: options.size ?? "1024x1024",
9437
+ quality: options.quality ?? "standard"
9438
+ },
9439
+ cost: 0
9440
+ // Mock cost is always 0
9441
+ };
9442
+ }
9443
+ // ==========================================================================
9444
+ // Speech Generation Support
9445
+ // ==========================================================================
9446
+ /**
9447
+ * Check if this adapter supports speech generation for a given model.
9448
+ * Returns true if there's a registered mock with audio for this model.
9449
+ */
9450
+ supportsSpeechGeneration(_modelId) {
9451
+ return true;
9452
+ }
9453
+ /**
9454
+ * Generate mock speech based on registered mocks.
9455
+ *
9456
+ * @param options - Speech generation options
9457
+ * @returns Mock speech generation result
9458
+ */
9459
+ async generateSpeech(options) {
9460
+ const context = {
9461
+ model: options.model,
9462
+ provider: "mock",
9463
+ modelName: options.model,
9464
+ options: {
9465
+ model: options.model,
9466
+ messages: [{ role: "user", content: options.input }]
9467
+ },
9468
+ messages: [{ role: "user", content: options.input }]
9469
+ };
9470
+ const mockResponse = await this.mockManager.findMatch(context);
9471
+ if (!mockResponse?.audio) {
9472
+ throw new Error(
9473
+ `No mock registered for speech generation with model "${options.model}". Use mockLLM().forModel("${options.model}").returnsAudio(...).register() to add one.`
9474
+ );
9475
+ }
9476
+ return this.createSpeechResult(options, mockResponse);
9477
+ }
9478
+ /**
9479
+ * Transform mock response into SpeechGenerationResult format.
9480
+ * Converts base64 audio data to ArrayBuffer.
9481
+ *
9482
+ * @param options - Original speech generation options
9483
+ * @param mockResponse - Mock response containing audio data
9484
+ * @returns SpeechGenerationResult with mock data and zero cost
9485
+ */
9486
+ createSpeechResult(options, mockResponse) {
9487
+ const audio = mockResponse.audio;
9488
+ const binaryString = atob(audio.data);
9489
+ const bytes = new Uint8Array(binaryString.length);
9490
+ for (let i = 0; i < binaryString.length; i++) {
9491
+ bytes[i] = binaryString.charCodeAt(i);
7435
9492
  }
9493
+ const format = this.mimeTypeToAudioFormat(audio.mimeType);
9494
+ return {
9495
+ audio: bytes.buffer,
9496
+ model: options.model,
9497
+ usage: {
9498
+ characterCount: options.input.length
9499
+ },
9500
+ cost: 0,
9501
+ // Mock cost is always 0
9502
+ format
9503
+ };
9504
+ }
9505
+ /**
9506
+ * Map MIME type to audio format for SpeechGenerationResult.
9507
+ * Defaults to "mp3" for unknown MIME types.
9508
+ *
9509
+ * @param mimeType - Audio MIME type string
9510
+ * @returns Audio format identifier
9511
+ */
9512
+ mimeTypeToAudioFormat(mimeType) {
9513
+ const mapping = {
9514
+ "audio/mp3": "mp3",
9515
+ "audio/mpeg": "mp3",
9516
+ "audio/wav": "wav",
9517
+ "audio/webm": "opus",
9518
+ "audio/ogg": "opus"
9519
+ };
9520
+ return mapping[mimeType] ?? "mp3";
7436
9521
  }
7437
9522
  };
7438
9523
  function createMockAdapter(options) {
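For tests, the mock adapter resolves generateImage() and generateSpeech() through the same mockManager.findMatch() path as text, so an image or audio mock must be registered first or the call throws with the hint shown above. A hedged end-to-end sketch: the LLMist constructor shape and the PNG fixture are assumptions, while mockLLM(), forModel(), returnsImage(), register() and createMockAdapter() are all named in this diff.

```typescript
// Register a mock image for "dall-e-3" (MIME type sniffed from the buffer's magic bytes).
mockLLM()
  .forModel("dall-e-3")
  .returnsImage(pngFixture)                 // pngFixture: placeholder Buffer for this sketch
  .register();

// Assumed client wiring: route requests through the mock adapter.
const client = new LLMist({ adapters: [createMockAdapter()] });
const result = await client.image.generate({ model: "dall-e-3", prompt: "anything" });

// createImageResult() always reports zero cost and echoes the size/quality defaults:
// result.cost === 0, result.usage.size === "1024x1024", result.images[0].b64Json is the base64 PNG
```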
@@ -7440,6 +9525,20 @@ function createMockAdapter(options) {
7440
9525
  }
7441
9526
 
7442
9527
  // src/testing/mock-builder.ts
9528
+ init_input_content();
9529
+ init_messages();
9530
+ function hasImageContent(content) {
9531
+ if (typeof content === "string") return false;
9532
+ return content.some((part) => isImagePart(part));
9533
+ }
9534
+ function hasAudioContent(content) {
9535
+ if (typeof content === "string") return false;
9536
+ return content.some((part) => isAudioPart(part));
9537
+ }
9538
+ function countImages(content) {
9539
+ if (typeof content === "string") return 0;
9540
+ return content.filter((part) => isImagePart(part)).length;
9541
+ }
7443
9542
  var MockBuilder = class {
7444
9543
  matchers = [];
7445
9544
  response = {};
@@ -7502,9 +9601,9 @@ var MockBuilder = class {
7502
9601
  * @example
7503
9602
  * mockLLM().whenMessageContains('hello')
7504
9603
  */
7505
- whenMessageContains(text) {
9604
+ whenMessageContains(text3) {
7506
9605
  this.matchers.push(
7507
- (ctx) => ctx.messages.some((msg) => msg.content?.toLowerCase().includes(text.toLowerCase()))
9606
+ (ctx) => ctx.messages.some((msg) => extractText(msg.content).toLowerCase().includes(text3.toLowerCase()))
7508
9607
  );
7509
9608
  return this;
7510
9609
  }
@@ -7514,10 +9613,11 @@ var MockBuilder = class {
7514
9613
  * @example
7515
9614
  * mockLLM().whenLastMessageContains('goodbye')
7516
9615
  */
7517
- whenLastMessageContains(text) {
9616
+ whenLastMessageContains(text3) {
7518
9617
  this.matchers.push((ctx) => {
7519
9618
  const lastMsg = ctx.messages[ctx.messages.length - 1];
7520
- return lastMsg?.content?.toLowerCase().includes(text.toLowerCase()) ?? false;
9619
+ if (!lastMsg) return false;
9620
+ return extractText(lastMsg.content).toLowerCase().includes(text3.toLowerCase());
7521
9621
  });
7522
9622
  return this;
7523
9623
  }
@@ -7528,7 +9628,7 @@ var MockBuilder = class {
7528
9628
  * mockLLM().whenMessageMatches(/calculate \d+/)
7529
9629
  */
7530
9630
  whenMessageMatches(regex) {
7531
- this.matchers.push((ctx) => ctx.messages.some((msg) => regex.test(msg.content ?? "")));
9631
+ this.matchers.push((ctx) => ctx.messages.some((msg) => regex.test(extractText(msg.content))));
7532
9632
  return this;
7533
9633
  }
7534
9634
  /**
@@ -7537,10 +9637,10 @@ var MockBuilder = class {
7537
9637
  * @example
7538
9638
  * mockLLM().whenRoleContains('system', 'You are a helpful assistant')
7539
9639
  */
7540
- whenRoleContains(role, text) {
9640
+ whenRoleContains(role, text3) {
7541
9641
  this.matchers.push(
7542
9642
  (ctx) => ctx.messages.some(
7543
- (msg) => msg.role === role && msg.content?.toLowerCase().includes(text.toLowerCase())
9643
+ (msg) => msg.role === role && extractText(msg.content).toLowerCase().includes(text3.toLowerCase())
7544
9644
  )
7545
9645
  );
7546
9646
  return this;
@@ -7568,6 +9668,43 @@ var MockBuilder = class {
7568
9668
  this.matchers.push(matcher);
7569
9669
  return this;
7570
9670
  }
9671
+ // ==========================================================================
9672
+ // Multimodal Matchers
9673
+ // ==========================================================================
9674
+ /**
9675
+ * Match when any message contains an image.
9676
+ *
9677
+ * @example
9678
+ * mockLLM().whenMessageHasImage().returns("I see an image of a sunset.")
9679
+ */
9680
+ whenMessageHasImage() {
9681
+ this.matchers.push((ctx) => ctx.messages.some((msg) => hasImageContent(msg.content)));
9682
+ return this;
9683
+ }
9684
+ /**
9685
+ * Match when any message contains audio.
9686
+ *
9687
+ * @example
9688
+ * mockLLM().whenMessageHasAudio().returns("I hear music playing.")
9689
+ */
9690
+ whenMessageHasAudio() {
9691
+ this.matchers.push((ctx) => ctx.messages.some((msg) => hasAudioContent(msg.content)));
9692
+ return this;
9693
+ }
9694
+ /**
9695
+ * Match based on the number of images in the last message.
9696
+ *
9697
+ * @example
9698
+ * mockLLM().whenImageCount((n) => n >= 2).returns("Comparing multiple images...")
9699
+ */
9700
+ whenImageCount(predicate) {
9701
+ this.matchers.push((ctx) => {
9702
+ const lastMsg = ctx.messages[ctx.messages.length - 1];
9703
+ if (!lastMsg) return false;
9704
+ return predicate(countImages(lastMsg.content));
9705
+ });
9706
+ return this;
9707
+ }
7571
9708
  /**
7572
9709
  * Set the text response to return.
7573
9710
  * Can be a static string or a function that returns a string dynamically.
@@ -7577,17 +9714,17 @@ var MockBuilder = class {
7577
9714
  * mockLLM().returns(() => `Response at ${Date.now()}`)
7578
9715
  * mockLLM().returns((ctx) => `You said: ${ctx.messages[0]?.content}`)
7579
9716
  */
7580
- returns(text) {
7581
- if (typeof text === "function") {
9717
+ returns(text3) {
9718
+ if (typeof text3 === "function") {
7582
9719
  this.response = async (ctx) => {
7583
- const resolvedText = await Promise.resolve().then(() => text(ctx));
9720
+ const resolvedText = await Promise.resolve().then(() => text3(ctx));
7584
9721
  return { text: resolvedText };
7585
9722
  };
7586
9723
  } else {
7587
9724
  if (typeof this.response === "function") {
7588
9725
  throw new Error("Cannot use returns() after withResponse() with a function");
7589
9726
  }
7590
- this.response.text = text;
9727
+ this.response.text = text3;
7591
9728
  }
7592
9729
  return this;
7593
9730
  }
@@ -7624,6 +9761,112 @@ var MockBuilder = class {
7624
9761
  this.response.gadgetCalls.push({ gadgetName, parameters });
7625
9762
  return this;
7626
9763
  }
9764
+ // ==========================================================================
9765
+ // Multimodal Response Helpers
9766
+ // ==========================================================================
9767
+ /**
9768
+ * Return a single image in the response.
9769
+ * Useful for mocking image generation endpoints.
9770
+ *
9771
+ * @param data - Image data (base64 string or Buffer)
9772
+ * @param mimeType - MIME type (auto-detected if Buffer provided without type)
9773
+ *
9774
+ * @example
9775
+ * mockLLM()
9776
+ * .forModel('dall-e-3')
9777
+ * .returnsImage(pngBuffer)
9778
+ * .register();
9779
+ */
9780
+ returnsImage(data, mimeType) {
9781
+ if (typeof this.response === "function") {
9782
+ throw new Error("Cannot use returnsImage() after withResponse() with a function");
9783
+ }
9784
+ let imageData;
9785
+ let imageMime;
9786
+ if (typeof data === "string") {
9787
+ imageData = data;
9788
+ if (!mimeType) {
9789
+ throw new Error("MIME type is required when providing base64 string data");
9790
+ }
9791
+ imageMime = mimeType;
9792
+ } else {
9793
+ imageData = toBase64(data);
9794
+ const detected = mimeType ?? detectImageMimeType(data);
9795
+ if (!detected) {
9796
+ throw new Error(
9797
+ "Could not detect image MIME type. Please provide the mimeType parameter explicitly."
9798
+ );
9799
+ }
9800
+ imageMime = detected;
9801
+ }
9802
+ if (!this.response.images) {
9803
+ this.response.images = [];
9804
+ }
9805
+ this.response.images.push({ data: imageData, mimeType: imageMime });
9806
+ return this;
9807
+ }
9808
+ /**
9809
+ * Return multiple images in the response.
9810
+ *
9811
+ * @example
9812
+ * mockLLM()
9813
+ * .forModel('dall-e-3')
9814
+ * .returnsImages([
9815
+ * { data: pngBuffer1 },
9816
+ * { data: pngBuffer2 },
9817
+ * ])
9818
+ * .register();
9819
+ */
9820
+ returnsImages(images) {
9821
+ for (const img of images) {
9822
+ this.returnsImage(img.data, img.mimeType);
9823
+ if (img.revisedPrompt && this.response && typeof this.response !== "function") {
9824
+ const lastImage = this.response.images?.[this.response.images.length - 1];
9825
+ if (lastImage) {
9826
+ lastImage.revisedPrompt = img.revisedPrompt;
9827
+ }
9828
+ }
9829
+ }
9830
+ return this;
9831
+ }
9832
+ /**
9833
+ * Return audio data in the response.
9834
+ * Useful for mocking speech synthesis endpoints.
9835
+ *
9836
+ * @param data - Audio data (base64 string or Buffer)
9837
+ * @param mimeType - MIME type (auto-detected if Buffer provided without type)
9838
+ *
9839
+ * @example
9840
+ * mockLLM()
9841
+ * .forModel('tts-1')
9842
+ * .returnsAudio(mp3Buffer)
9843
+ * .register();
9844
+ */
9845
+ returnsAudio(data, mimeType) {
9846
+ if (typeof this.response === "function") {
9847
+ throw new Error("Cannot use returnsAudio() after withResponse() with a function");
9848
+ }
9849
+ let audioData;
9850
+ let audioMime;
9851
+ if (typeof data === "string") {
9852
+ audioData = data;
9853
+ if (!mimeType) {
9854
+ throw new Error("MIME type is required when providing base64 string data");
9855
+ }
9856
+ audioMime = mimeType;
9857
+ } else {
9858
+ audioData = toBase64(data);
9859
+ const detected = mimeType ?? detectAudioMimeType(data);
9860
+ if (!detected) {
9861
+ throw new Error(
9862
+ "Could not detect audio MIME type. Please provide the mimeType parameter explicitly."
9863
+ );
9864
+ }
9865
+ audioMime = detected;
9866
+ }
9867
+ this.response.audio = { data: audioData, mimeType: audioMime };
9868
+ return this;
9869
+ }
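returnsAudio() mirrors returnsImage(): a base64 string needs an explicit MIME type, a Buffer is sniffed with detectAudioMimeType(), and the stored MIME type is what mimeTypeToAudioFormat() later turns into the result's "mp3"/"wav"/"opus" format. A sketch; the model ids are illustrative and the fixture path and base64 variable are placeholders.

```typescript
import { readFileSync } from "node:fs";

// Buffer path: MIME detected from magic bytes (audio/wav here, so the mocked result's format is "wav").
mockLLM()
  .forModel("tts-1")
  .returnsAudio(readFileSync("fixtures/hello.wav"))
  .register();

// Base64 path: the MIME type must be given explicitly or registration throws.
mockLLM()
  .forModel("tts-1-hd")
  .returnsAudio(base64Mp3, "audio/mpeg")   // base64Mp3: placeholder base64 string
  .register();
```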
7627
9870
  /**
7628
9871
  * Set the complete mock response object.
7629
9872
  * This allows full control over all response properties.
@@ -7954,23 +10197,23 @@ function createTestStream(chunks) {
7954
10197
  }
7955
10198
  }();
7956
10199
  }
7957
- function createTextStream(text, options) {
10200
+ function createTextStream(text3, options) {
7958
10201
  return async function* () {
7959
10202
  if (options?.delayMs) {
7960
10203
  await sleep2(options.delayMs);
7961
10204
  }
7962
- const chunkSize = options?.chunkSize ?? text.length;
10205
+ const chunkSize = options?.chunkSize ?? text3.length;
7963
10206
  const chunks = [];
7964
- for (let i = 0; i < text.length; i += chunkSize) {
7965
- chunks.push(text.slice(i, i + chunkSize));
10207
+ for (let i = 0; i < text3.length; i += chunkSize) {
10208
+ chunks.push(text3.slice(i, i + chunkSize));
7966
10209
  }
7967
10210
  for (let i = 0; i < chunks.length; i++) {
7968
10211
  const isLast = i === chunks.length - 1;
7969
10212
  const chunk = { text: chunks[i] };
7970
10213
  if (isLast) {
7971
10214
  chunk.finishReason = options?.finishReason ?? "stop";
7972
- const inputTokens = Math.ceil(text.length / 4);
7973
- const outputTokens = Math.ceil(text.length / 4);
10215
+ const inputTokens = Math.ceil(text3.length / 4);
10216
+ const outputTokens = Math.ceil(text3.length / 4);
7974
10217
  chunk.usage = options?.usage ?? {
7975
10218
  inputTokens,
7976
10219
  outputTokens,
@@ -7992,11 +10235,11 @@ async function collectStream(stream2) {
7992
10235
  return chunks;
7993
10236
  }
7994
10237
  async function collectStreamText(stream2) {
7995
- let text = "";
10238
+ let text3 = "";
7996
10239
  for await (const chunk of stream2) {
7997
- text += chunk.text ?? "";
10240
+ text3 += chunk.text ?? "";
7998
10241
  }
7999
- return text;
10242
+ return text3;
8000
10243
  }
8001
10244
  async function getStreamFinalChunk(stream2) {
8002
10245
  let lastChunk;
@@ -8378,6 +10621,21 @@ function filterDefinedEnv(env) {
8378
10621
  }
8379
10622
 
8380
10623
  export {
10624
+ isTextPart,
10625
+ isImagePart,
10626
+ isAudioPart,
10627
+ text,
10628
+ imageFromBase64,
10629
+ imageFromUrl,
10630
+ detectImageMimeType,
10631
+ detectAudioMimeType,
10632
+ toBase64,
10633
+ imageFromBuffer,
10634
+ audioFromBase64,
10635
+ audioFromBuffer,
10636
+ isDataUrl,
10637
+ parseDataUrl,
10638
+ init_input_content,
8381
10639
  MODEL_ALIASES,
8382
10640
  resolveModel,
8383
10641
  hasProviderPrefix,
@@ -8394,6 +10652,8 @@ export {
8394
10652
  resolveRulesTemplate,
8395
10653
  resolveHintTemplate,
8396
10654
  init_prompt_config,
10655
+ normalizeContent,
10656
+ extractText,
8397
10657
  LLMMessageBuilder,
8398
10658
  init_messages,
8399
10659
  BreakLoopException,
@@ -8450,11 +10710,11 @@ export {
8450
10710
  init_discovery,
8451
10711
  ModelRegistry,
8452
10712
  init_model_registry,
8453
- ModelIdentifierParser,
8454
- init_options,
8455
10713
  complete,
8456
10714
  stream,
8457
10715
  init_quick_methods,
10716
+ ModelIdentifierParser,
10717
+ init_options,
8458
10718
  LLMist,
8459
10719
  init_client,
8460
10720
  AgentBuilder,
@@ -8501,4 +10761,4 @@ export {
8501
10761
  MockPromptRecorder,
8502
10762
  waitFor
8503
10763
  };
8504
- //# sourceMappingURL=chunk-GANXNBIZ.js.map
10764
+ //# sourceMappingURL=chunk-YHS2DYXP.js.map