llmist 2.4.0 → 2.5.0

This diff reflects the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
@@ -861,8 +861,21 @@ interface ParsedGadgetCall {
     parametersRaw: string;
     parameters?: Record<string, unknown>;
     parseError?: string;
+    /** List of invocation IDs this gadget depends on. Empty array if no dependencies. */
+    dependencies: string[];
 }
 
+/** Event emitted when a gadget is skipped due to a failed dependency */
+interface GadgetSkippedEvent {
+    type: "gadget_skipped";
+    gadgetName: string;
+    invocationId: string;
+    parameters: Record<string, unknown>;
+    /** The invocation ID of the dependency that failed */
+    failedDependency: string;
+    /** The error message from the failed dependency */
+    failedDependencyError: string;
+}
 type StreamEvent = {
     type: "text";
     content: string;
@@ -872,7 +885,7 @@ type StreamEvent = {
 } | {
     type: "gadget_result";
     result: GadgetExecutionResult;
-} | {
+} | GadgetSkippedEvent | {
     type: "human_input_required";
     question: string;
     gadgetName: string;
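
The new `gadget_skipped` member of the `StreamEvent` union surfaces dependency failures directly in the agent's event stream. A minimal consumption sketch, assuming the `LLMist.createAgent()` / `agent.run()` entry points shown in the `AgentBuilder` examples later in this diff (the model and prompt are placeholders):

```typescript
import { LLMist } from "llmist";

const agent = LLMist.createAgent()
    .withModel("gpt-4o")
    .ask("Fetch the report, then summarize it");

for await (const event of agent.run()) {
    if (event.type === "text") {
        process.stdout.write(event.content);
    } else if (event.type === "gadget_skipped") {
        // GadgetSkippedEvent: this gadget never ran because a dependency failed.
        console.warn(
            `${event.gadgetName} (${event.invocationId}) skipped: ` +
            `dependency ${event.failedDependency} failed with "${event.failedDependencyError}"`,
        );
    }
}
```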
@@ -1321,6 +1334,215 @@ declare abstract class BaseGadget {
     }): string;
 }
 
+/**
+ * Types and interfaces for multimodal input content.
+ *
+ * These types define the structure for sending images, audio, and other
+ * media alongside text in LLM messages. They complement the output types
+ * in media-types.ts.
+ */
+/**
+ * Supported image MIME types for input.
+ * All major providers support these formats.
+ */
+type ImageMimeType = "image/jpeg" | "image/png" | "image/gif" | "image/webp";
+/**
+ * Supported audio MIME types for input.
+ * Currently only Gemini supports audio input.
+ */
+type AudioMimeType = "audio/mp3" | "audio/mpeg" | "audio/wav" | "audio/webm" | "audio/ogg";
+/**
+ * Base interface for all content parts.
+ */
+interface BaseContentPart {
+    type: string;
+}
+/**
+ * Text content part.
+ */
+interface TextContentPart extends BaseContentPart {
+    type: "text";
+    text: string;
+}
+/**
+ * Image content part.
+ */
+interface ImageContentPart extends BaseContentPart {
+    type: "image";
+    source: ImageSource;
+}
+/**
+ * Audio content part.
+ * Currently only supported by Gemini.
+ */
+interface AudioContentPart extends BaseContentPart {
+    type: "audio";
+    source: AudioSource;
+}
+/**
+ * Union of all supported content part types.
+ */
+type ContentPart = TextContentPart | ImageContentPart | AudioContentPart;
+/**
+ * Image can come from base64 data or a URL.
+ */
+type ImageSource = ImageBase64Source | ImageUrlSource;
+/**
+ * Base64-encoded image data.
+ * Supported by all providers.
+ */
+interface ImageBase64Source {
+    type: "base64";
+    mediaType: ImageMimeType;
+    data: string;
+}
+/**
+ * Image URL reference.
+ * Only supported by OpenAI.
+ */
+interface ImageUrlSource {
+    type: "url";
+    url: string;
+}
+/**
+ * Audio source (base64 only).
+ * URL sources are not currently supported for audio.
+ */
+interface AudioSource {
+    type: "base64";
+    mediaType: AudioMimeType;
+    data: string;
+}
+/**
+ * Check if a content part is a text part.
+ */
+declare function isTextPart(part: ContentPart): part is TextContentPart;
+/**
+ * Check if a content part is an image part.
+ */
+declare function isImagePart(part: ContentPart): part is ImageContentPart;
+/**
+ * Check if a content part is an audio part.
+ */
+declare function isAudioPart(part: ContentPart): part is AudioContentPart;
+/**
+ * Create a text content part.
+ *
+ * @example
+ * ```typescript
+ * const part = text("What's in this image?");
+ * ```
+ */
+declare function text(content: string): TextContentPart;
+/**
+ * Create an image content part from base64-encoded data.
+ *
+ * @param data - Base64-encoded image data
+ * @param mediaType - MIME type of the image
+ *
+ * @example
+ * ```typescript
+ * const part = imageFromBase64(base64Data, "image/jpeg");
+ * ```
+ */
+declare function imageFromBase64(data: string, mediaType: ImageMimeType): ImageContentPart;
+/**
+ * Create an image content part from a URL.
+ * Note: Only supported by OpenAI.
+ *
+ * @param url - URL to the image (must be accessible)
+ *
+ * @example
+ * ```typescript
+ * const part = imageFromUrl("https://example.com/image.jpg");
+ * ```
+ */
+declare function imageFromUrl(url: string): ImageContentPart;
+/**
+ * Detect the MIME type of image data from magic bytes.
+ *
+ * @param data - Raw image data
+ * @returns Detected MIME type or null if unknown
+ */
+declare function detectImageMimeType(data: Buffer | Uint8Array): ImageMimeType | null;
+/**
+ * Detect the MIME type of audio data from magic bytes.
+ *
+ * @param data - Raw audio data
+ * @returns Detected MIME type or null if unknown
+ */
+declare function detectAudioMimeType(data: Buffer | Uint8Array): AudioMimeType | null;
+/**
+ * Convert data to base64 string.
+ *
+ * @param data - Data to encode (Buffer, Uint8Array, or already base64 string)
+ * @returns Base64-encoded string
+ */
+declare function toBase64(data: Buffer | Uint8Array | string): string;
+/**
+ * Create an image content part from a Buffer or Uint8Array.
+ * Automatically detects the MIME type if not provided.
+ *
+ * @param buffer - Image data
+ * @param mediaType - Optional MIME type (auto-detected if not provided)
+ *
+ * @example
+ * ```typescript
+ * const imageData = await fs.readFile("photo.jpg");
+ * const part = imageFromBuffer(imageData); // Auto-detects JPEG
+ * ```
+ */
+declare function imageFromBuffer(buffer: Buffer | Uint8Array, mediaType?: ImageMimeType): ImageContentPart;
+/**
+ * Create an audio content part from base64-encoded data.
+ *
+ * @param data - Base64-encoded audio data
+ * @param mediaType - MIME type of the audio
+ *
+ * @example
+ * ```typescript
+ * const part = audioFromBase64(base64Audio, "audio/mp3");
+ * ```
+ */
+declare function audioFromBase64(data: string, mediaType: AudioMimeType): AudioContentPart;
+/**
+ * Create an audio content part from a Buffer or Uint8Array.
+ * Automatically detects the MIME type if not provided.
+ *
+ * @param buffer - Audio data
+ * @param mediaType - Optional MIME type (auto-detected if not provided)
+ *
+ * @example
+ * ```typescript
+ * const audioData = await fs.readFile("audio.mp3");
+ * const part = audioFromBuffer(audioData); // Auto-detects MP3
+ * ```
+ */
+declare function audioFromBuffer(buffer: Buffer | Uint8Array, mediaType?: AudioMimeType): AudioContentPart;
+/**
+ * Check if a string is a data URL.
+ *
+ * @param input - String to check
+ * @returns True if it's a data URL
+ */
+declare function isDataUrl(input: string): boolean;
+/**
+ * Parse a data URL into its components.
+ *
+ * @param url - Data URL to parse
+ * @returns Parsed components or null if invalid
+ *
+ * @example
+ * ```typescript
+ * const result = parseDataUrl("data:image/jpeg;base64,/9j/4AAQ...");
+ * // { mimeType: "image/jpeg", data: "/9j/4AAQ..." }
+ * ```
+ */
+declare function parseDataUrl(url: string): {
+    mimeType: string;
+    data: string;
+} | null;
+
 /**
  * Context provided to prompt template functions for rendering dynamic content.
  */
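
Taken together, the detection and data-URL helpers let callers normalize arbitrary image input before building a content part. A small sketch using only the functions declared above, assuming they are re-exported from the package root as the `askWithContent` example later in this diff suggests (the JPEG fallback for undetectable base64 strings is an assumption):

```typescript
import {
    detectImageMimeType,
    imageFromBase64,
    isDataUrl,
    parseDataUrl,
    toBase64,
    type ImageContentPart,
    type ImageMimeType,
} from "llmist";

// Sketch: turn a data URL, base64 string, or raw bytes into an ImageContentPart.
function toImagePart(input: string | Buffer | Uint8Array): ImageContentPart {
    if (typeof input === "string" && isDataUrl(input)) {
        const parsed = parseDataUrl(input);
        if (!parsed) throw new Error("Invalid data URL");
        // parseDataUrl types mimeType as a plain string; we assume it is one
        // of the supported ImageMimeType values here.
        return imageFromBase64(parsed.data, parsed.mimeType as ImageMimeType);
    }
    if (typeof input === "string") {
        // Already base64; assume JPEG since magic bytes are not available.
        return imageFromBase64(input, "image/jpeg");
    }
    const detected = detectImageMimeType(input);
    if (!detected) throw new Error("Unrecognized image format");
    return imageFromBase64(toBase64(input), detected);
}
```

For the common buffer case, `imageFromBuffer` already bundles the detect-and-encode steps; the sketch is only needed when input may also arrive as a data URL or pre-encoded string.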
@@ -1454,12 +1676,33 @@ declare function resolveRulesTemplate(rules: PromptConfig["rules"] | undefined,
 declare function resolveHintTemplate(template: HintTemplate | undefined, defaultValue: string, context: HintContext): string;
 
 type LLMRole = "system" | "user" | "assistant";
+/**
+ * Message content can be a simple string (text only) or an array of content parts (multimodal).
+ * Using a string is simpler for text-only messages, while arrays support images and audio.
+ */
+type MessageContent = string | ContentPart[];
 interface LLMMessage {
     role: LLMRole;
-    content: string;
+    content: MessageContent;
     name?: string;
     metadata?: Record<string, unknown>;
 }
+/**
+ * Normalize message content to an array of content parts.
+ * Converts string content to a single text part.
+ *
+ * @param content - Message content (string or ContentPart[])
+ * @returns Array of content parts
+ */
+declare function normalizeContent(content: MessageContent): ContentPart[];
+/**
+ * Extract text from message content.
+ * Concatenates all text parts in the content.
+ *
+ * @param content - Message content (string or ContentPart[])
+ * @returns Combined text from all text parts
+ */
+declare function extractText(content: MessageContent): string;
 declare class LLMMessageBuilder {
     private readonly messages;
     private startPrefix;
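
`normalizeContent` and `extractText` give code that consumes `LLMMessage` values a uniform view regardless of whether the content is a string or an array of parts. A short usage sketch built only from the declarations above:

```typescript
import { extractText, isImagePart, normalizeContent, type LLMMessage } from "llmist";

const message: LLMMessage = {
    role: "user",
    content: [
        { type: "text", text: "What's in this image?" },
        {
            type: "image",
            source: { type: "base64", mediaType: "image/png", data: "iVBORw0..." }, // truncated placeholder
        },
    ],
};

// Always an array of parts, even when content is a plain string.
const parts = normalizeContent(message.content);
const imageCount = parts.filter(isImagePart).length; // 1

// Concatenated text from all text parts: "What's in this image?"
const prompt = extractText(message.content);
```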
@@ -1482,8 +1725,92 @@ declare class LLMMessageBuilder {
     private buildUsageSection;
     private buildExamplesSection;
     private buildRulesSection;
-    addUser(content: string, metadata?: Record<string, unknown>): this;
+    /**
+     * Add a user message.
+     * Content can be a string (text only) or an array of content parts (multimodal).
+     *
+     * @param content - Message content
+     * @param metadata - Optional metadata
+     *
+     * @example
+     * ```typescript
+     * // Text only
+     * builder.addUser("Hello!");
+     *
+     * // Multimodal
+     * builder.addUser([
+     *   text("What's in this image?"),
+     *   imageFromBuffer(imageData),
+     * ]);
+     * ```
+     */
+    addUser(content: MessageContent, metadata?: Record<string, unknown>): this;
     addAssistant(content: string, metadata?: Record<string, unknown>): this;
+    /**
+     * Add a user message with an image attachment.
+     *
+     * @param textContent - Text prompt
+     * @param imageData - Image data (Buffer, Uint8Array, or base64 string)
+     * @param mimeType - Optional MIME type (auto-detected if not provided)
+     *
+     * @example
+     * ```typescript
+     * builder.addUserWithImage(
+     *   "What's in this image?",
+     *   await fs.readFile("photo.jpg"),
+     *   "image/jpeg" // Optional - auto-detected
+     * );
+     * ```
+     */
+    addUserWithImage(textContent: string, imageData: Buffer | Uint8Array | string, mimeType?: ImageMimeType): this;
+    /**
+     * Add a user message with an image URL (OpenAI only).
+     *
+     * @param textContent - Text prompt
+     * @param imageUrl - URL to the image
+     *
+     * @example
+     * ```typescript
+     * builder.addUserWithImageUrl(
+     *   "What's in this image?",
+     *   "https://example.com/image.jpg"
+     * );
+     * ```
+     */
+    addUserWithImageUrl(textContent: string, imageUrl: string): this;
+    /**
+     * Add a user message with an audio attachment (Gemini only).
+     *
+     * @param textContent - Text prompt
+     * @param audioData - Audio data (Buffer, Uint8Array, or base64 string)
+     * @param mimeType - Optional MIME type (auto-detected if not provided)
+     *
+     * @example
+     * ```typescript
+     * builder.addUserWithAudio(
+     *   "Transcribe this audio",
+     *   await fs.readFile("recording.mp3"),
+     *   "audio/mp3" // Optional - auto-detected
+     * );
+     * ```
+     */
+    addUserWithAudio(textContent: string, audioData: Buffer | Uint8Array | string, mimeType?: AudioMimeType): this;
+    /**
+     * Add a user message with multiple content parts.
+     * Provides full flexibility for complex multimodal messages.
+     *
+     * @param parts - Array of content parts
+     *
+     * @example
+     * ```typescript
+     * builder.addUserMultimodal([
+     *   text("Compare these images:"),
+     *   imageFromBuffer(image1),
+     *   imageFromBuffer(image2),
+     * ]);
+     * ```
+     */
+    addUserMultimodal(parts: ContentPart[]): this;
     addGadgetCall(gadget: string, parameters: Record<string, unknown>, result: string): this;
     /**
      * Format parameters as Block format with JSON Pointer paths.
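
Since every `addUser*` method returns `this`, multimodal turns chain exactly like the existing text-only API. A sketch combining the new methods; constructing the builder directly with `new LLMMessageBuilder()` is an assumption, as the constructor is not visible in this diff:

```typescript
import { LLMMessageBuilder } from "llmist";
import { readFile } from "node:fs/promises";

// Sketch: mixing text, image, and audio turns in one conversation.
const builder = new LLMMessageBuilder() // assumed default constructor
    .addUser("Here is the source photo and a voice note about it.")
    .addUserWithImage("Photo:", await readFile("photo.jpg"))      // MIME auto-detected
    .addUserWithAudio("Voice note:", await readFile("note.mp3")); // Gemini only
```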
@@ -1688,6 +2015,120 @@ declare class TextNamespace {
     stream(prompt: string, options?: QuickOptions): AsyncGenerator<string>;
 }
 
+/**
+ * Vision Analysis Namespace
+ *
+ * Provides one-shot image analysis without agent setup.
+ * Useful for quick image understanding tasks.
+ *
+ * @example
+ * ```typescript
+ * const llmist = new LLMist();
+ *
+ * const description = await llmist.vision.analyze({
+ *   model: "gpt-4o",
+ *   image: await readFile("photo.jpg"),
+ *   prompt: "Describe this image in detail",
+ * });
+ *
+ * console.log(description);
+ * ```
+ */
+
+/**
+ * Options for vision analysis.
+ */
+interface VisionAnalyzeOptions {
+    /** Model to use (must support vision, e.g., "gpt-4o", "claude-sonnet-4-20250514", "gemini-2.5-flash") */
+    model: string;
+    /** Image data: Buffer, Uint8Array, base64 string, data URL, or HTTPS URL */
+    image: string | Buffer | Uint8Array;
+    /** Analysis prompt describing what to do with the image */
+    prompt: string;
+    /** MIME type (auto-detected if not provided for Buffer/Uint8Array) */
+    mimeType?: ImageMimeType;
+    /** System prompt for analysis context */
+    systemPrompt?: string;
+    /** Max tokens for response */
+    maxTokens?: number;
+    /** Temperature (0-1) */
+    temperature?: number;
+}
+/**
+ * Result of vision analysis.
+ */
+interface VisionAnalyzeResult {
+    /** The analysis text */
+    text: string;
+    /** Model used */
+    model: string;
+    /** Token usage if available */
+    usage?: {
+        inputTokens: number;
+        outputTokens: number;
+        totalTokens: number;
+    };
+}
+declare class VisionNamespace {
+    private readonly client;
+    constructor(client: LLMist);
+    /**
+     * Build a message builder with the image content attached.
+     * Handles URLs, data URLs, base64 strings, and binary buffers.
+     */
+    private buildImageMessage;
+    /**
+     * Stream the response and collect text and usage information.
+     */
+    private streamAndCollect;
+    /**
+     * Analyze an image with a vision-capable model.
+     * Returns the analysis as a string.
+     *
+     * @param options - Vision analysis options
+     * @returns Promise resolving to the analysis text
+     * @throws Error if the image format is unsupported or the model doesn't support vision
+     *
+     * @example
+     * ```typescript
+     * // From file
+     * const result = await llmist.vision.analyze({
+     *   model: "gpt-4o",
+     *   image: await fs.readFile("photo.jpg"),
+     *   prompt: "What's in this image?",
+     * });
+     *
+     * // From URL (OpenAI only)
+     * const result = await llmist.vision.analyze({
+     *   model: "gpt-4o",
+     *   image: "https://example.com/image.jpg",
+     *   prompt: "Describe this image",
+     * });
+     * ```
+     */
+    analyze(options: VisionAnalyzeOptions): Promise<string>;
+    /**
+     * Analyze an image and return a detailed result with usage info.
+     *
+     * @param options - Vision analysis options
+     * @returns Promise resolving to the analysis result with usage info
+     */
+    analyzeWithUsage(options: VisionAnalyzeOptions): Promise<VisionAnalyzeResult>;
+    /**
+     * Check if a model supports vision/image input.
+     *
+     * @param modelId - Model ID to check
+     * @returns True if the model supports vision
+     */
+    supportsModel(modelId: string): boolean;
+    /**
+     * List all models that support vision.
+     *
+     * @returns Array of model IDs that support vision
+     */
+    listModels(): string[];
+}
+
 interface LLMistOptions {
     /**
      * Provider adapters to register manually.
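
Beyond the `analyze()` examples above, `analyzeWithUsage()` and the capability queries round out the namespace. A sketch using only the declared methods; which model IDs actually report vision support depends on the registry at runtime:

```typescript
import { LLMist } from "llmist";
import { readFile } from "node:fs/promises";

const llmist = new LLMist();

// Guard on capability before calling.
if (llmist.vision.supportsModel("gpt-4o")) {
    const result = await llmist.vision.analyzeWithUsage({
        model: "gpt-4o",
        image: await readFile("photo.jpg"),
        prompt: "Describe this image in one sentence",
        maxTokens: 100,
    });
    console.log(result.text, result.usage?.totalTokens);
} else {
    console.log("Vision-capable models:", llmist.vision.listModels());
}
```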
@@ -1735,6 +2176,7 @@ declare class LLMist {
     readonly text: TextNamespace;
     readonly image: ImageNamespace;
     readonly speech: SpeechNamespace;
+    readonly vision: VisionNamespace;
     constructor();
     constructor(adapters: ProviderAdapter[]);
     constructor(adapters: ProviderAdapter[], defaultProvider: string);
@@ -1962,12 +2404,15 @@ interface EventHandlers {
     /** Called when a gadget is about to be executed */
     onGadgetCall?: (call: {
         gadgetName: string;
+        invocationId: string;
         parameters?: Record<string, unknown>;
         parametersRaw: string;
+        dependencies: string[];
     }) => void | Promise<void>;
     /** Called when a gadget execution completes */
     onGadgetResult?: (result: {
         gadgetName: string;
+        invocationId: string;
         result?: string;
         error?: string;
         parameters: Record<string, unknown>;
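
With `invocationId` and `dependencies` now exposed, event handlers can reconstruct the call graph as gadgets execute. A type-level sketch of an `EventHandlers` object using only the fields declared above (assuming the type is importable from the package root):

```typescript
import type { EventHandlers } from "llmist";

// Sketch: log each gadget call together with the invocations it waits on.
const handlers: EventHandlers = {
    onGadgetCall: ({ gadgetName, invocationId, dependencies }) => {
        const deps = dependencies.length ? ` after [${dependencies.join(", ")}]` : "";
        console.log(`-> ${gadgetName}#${invocationId}${deps}`);
    },
    onGadgetResult: ({ gadgetName, invocationId, error }) => {
        console.log(`<- ${gadgetName}#${invocationId} ${error ? `failed: ${error}` : "ok"}`);
    },
};
```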
@@ -2246,6 +2691,21 @@ interface ObserveGadgetCompleteContext {
     cost?: number;
     logger: Logger<ILogObj>;
 }
+/**
+ * Context provided when a gadget is skipped due to a failed dependency.
+ * Read-only observation point.
+ */
+interface ObserveGadgetSkippedContext {
+    iteration: number;
+    gadgetName: string;
+    invocationId: string;
+    parameters: Readonly<Record<string, unknown>>;
+    /** The invocation ID of the dependency that failed */
+    failedDependency: string;
+    /** The error message from the failed dependency */
+    failedDependencyError: string;
+    logger: Logger<ILogObj>;
+}
 /**
  * Context provided for each stream chunk.
  * Read-only observation point.
@@ -2279,6 +2739,8 @@ interface Observers {
     onGadgetExecutionStart?: (context: ObserveGadgetStartContext) => void | Promise<void>;
     /** Called when a gadget execution completes (success or error) */
     onGadgetExecutionComplete?: (context: ObserveGadgetCompleteContext) => void | Promise<void>;
+    /** Called when a gadget is skipped due to a failed dependency */
+    onGadgetSkipped?: (context: ObserveGadgetSkippedContext) => void | Promise<void>;
     /** Called for each stream chunk */
     onStreamChunk?: (context: ObserveChunkContext) => void | Promise<void>;
     /** Called when context compaction occurs */
@@ -2518,6 +2980,39 @@ type AfterGadgetExecutionAction = {
     action: "recover";
     fallbackResult: string;
 };
+/**
+ * Context for dependency skip controller.
+ * Called when a gadget would be skipped due to a failed dependency.
+ */
+interface DependencySkipControllerContext {
+    iteration: number;
+    gadgetName: string;
+    invocationId: string;
+    /** Parameters of the gadget that would be skipped */
+    parameters: Record<string, unknown>;
+    /** The invocation ID of the dependency that failed */
+    failedDependency: string;
+    /** The error message from the failed dependency */
+    failedDependencyError: string;
+    logger: Logger<ILogObj>;
+}
+/**
+ * Action returned by onDependencySkipped controller.
+ */
+type DependencySkipAction =
+/** Skip execution and propagate failure to downstream dependents */
+{
+    action: "skip";
+}
+/** Execute the gadget anyway despite the failed dependency */
+ | {
+    action: "execute_anyway";
+}
+/** Skip execution but provide a fallback result (doesn't propagate failure) */
+ | {
+    action: "use_fallback";
+    fallbackResult: string;
+};
 /**
  * Controllers: Async lifecycle hooks that control execution flow.
  * - Can short-circuit execution
@@ -2550,6 +3045,11 @@ interface Controllers {
      * Can provide a fallback result to recover from errors.
      */
    afterGadgetExecution?: (context: AfterGadgetExecutionControllerContext) => Promise<AfterGadgetExecutionAction>;
+    /**
+     * Called before skipping a gadget due to a failed dependency.
+     * Can override the default skip behavior to execute anyway or provide a fallback.
+     */
+    onDependencySkipped?: (context: DependencySkipControllerContext) => Promise<DependencySkipAction>;
 }
 /**
  * Clean hooks system with three distinct categories:
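
The controller complements the read-only `onGadgetSkipped` observer: it runs before the skip and can change the outcome. A sketch returning each `DependencySkipAction` variant, built from the declared context fields; the gadget names `notify` and `summarize` are hypothetical:

```typescript
import type { Controllers } from "llmist";

// Sketch: override the default skip for specific gadgets.
const controllers: Controllers = {
    onDependencySkipped: async (ctx) => {
        ctx.logger.warn(
            `${ctx.gadgetName}#${ctx.invocationId} blocked by ${ctx.failedDependency}: ` +
            ctx.failedDependencyError,
        );
        if (ctx.gadgetName === "notify") {
            // Side-effect-only gadget: run it even though its input failed.
            return { action: "execute_anyway" };
        }
        if (ctx.gadgetName === "summarize") {
            // Keep downstream gadgets alive with a placeholder result.
            return { action: "use_fallback", fallbackResult: "(no summary available)" };
        }
        // Default behavior: skip and propagate the failure.
        return { action: "skip" };
    },
};
```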
@@ -2583,8 +3083,8 @@ interface AgentOptions {
     model: string;
     /** System prompt */
     systemPrompt?: string;
-    /** Initial user prompt (optional if using build()) */
-    userPrompt?: string;
+    /** Initial user prompt (optional if using build()). Can be text or multimodal content. */
+    userPrompt?: string | ContentPart[];
     /** Maximum iterations */
     maxIterations?: number;
     /** Temperature */
@@ -2603,10 +3103,10 @@ interface AgentOptions {
     gadgetEndPrefix?: string;
     /** Custom gadget argument prefix for block format parameters */
     gadgetArgPrefix?: string;
-    /** Initial messages */
+    /** Initial messages. User messages support multimodal content. */
     initialMessages?: Array<{
         role: "system" | "user" | "assistant";
-        content: string;
+        content: MessageContent;
     }>;
     /** Text-only handler */
     textOnlyHandler?: TextOnlyHandler;
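
Together these two changes mean agents can be seeded with multimodal input directly through options. A type-level sketch of an `AgentOptions` value (the model name and file paths are placeholders):

```typescript
import { imageFromBuffer, text, type AgentOptions } from "llmist";
import { readFile } from "node:fs/promises";

const options: AgentOptions = {
    model: "gpt-4o",
    systemPrompt: "You analyze images",
    // userPrompt accepts plain text or ContentPart[]:
    userPrompt: [
        text("What changed between run 1 and run 2?"),
        imageFromBuffer(await readFile("run1.png")),
        imageFromBuffer(await readFile("run2.png")),
    ],
    // initialMessages likewise accept MessageContent per message.
    initialMessages: [
        { role: "assistant", content: "Ready to compare screenshots." },
    ],
};
```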
@@ -2815,9 +3315,10 @@ declare class Agent {
 
 /**
  * Message for conversation history.
+ * User messages can be text (string) or multimodal (ContentPart[]).
  */
 type HistoryMessage = {
-    user: string;
+    user: string | ContentPart[];
 } | {
     assistant: string;
 } | {
@@ -3336,7 +3837,62 @@ declare class AgentBuilder {
      * }
      * ```
      */
+    /**
+     * Build AgentOptions with the given user prompt.
+     * Centralizes options construction for ask(), askWithImage(), and askWithContent().
+     */
+    private buildAgentOptions;
     ask(userPrompt: string): Agent;
+    /**
+     * Build and create the agent with a multimodal user prompt (text + image).
+     * Returns the Agent instance ready to run.
+     *
+     * @param textPrompt - Text prompt describing what to do with the image
+     * @param imageData - Image data (Buffer, Uint8Array, or base64 string)
+     * @param mimeType - Optional MIME type (auto-detected if not provided)
+     * @returns Configured Agent instance
+     *
+     * @example
+     * ```typescript
+     * const agent = LLMist.createAgent()
+     *   .withModel("gpt-4o")
+     *   .withSystem("You analyze images")
+     *   .askWithImage(
+     *     "What's in this image?",
+     *     await fs.readFile("photo.jpg")
+     *   );
+     *
+     * for await (const event of agent.run()) {
+     *   // handle events
+     * }
+     * ```
+     */
+    askWithImage(textPrompt: string, imageData: Buffer | Uint8Array | string, mimeType?: ImageMimeType): Agent;
+    /**
+     * Build and return an Agent configured with multimodal content.
+     * More flexible than askWithImage - accepts any combination of content parts.
+     *
+     * @param content - Array of content parts (text, images, audio)
+     * @returns A configured Agent ready for execution
+     *
+     * @example
+     * ```typescript
+     * import { text, imageFromBuffer, audioFromBuffer } from "llmist";
+     *
+     * const agent = LLMist.createAgent()
+     *   .withModel("gemini:gemini-2.5-flash")
+     *   .askWithContent([
+     *     text("Describe this image and transcribe the audio:"),
+     *     imageFromBuffer(imageData),
+     *     audioFromBuffer(audioData),
+     *   ]);
+     *
+     * for await (const event of agent.run()) {
+     *   // handle events
+     * }
+     * ```
+     */
+    askWithContent(content: ContentPart[]): Agent;
     /**
      * Build, run, and collect only the text response.
      * Convenient for simple queries where you just want the final answer.
@@ -3416,8 +3972,9 @@ declare class AgentBuilder {
 interface IConversationManager {
     /**
      * Adds a user message to the conversation.
+     * Supports multimodal content (text + images/audio).
      */
-    addUserMessage(content: string): void;
+    addUserMessage(content: MessageContent): void;
     /**
      * Adds an assistant message to the conversation.
      */
@@ -3485,6 +4042,26 @@ interface MockMatcherContext {
  * const matcher: MockMatcher = (ctx) => ctx.provider === 'anthropic';
  */
 type MockMatcher = (context: MockMatcherContext) => boolean | Promise<boolean>;
+/**
+ * Image data in a mock response.
+ */
+interface MockImageData {
+    /** Base64-encoded image data */
+    data: string;
+    /** MIME type of the image */
+    mimeType: ImageMimeType;
+    /** Revised prompt (for image generation responses) */
+    revisedPrompt?: string;
+}
+/**
+ * Audio data in a mock response.
+ */
+interface MockAudioData {
+    /** Base64-encoded audio data */
+    data: string;
+    /** MIME type of the audio */
+    mimeType: AudioMimeType;
+}
 /**
  * A mock response that will be returned when a matcher succeeds.
 */
@@ -3504,6 +4081,16 @@ interface MockResponse {
         /** Optional invocationId, will be auto-generated if not provided */
         invocationId?: string;
     }>;
+    /**
+     * Image data to return in the response (e.g., for image generation mocks).
+     * Each image will be yielded as a separate chunk in the stream.
+     */
+    images?: MockImageData[];
+    /**
+     * Audio data to return in the response (e.g., for speech synthesis mocks).
+     * Will be yielded as a chunk in the stream.
+     */
+    audio?: MockAudioData;
     /**
      * Simulated token usage statistics
     */
@@ -3610,9 +4197,58 @@ declare class MockProviderAdapter implements ProviderAdapter {
     readonly priority = 100;
     private readonly mockManager;
     constructor(options?: MockOptions);
-    supports(descriptor: ModelDescriptor): boolean;
-    stream(options: LLMGenerationOptions, descriptor: ModelDescriptor, spec?: unknown): LLMStream;
+    supports(_descriptor: ModelDescriptor): boolean;
+    stream(options: LLMGenerationOptions, descriptor: ModelDescriptor, _spec?: unknown): LLMStream;
     private createMockStreamFromContext;
+    /**
+     * Check if this adapter supports image generation for a given model.
+     * Returns true if there's a registered mock with images for this model.
+     */
+    supportsImageGeneration(_modelId: string): boolean;
+    /**
+     * Generate mock images based on registered mocks.
+     *
+     * @param options - Image generation options
+     * @returns Mock image generation result
+     */
+    generateImage(options: ImageGenerationOptions): Promise<ImageGenerationResult>;
+    /**
+     * Transform mock response into ImageGenerationResult format.
+     *
+     * @param options - Original image generation options
+     * @param mockResponse - Mock response containing image data
+     * @returns ImageGenerationResult with mock data and zero cost
+     */
+    private createImageResult;
+    /**
+     * Check if this adapter supports speech generation for a given model.
+     * Returns true if there's a registered mock with audio for this model.
+     */
+    supportsSpeechGeneration(_modelId: string): boolean;
+    /**
+     * Generate mock speech based on registered mocks.
+     *
+     * @param options - Speech generation options
+     * @returns Mock speech generation result
+     */
+    generateSpeech(options: SpeechGenerationOptions): Promise<SpeechGenerationResult>;
+    /**
+     * Transform mock response into SpeechGenerationResult format.
+     * Converts base64 audio data to ArrayBuffer.
+     *
+     * @param options - Original speech generation options
+     * @param mockResponse - Mock response containing audio data
+     * @returns SpeechGenerationResult with mock data and zero cost
+     */
+    private createSpeechResult;
+    /**
+     * Map MIME type to audio format for SpeechGenerationResult.
+     * Defaults to "mp3" for unknown MIME types.
+     *
+     * @param mimeType - Audio MIME type string
+     * @returns Audio format identifier
+     */
+    private mimeTypeToAudioFormat;
 }
 /**
  * Create a mock provider adapter instance.
@@ -3743,6 +4379,27 @@ declare class MockBuilder {
      * })
      */
     when(matcher: MockMatcher): this;
+    /**
+     * Match when any message contains an image.
+     *
+     * @example
+     * mockLLM().whenMessageHasImage().returns("I see an image of a sunset.")
+     */
+    whenMessageHasImage(): this;
+    /**
+     * Match when any message contains audio.
+     *
+     * @example
+     * mockLLM().whenMessageHasAudio().returns("I hear music playing.")
+     */
+    whenMessageHasAudio(): this;
+    /**
+     * Match based on the number of images in the last message.
+     *
+     * @example
+     * mockLLM().whenImageCount((n) => n >= 2).returns("Comparing multiple images...")
+     */
+    whenImageCount(predicate: (count: number) => boolean): this;
     /**
      * Set the text response to return.
      * Can be a static string or a function that returns a string dynamically.
@@ -3775,6 +4432,51 @@ declare class MockBuilder {
      * .returnsGadgetCall('logger', { message: 'Done!' })
      */
     returnsGadgetCall(gadgetName: string, parameters: Record<string, unknown>): this;
+    /**
+     * Return a single image in the response.
+     * Useful for mocking image generation endpoints.
+     *
+     * @param data - Image data (base64 string or Buffer)
+     * @param mimeType - MIME type (auto-detected if Buffer provided without type)
+     *
+     * @example
+     * mockLLM()
+     *   .forModel('dall-e-3')
+     *   .returnsImage(pngBuffer)
+     *   .register();
+     */
+    returnsImage(data: string | Buffer | Uint8Array, mimeType?: ImageMimeType): this;
+    /**
+     * Return multiple images in the response.
+     *
+     * @example
+     * mockLLM()
+     *   .forModel('dall-e-3')
+     *   .returnsImages([
+     *     { data: pngBuffer1 },
+     *     { data: pngBuffer2 },
+     *   ])
+     *   .register();
+     */
+    returnsImages(images: Array<{
+        data: string | Buffer | Uint8Array;
+        mimeType?: ImageMimeType;
+        revisedPrompt?: string;
+    }>): this;
+    /**
+     * Return audio data in the response.
+     * Useful for mocking speech synthesis endpoints.
+     *
+     * @param data - Audio data (base64 string or Buffer)
+     * @param mimeType - MIME type (auto-detected if Buffer provided without type)
+     *
+     * @example
+     * mockLLM()
+     *   .forModel('tts-1')
+     *   .returnsAudio(mp3Buffer)
+     *   .register();
+     */
+    returnsAudio(data: string | Buffer | Uint8Array, mimeType?: AudioMimeType): this;
     /**
      * Set the complete mock response object.
      * This allows full control over all response properties.
@@ -4016,4 +4718,4 @@ declare function createTextMockStream(text: string, options?: {
     usage?: MockResponse["usage"];
 }): LLMStream;
 
-export { type EventHandlers as $, type AgentHooks as A, BaseGadget as B, type CompactionStrategy as C, type ProviderAdapter as D, type ExecutionContext as E, type ModelDescriptor as F, GadgetRegistry as G, type HintTemplate as H, type IConversationManager as I, type ModelSpec as J, type LLMGenerationOptions as K, type LLMStream as L, MockProviderAdapter as M, type ImageModelSpec as N, type ImageGenerationOptions as O, type ParsedGadgetCall as P, type ImageGenerationResult as Q, type ResolvedCompactionConfig as R, type StreamEvent as S, type TokenUsage as T, type SpeechModelSpec as U, type SpeechGenerationOptions as V, type SpeechGenerationResult as W, type HistoryMessage as X, type TrailingMessage as Y, type TrailingMessageContext as Z, AgentBuilder as _, type LLMStreamChunk as a, collectEvents as a0, collectText as a1, runWithHandlers as a2, type AfterGadgetExecutionAction as a3, type AfterGadgetExecutionControllerContext as a4, type AfterLLMCallAction as a5, type AfterLLMCallControllerContext as a6, type AfterLLMErrorAction as a7, type AgentOptions as a8, type BeforeGadgetExecutionAction as a9, type ModelLimits as aA, type ModelPricing as aB, type ProviderIdentifier as aC, ModelIdentifierParser as aD, type HintContext as aE, type PromptConfig as aF, type PromptContext as aG, type PromptTemplate as aH, DEFAULT_HINTS as aI, DEFAULT_PROMPTS as aJ, resolveHintTemplate as aK, resolvePromptTemplate as aL, resolveRulesTemplate as aM, type QuickOptions as aN, complete as aO, stream as aP, type GadgetClass as aQ, type GadgetOrClass as aR, type CostReportingLLMist as aS, type GadgetExecuteResult as aT, type TextOnlyAction as aU, type TextOnlyContext as aV, type TextOnlyCustomHandler as aW, type TextOnlyGadgetConfig as aX, type TextOnlyHandler as aY, type TextOnlyStrategy as aZ, type BeforeLLMCallAction as aa, type ChunkInterceptorContext as ab, type Controllers as ac, type GadgetExecutionControllerContext as ad, type GadgetParameterInterceptorContext as ae, type GadgetResultInterceptorContext as af, type Interceptors as ag, type LLMCallControllerContext as ah, type LLMErrorControllerContext as ai, type MessageInterceptorContext as aj, type ObserveChunkContext as ak, type ObserveGadgetCompleteContext as al, type ObserveGadgetStartContext as am, type ObserveLLMCallContext as an, type ObserveLLMCompleteContext as ao, type ObserveLLMErrorContext as ap, type Observers as aq, type MessageTurn as ar, type ObserveCompactionContext as as, DEFAULT_COMPACTION_CONFIG as at, DEFAULT_SUMMARIZATION_PROMPT as au, type LLMistOptions as av, type LLMRole as aw, LLMMessageBuilder as ax, type CostEstimate as ay, type ModelFeatures as az, type LLMMessage as b, createMockAdapter as c, MockBuilder as d, createMockClient as e, MockManager as f, getMockManager as g, createMockStream as h, createTextMockStream as i, type MockMatcher as j, type MockMatcherContext as k, type MockOptions as l, mockLLM as m, type MockRegistration as n, type MockResponse as o, type MockStats as p, ModelRegistry as q, LLMist as r, type CompactionContext as s, type CompactionResult as t, type CompactionConfig as u, type CompactionEvent as v, type CompactionStats as w, type GadgetExecuteReturn as x, type GadgetExample as y, type GadgetExecutionResult as z };
+export { type TrailingMessage as $, type AgentHooks as A, BaseGadget as B, type CompactionStrategy as C, type GadgetExecuteReturn as D, type ExecutionContext as E, type GadgetExample as F, GadgetRegistry as G, type HintTemplate as H, type IConversationManager as I, type GadgetExecutionResult as J, type ProviderAdapter as K, type LLMStream as L, MockProviderAdapter as M, type ModelDescriptor as N, type ModelSpec as O, type ParsedGadgetCall as P, type LLMGenerationOptions as Q, type ResolvedCompactionConfig as R, type StreamEvent as S, type TokenUsage as T, type ImageModelSpec as U, type ImageGenerationOptions as V, type ImageGenerationResult as W, type SpeechModelSpec as X, type SpeechGenerationOptions as Y, type SpeechGenerationResult as Z, type HistoryMessage as _, type LLMStreamChunk as a, type VisionAnalyzeOptions as a$, type TrailingMessageContext as a0, AgentBuilder as a1, type EventHandlers as a2, collectEvents as a3, collectText as a4, runWithHandlers as a5, type AfterGadgetExecutionAction as a6, type AfterGadgetExecutionControllerContext as a7, type AfterLLMCallAction as a8, type AfterLLMCallControllerContext as a9, type AudioMimeType as aA, type AudioSource as aB, type ContentPart as aC, type ImageBase64Source as aD, type ImageContentPart as aE, type ImageMimeType as aF, type ImageSource as aG, type ImageUrlSource as aH, type TextContentPart as aI, audioFromBase64 as aJ, audioFromBuffer as aK, detectAudioMimeType as aL, detectImageMimeType as aM, imageFromBase64 as aN, imageFromBuffer as aO, imageFromUrl as aP, isAudioPart as aQ, isDataUrl as aR, isImagePart as aS, isTextPart as aT, parseDataUrl as aU, text as aV, toBase64 as aW, type LLMRole as aX, extractText as aY, LLMMessageBuilder as aZ, normalizeContent as a_, type AfterLLMErrorAction as aa, type AgentOptions as ab, type BeforeGadgetExecutionAction as ac, type BeforeLLMCallAction as ad, type ChunkInterceptorContext as ae, type Controllers as af, type GadgetExecutionControllerContext as ag, type GadgetParameterInterceptorContext as ah, type GadgetResultInterceptorContext as ai, type Interceptors as aj, type LLMCallControllerContext as ak, type LLMErrorControllerContext as al, type MessageInterceptorContext as am, type ObserveChunkContext as an, type ObserveGadgetCompleteContext as ao, type ObserveGadgetStartContext as ap, type ObserveLLMCallContext as aq, type ObserveLLMCompleteContext as ar, type ObserveLLMErrorContext as as, type Observers as at, type MessageTurn as au, type ObserveCompactionContext as av, DEFAULT_COMPACTION_CONFIG as aw, DEFAULT_SUMMARIZATION_PROMPT as ax, type LLMistOptions as ay, type AudioContentPart as az, type LLMMessage as b, type VisionAnalyzeResult as b0, type CostEstimate as b1, type ModelFeatures as b2, type ModelLimits as b3, type ModelPricing as b4, type ProviderIdentifier as b5, ModelIdentifierParser as b6, type HintContext as b7, type PromptConfig as b8, type PromptContext as b9, type PromptTemplate as ba, DEFAULT_HINTS as bb, DEFAULT_PROMPTS as bc, resolveHintTemplate as bd, resolvePromptTemplate as be, resolveRulesTemplate as bf, type QuickOptions as bg, complete as bh, stream as bi, type GadgetClass as bj, type GadgetOrClass as bk, type CostReportingLLMist as bl, type GadgetExecuteResult as bm, type GadgetSkippedEvent as bn, type TextOnlyAction as bo, type TextOnlyContext as bp, type TextOnlyCustomHandler as bq, type TextOnlyGadgetConfig as br, type TextOnlyHandler as bs, type TextOnlyStrategy as bt, createMockAdapter as c, MockBuilder as d, createMockClient as e, MockManager as f, getMockManager as g, createMockStream as h, createTextMockStream as i, type MockAudioData as j, type MockImageData as k, type MockMatcher as l, mockLLM as m, type MockMatcherContext as n, type MockOptions as o, type MockRegistration as p, type MockResponse as q, type MockStats as r, ModelRegistry as s, type MessageContent as t, LLMist as u, type CompactionContext as v, type CompactionResult as w, type CompactionConfig as x, type CompactionEvent as y, type CompactionStats as z };
getMockManager as g, createMockStream as h, createTextMockStream as i, type MockAudioData as j, type MockImageData as k, type MockMatcher as l, mockLLM as m, type MockMatcherContext as n, type MockOptions as o, type MockRegistration as p, type MockResponse as q, type MockStats as r, ModelRegistry as s, type MessageContent as t, LLMist as u, type CompactionContext as v, type CompactionResult as w, type CompactionConfig as x, type CompactionEvent as y, type CompactionStats as z };