llmist 2.3.0 → 2.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -1,5 +1,5 @@
  #!/usr/bin/env node
- import "./chunk-ZDNV7DDO.js";
+ import "./chunk-IHSZUAYN.js";
  import {
  AgentBuilder,
  BaseGadget,
@@ -10,14 +10,20 @@ import {
  LLMMessageBuilder,
  LLMist,
  MODEL_ALIASES,
+ audioFromBuffer,
  createGadget,
  createLogger,
+ detectAudioMimeType,
+ detectImageMimeType,
+ extractText,
+ imageFromBuffer,
  init_builder,
  init_client,
  init_constants,
  init_create_gadget,
  init_exceptions,
  init_gadget,
+ init_input_content,
  init_logger,
  init_messages,
  init_model_shortcuts,
@@ -26,8 +32,9 @@ import {
  init_schema_validator,
  resolveModel,
  schemaToJSONSchema,
+ text,
  validateGadgetSchema
- } from "./chunk-GANXNBIZ.js";
+ } from "./chunk-YHS2DYXP.js";

  // src/cli/constants.ts
  var CLI_NAME = "llmist";
@@ -36,7 +43,10 @@ var COMMANDS = {
  complete: "complete",
  agent: "agent",
  models: "models",
- gadget: "gadget"
+ gadget: "gadget",
+ image: "image",
+ speech: "speech",
+ vision: "vision"
  };
  var LOG_LEVELS = ["silly", "trace", "debug", "info", "warn", "error", "fatal"];
  var DEFAULT_MODEL = "openai:gpt-5-nano";
@@ -57,7 +67,20 @@ var OPTION_FLAGS = {
  docker: "--docker",
  dockerRo: "--docker-ro",
  noDocker: "--no-docker",
- dockerDev: "--docker-dev"
+ dockerDev: "--docker-dev",
+ // Multimodal input options
+ inputImage: "--image <path>",
+ inputAudio: "--audio <path>",
+ // Image generation options
+ imageSize: "--size <size>",
+ imageQuality: "--quality <quality>",
+ imageCount: "-n, --count <number>",
+ imageOutput: "-o, --output <path>",
+ // Speech generation options
+ voice: "--voice <name>",
+ speechFormat: "--format <format>",
+ speechSpeed: "--speed <value>",
+ speechOutput: "-o, --output <path>"
  };
  var OPTION_DESCRIPTIONS = {
  model: "Model identifier, e.g. openai:gpt-5-nano or anthropic:claude-sonnet-4-5.",
@@ -73,10 +96,23 @@ var OPTION_DESCRIPTIONS = {
  noBuiltins: "Disable built-in gadgets (AskUser, TellUser).",
  noBuiltinInteraction: "Disable interactive gadgets (AskUser) while keeping TellUser.",
  quiet: "Suppress all output except content (text and TellUser messages).",
+ // Multimodal input descriptions
+ inputImage: "Image file to include with the prompt (vision models).",
+ inputAudio: "Audio file to include with the prompt (Gemini only).",
  docker: "Run agent in a Docker sandbox container for security isolation.",
  dockerRo: "Run in Docker with current directory mounted read-only.",
  noDocker: "Disable Docker sandboxing (override config).",
- dockerDev: "Run in Docker dev mode (mount local source instead of npm install)."
+ dockerDev: "Run in Docker dev mode (mount local source instead of npm install).",
+ // Image generation descriptions
+ imageSize: "Image size/aspect ratio, e.g. '1024x1024', '1:1', '16:9'.",
+ imageQuality: "Image quality: 'standard', 'hd', 'low', 'medium', 'high'.",
+ imageCount: "Number of images to generate (model dependent, usually 1-4).",
+ imageOutput: "Output path for the generated image. Defaults to stdout if not specified.",
+ // Speech generation descriptions
+ voice: "Voice name for speech generation, e.g. 'nova', 'alloy', 'Zephyr'.",
+ speechFormat: "Audio format: 'mp3', 'opus', 'aac', 'flac', 'wav', 'pcm'.",
+ speechSpeed: "Speech speed multiplier (0.25 to 4.0, default 1.0).",
+ speechOutput: "Output path for audio file. Defaults to stdout if not specified."
  };
  var SUMMARY_PREFIX = "[llmist]";

@@ -86,7 +122,7 @@ import { Command, InvalidArgumentError as InvalidArgumentError2 } from "commande
  // package.json
  var package_default = {
  name: "llmist",
- version: "2.2.0",
+ version: "2.5.0",
  description: "TypeScript LLM client with streaming tool execution. Tools fire mid-stream. Built-in function calling works with any model\u2014no structured outputs or native tool support required.",
  type: "module",
  main: "dist/index.cjs",
@@ -226,6 +262,7 @@ function isAbortError(error) {
  }

  // src/cli/agent-command.ts
+ init_input_content();
  init_registry();
  init_constants();

@@ -550,6 +587,75 @@ var finish = createGadget({
  });
  var builtinGadgets = [askUser, tellUser, finish];

+ // src/cli/file-utils.ts
+ init_input_content();
+ import { readFile, stat } from "node:fs/promises";
+ import { resolve as resolve2 } from "node:path";
+ var DEFAULT_MAX_FILE_SIZE = 50 * 1024 * 1024;
+ function formatFileSize(bytes) {
+ if (bytes < 1024) return `${bytes} bytes`;
+ if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)} KB`;
+ if (bytes < 1024 * 1024 * 1024) return `${(bytes / (1024 * 1024)).toFixed(1)} MB`;
+ return `${(bytes / (1024 * 1024 * 1024)).toFixed(1)} GB`;
+ }
+ async function checkFileSize(absolutePath, filePath, maxSize) {
+ const stats = await stat(absolutePath);
+ if (stats.size > maxSize) {
+ throw new Error(
+ `File "${filePath}" is too large (${formatFileSize(stats.size)}). Maximum allowed size is ${formatFileSize(maxSize)}. Consider compressing the file or using a smaller version.`
+ );
+ }
+ }
+ async function readImageFile(filePath, options = {}) {
+ const absolutePath = resolve2(filePath);
+ const maxFileSize = options.maxFileSize ?? DEFAULT_MAX_FILE_SIZE;
+ let buffer;
+ try {
+ await checkFileSize(absolutePath, filePath, maxFileSize);
+ buffer = await readFile(absolutePath);
+ } catch (error) {
+ const message = error instanceof Error ? error.message : String(error);
+ throw new Error(`Failed to read image file "${filePath}": ${message}`);
+ }
+ const mimeType = detectImageMimeType(buffer);
+ if (!mimeType) {
+ throw new Error(
+ `File "${filePath}" is not a supported image format. Supported formats: JPEG, PNG, GIF, WebP`
+ );
+ }
+ return imageFromBuffer(buffer, mimeType);
+ }
+ async function readAudioFile(filePath, options = {}) {
+ const absolutePath = resolve2(filePath);
+ const maxFileSize = options.maxFileSize ?? DEFAULT_MAX_FILE_SIZE;
+ let buffer;
+ try {
+ await checkFileSize(absolutePath, filePath, maxFileSize);
+ buffer = await readFile(absolutePath);
+ } catch (error) {
+ const message = error instanceof Error ? error.message : String(error);
+ throw new Error(`Failed to read audio file "${filePath}": ${message}`);
+ }
+ const mimeType = detectAudioMimeType(buffer);
+ if (!mimeType) {
+ throw new Error(
+ `File "${filePath}" is not a supported audio format. Supported formats: MP3, WAV, OGG, WebM`
+ );
+ }
+ return audioFromBuffer(buffer, mimeType);
+ }
+ async function readFileBuffer(filePath, options = {}) {
+ const absolutePath = resolve2(filePath);
+ const maxFileSize = options.maxFileSize ?? DEFAULT_MAX_FILE_SIZE;
+ try {
+ await checkFileSize(absolutePath, filePath, maxFileSize);
+ return await readFile(absolutePath);
+ } catch (error) {
+ const message = error instanceof Error ? error.message : String(error);
+ throw new Error(`Failed to read file "${filePath}": ${message}`);
+ }
+ }
+
  // src/cli/gadgets.ts
  init_gadget();
  import fs5 from "node:fs";
@@ -713,7 +819,7 @@ ${formattedList}`;
  // src/cli/builtins/filesystem/read-file.ts
  import fs3 from "node:fs";
  import { z as z3 } from "zod";
- var readFile = createGadget({
+ var readFile2 = createGadget({
  name: "ReadFile",
  description: "Read the entire content of a file and return it as text. The file path must be within the current working directory or its subdirectories.",
  schema: z3.object({
@@ -984,7 +1090,7 @@ error: ${message}`;
  // src/cli/builtins/index.ts
  var builtinGadgetRegistry = {
  ListDirectory: listDirectory,
- ReadFile: readFile,
+ ReadFile: readFile2,
  WriteFile: writeFile,
  EditFile: editFile,
  RunCommand: runCommand
@@ -1119,6 +1225,7 @@ async function loadGadgets(specifiers, cwd, importer = (specifier) => import(spe
  }

  // src/cli/llm-logging.ts
+ init_messages();
  import { mkdir, writeFile as writeFile2 } from "node:fs/promises";
  import { homedir } from "node:os";
  import { join } from "node:path";
@@ -1136,7 +1243,7 @@ function formatLlmRequest(messages) {
  const lines = [];
  for (const msg of messages) {
  lines.push(`=== ${msg.role.toUpperCase()} ===`);
- lines.push(msg.content ?? "");
+ lines.push(msg.content ? extractText(msg.content) : "");
  lines.push("");
  }
  return lines.join("\n");
@@ -1210,9 +1317,9 @@ function ensureMarkedConfigured() {
  markedConfigured = true;
  }
  }
- function renderMarkdown(text) {
+ function renderMarkdown(text2) {
  ensureMarkedConfigured();
- let rendered = marked.parse(text);
+ let rendered = marked.parse(text2);
  rendered = rendered.replace(/\*\*(.+?)\*\*/g, (_, content) => chalk3.bold(content)).replace(/(?<!\*)\*(\S[^*]*)\*(?!\*)/g, (_, content) => chalk3.italic(content));
  return rendered.trimEnd();
  }
@@ -1226,8 +1333,8 @@ function createRainbowSeparator() {
  }
  return result;
  }
- function renderMarkdownWithSeparators(text) {
- const rendered = renderMarkdown(text);
+ function renderMarkdownWithSeparators(text2) {
+ const rendered = renderMarkdown(text2);
  const separator = createRainbowSeparator();
  return `
  ${separator}
@@ -1395,12 +1502,12 @@ var StreamPrinter = class {
  *
  * @param text - Text to write
  */
- write(text) {
- if (!text) {
+ write(text2) {
+ if (!text2) {
  return;
  }
- this.target.write(text);
- this.endedWithNewline = text.endsWith("\n");
+ this.target.write(text2);
+ this.endedWithNewline = text2.endsWith("\n");
  }
  /**
  * Ensures output ends with a newline by writing one if needed.
@@ -1879,7 +1986,7 @@ function addCompleteOptions(cmd, defaults) {
  OPTION_DESCRIPTIONS.maxTokens,
  createNumericParser({ label: "Max tokens", integer: true, min: 1 }),
  defaults?.["max-tokens"]
- ).option(OPTION_FLAGS.quiet, OPTION_DESCRIPTIONS.quiet, defaults?.quiet).option(OPTION_FLAGS.logLlmRequests, OPTION_DESCRIPTIONS.logLlmRequests, defaults?.["log-llm-requests"]);
+ ).option(OPTION_FLAGS.quiet, OPTION_DESCRIPTIONS.quiet, defaults?.quiet).option(OPTION_FLAGS.logLlmRequests, OPTION_DESCRIPTIONS.logLlmRequests, defaults?.["log-llm-requests"]).option(OPTION_FLAGS.inputImage, OPTION_DESCRIPTIONS.inputImage).option(OPTION_FLAGS.inputAudio, OPTION_DESCRIPTIONS.inputAudio);
  }
  function addAgentOptions(cmd, defaults) {
  const gadgetAccumulator = (value, previous = []) => [
@@ -1903,7 +2010,7 @@ function addAgentOptions(cmd, defaults) {
  OPTION_FLAGS.noBuiltinInteraction,
  OPTION_DESCRIPTIONS.noBuiltinInteraction,
  defaults?.["builtin-interaction"] !== false
- ).option(OPTION_FLAGS.quiet, OPTION_DESCRIPTIONS.quiet, defaults?.quiet).option(OPTION_FLAGS.logLlmRequests, OPTION_DESCRIPTIONS.logLlmRequests, defaults?.["log-llm-requests"]).option(OPTION_FLAGS.docker, OPTION_DESCRIPTIONS.docker).option(OPTION_FLAGS.dockerRo, OPTION_DESCRIPTIONS.dockerRo).option(OPTION_FLAGS.noDocker, OPTION_DESCRIPTIONS.noDocker).option(OPTION_FLAGS.dockerDev, OPTION_DESCRIPTIONS.dockerDev);
+ ).option(OPTION_FLAGS.quiet, OPTION_DESCRIPTIONS.quiet, defaults?.quiet).option(OPTION_FLAGS.logLlmRequests, OPTION_DESCRIPTIONS.logLlmRequests, defaults?.["log-llm-requests"]).option(OPTION_FLAGS.inputImage, OPTION_DESCRIPTIONS.inputImage).option(OPTION_FLAGS.inputAudio, OPTION_DESCRIPTIONS.inputAudio).option(OPTION_FLAGS.docker, OPTION_DESCRIPTIONS.docker).option(OPTION_FLAGS.dockerRo, OPTION_DESCRIPTIONS.dockerRo).option(OPTION_FLAGS.noDocker, OPTION_DESCRIPTIONS.noDocker).option(OPTION_FLAGS.dockerDev, OPTION_DESCRIPTIONS.dockerDev);
  }
  function configToCompleteOptions(config) {
  const result = {};
@@ -2108,6 +2215,22 @@ var AGENT_CONFIG_KEYS = /* @__PURE__ */ new Set([
  "docker-cwd-permission"
  // Override CWD mount permission for this profile
  ]);
+ var IMAGE_CONFIG_KEYS = /* @__PURE__ */ new Set([
+ "model",
+ "size",
+ "quality",
+ "count",
+ "output",
+ "quiet"
+ ]);
+ var SPEECH_CONFIG_KEYS = /* @__PURE__ */ new Set([
+ "model",
+ "voice",
+ "format",
+ "speed",
+ "output",
+ "quiet"
+ ]);
  var CUSTOM_CONFIG_KEYS = /* @__PURE__ */ new Set([
  ...COMPLETE_CONFIG_KEYS,
  ...AGENT_CONFIG_KEYS,
@@ -2368,6 +2491,75 @@ function validateAgentConfig(raw, section) {
  }
  return result;
  }
+ function validateImageConfig(raw, section) {
+ if (typeof raw !== "object" || raw === null) {
+ throw new ConfigError(`[${section}] must be a table`);
+ }
+ const rawObj = raw;
+ for (const key of Object.keys(rawObj)) {
+ if (!IMAGE_CONFIG_KEYS.has(key)) {
+ throw new ConfigError(`[${section}].${key} is not a valid option`);
+ }
+ }
+ const result = {};
+ if ("model" in rawObj) {
+ result.model = validateString(rawObj.model, "model", section);
+ }
+ if ("size" in rawObj) {
+ result.size = validateString(rawObj.size, "size", section);
+ }
+ if ("quality" in rawObj) {
+ result.quality = validateString(rawObj.quality, "quality", section);
+ }
+ if ("count" in rawObj) {
+ result.count = validateNumber(rawObj.count, "count", section, {
+ integer: true,
+ min: 1,
+ max: 10
+ });
+ }
+ if ("output" in rawObj) {
+ result.output = validateString(rawObj.output, "output", section);
+ }
+ if ("quiet" in rawObj) {
+ result.quiet = validateBoolean(rawObj.quiet, "quiet", section);
+ }
+ return result;
+ }
+ function validateSpeechConfig(raw, section) {
+ if (typeof raw !== "object" || raw === null) {
+ throw new ConfigError(`[${section}] must be a table`);
+ }
+ const rawObj = raw;
+ for (const key of Object.keys(rawObj)) {
+ if (!SPEECH_CONFIG_KEYS.has(key)) {
+ throw new ConfigError(`[${section}].${key} is not a valid option`);
+ }
+ }
+ const result = {};
+ if ("model" in rawObj) {
+ result.model = validateString(rawObj.model, "model", section);
+ }
+ if ("voice" in rawObj) {
+ result.voice = validateString(rawObj.voice, "voice", section);
+ }
+ if ("format" in rawObj) {
+ result.format = validateString(rawObj.format, "format", section);
+ }
+ if ("speed" in rawObj) {
+ result.speed = validateNumber(rawObj.speed, "speed", section, {
+ min: 0.25,
+ max: 4
+ });
+ }
+ if ("output" in rawObj) {
+ result.output = validateString(rawObj.output, "output", section);
+ }
+ if ("quiet" in rawObj) {
+ result.quiet = validateBoolean(rawObj.quiet, "quiet", section);
+ }
+ return result;
+ }
  function validateStringOrBoolean(value, field, section) {
  if (typeof value === "string" || typeof value === "boolean") {
  return value;
@@ -2490,6 +2682,10 @@ function validateConfig(raw, configPath) {
  result.complete = validateCompleteConfig(value, key);
  } else if (key === "agent") {
  result.agent = validateAgentConfig(value, key);
+ } else if (key === "image") {
+ result.image = validateImageConfig(value, key);
+ } else if (key === "speech") {
+ result.speech = validateSpeechConfig(value, key);
  } else if (key === "prompts") {
  result.prompts = validatePromptsConfig(value, key);
  } else if (key === "docker") {
@@ -2534,7 +2730,7 @@ function loadConfig() {
  return resolveTemplatesInConfig(inherited, configPath);
  }
  function getCustomCommandNames(config) {
- const reserved = /* @__PURE__ */ new Set(["global", "complete", "agent", "prompts", "docker"]);
+ const reserved = /* @__PURE__ */ new Set(["global", "complete", "agent", "image", "speech", "prompts", "docker"]);
  return Object.keys(config).filter((key) => !reserved.has(key));
  }
  function resolveTemplatesInConfig(config, configPath) {
@@ -3528,8 +3724,8 @@ Denied: ${result.reason ?? "by user"}`
  builder.withTextOnlyHandler("acknowledge");
  builder.withTextWithGadgetsHandler({
  gadgetName: "TellUser",
- parameterMapping: (text) => ({ message: text, done: false, type: "info" }),
- resultMapping: (text) => `\u2139\uFE0F ${text}`
+ parameterMapping: (text2) => ({ message: text2, done: false, type: "info" }),
+ resultMapping: (text2) => `\u2139\uFE0F ${text2}`
  });
  builder.withTrailingMessage(
  (ctx) => [
@@ -3538,7 +3734,19 @@ Denied: ${result.reason ?? "by user"}`
  "Maximize efficiency by batching independent operations in a single response."
  ].join(" ")
  );
- const agent = builder.ask(prompt);
+ let agent;
+ if (options.image || options.audio) {
+ const parts = [text(prompt)];
+ if (options.image) {
+ parts.push(await readImageFile(options.image));
+ }
+ if (options.audio) {
+ parts.push(await readAudioFile(options.audio));
+ }
+ agent = builder.askWithContent(parts);
+ } else {
+ agent = builder.ask(prompt);
+ }
  let textBuffer = "";
  const flushTextBuffer = () => {
  if (textBuffer) {
@@ -3613,6 +3821,7 @@ function registerAgentCommand(program, env, config) {
  }

  // src/cli/complete-command.ts
+ init_input_content();
  init_messages();
  init_model_shortcuts();
  init_constants();
@@ -3624,7 +3833,18 @@ async function executeComplete(promptArg, options, env) {
  if (options.system) {
  builder.addSystem(options.system);
  }
- builder.addUser(prompt);
+ if (options.image || options.audio) {
+ const parts = [text(prompt)];
+ if (options.image) {
+ parts.push(await readImageFile(options.image));
+ }
+ if (options.audio) {
+ parts.push(await readAudioFile(options.audio));
+ }
+ builder.addUserMultimodal(parts);
+ } else {
+ builder.addUser(prompt);
+ }
  const messages = builder.build();
  const llmLogsBaseDir = resolveLogDir(options.logLlmRequests, "requests");
  let llmSessionDir;
@@ -4129,19 +4349,118 @@ function registerGadgetCommand(program, env) {
  );
  }

+ // src/cli/image-command.ts
+ import { writeFileSync as writeFileSync2 } from "node:fs";
+ var DEFAULT_IMAGE_MODEL = "dall-e-3";
+ async function executeImage(promptArg, options, env) {
+ const prompt = await resolvePrompt(promptArg, env);
+ const client = env.createClient();
+ const model = options.model;
+ const n = options.count ? Number.parseInt(options.count, 10) : 1;
+ const stderrTTY = env.stderr.isTTY === true;
+ if (!options.quiet && stderrTTY) {
+ env.stderr.write(`${SUMMARY_PREFIX} Generating image with ${model}...
+ `);
+ }
+ const result = await client.image.generate({
+ model,
+ prompt,
+ size: options.size,
+ quality: options.quality,
+ n,
+ responseFormat: options.output ? "b64_json" : "url"
+ });
+ if (options.output) {
+ const imageData = result.images[0];
+ if (imageData.b64Json) {
+ const buffer = Buffer.from(imageData.b64Json, "base64");
+ writeFileSync2(options.output, buffer);
+ if (!options.quiet) {
+ env.stderr.write(`${SUMMARY_PREFIX} Image saved to ${options.output}
+ `);
+ }
+ } else if (imageData.url) {
+ env.stdout.write(`${imageData.url}
+ `);
+ }
+ } else {
+ for (const image of result.images) {
+ if (image.url) {
+ env.stdout.write(`${image.url}
+ `);
+ } else if (image.b64Json) {
+ env.stdout.write(image.b64Json);
+ }
+ }
+ }
+ if (!options.quiet && stderrTTY) {
+ const parts = [
+ `${result.images.length} image(s)`,
+ `size: ${result.usage.size}`,
+ `quality: ${result.usage.quality}`
+ ];
+ if (result.cost !== void 0) {
+ parts.push(`cost: ${formatCost(result.cost)}`);
+ }
+ env.stderr.write(`${SUMMARY_PREFIX} ${parts.join(" | ")}
+ `);
+ }
+ }
+ function registerImageCommand(program, env, config) {
+ program.command(COMMANDS.image).description("Generate images from a text prompt.").argument("[prompt]", "Image generation prompt. If omitted, stdin is used when available.").option(
+ OPTION_FLAGS.model,
+ OPTION_DESCRIPTIONS.model,
+ config?.model ?? DEFAULT_IMAGE_MODEL
+ ).option(OPTION_FLAGS.imageSize, OPTION_DESCRIPTIONS.imageSize, config?.size).option(OPTION_FLAGS.imageQuality, OPTION_DESCRIPTIONS.imageQuality, config?.quality).option(OPTION_FLAGS.imageCount, OPTION_DESCRIPTIONS.imageCount, config?.count?.toString()).option(OPTION_FLAGS.imageOutput, OPTION_DESCRIPTIONS.imageOutput, config?.output).option(OPTION_FLAGS.quiet, OPTION_DESCRIPTIONS.quiet, config?.quiet ?? false).action(
+ (prompt, options) => executeAction(() => executeImage(prompt, options, env), env)
+ );
+ }
+
  // src/cli/models-command.ts
  import chalk8 from "chalk";
  init_model_shortcuts();
  async function handleModelsCommand(options, env) {
  const client = env.createClient();
- const models = client.modelRegistry.listModels(options.provider);
+ const showText = options.all || options.text || !options.image && !options.speech;
+ const showImage = options.all || options.image;
+ const showSpeech = options.all || options.speech;
+ const textModels = showText ? client.modelRegistry.listModels(options.provider) : [];
+ const imageModels = showImage ? client.image.listModels().filter((m) => !options.provider || m.provider === options.provider) : [];
+ const speechModels = showSpeech ? client.speech.listModels().filter((m) => !options.provider || m.provider === options.provider) : [];
  if (options.format === "json") {
- renderJSON(models, env.stdout);
+ renderJSON(textModels, imageModels, speechModels, env.stdout);
  } else {
- renderTable(models, options.verbose || false, env.stdout);
+ renderAllTables(textModels, imageModels, speechModels, options.verbose || false, env.stdout);
+ }
+ }
+ function renderAllTables(textModels, imageModels, speechModels, verbose, stream) {
+ const hasAnyModels = textModels.length > 0 || imageModels.length > 0 || speechModels.length > 0;
+ if (!hasAnyModels) {
+ stream.write(chalk8.yellow("\nNo models found matching the specified criteria.\n\n"));
+ return;
+ }
+ stream.write(chalk8.bold.cyan("\nAvailable Models\n"));
+ stream.write(chalk8.cyan("=".repeat(80)) + "\n\n");
+ if (textModels.length > 0) {
+ renderTextTable(textModels, verbose, stream);
+ }
+ if (imageModels.length > 0) {
+ renderImageTable(imageModels, verbose, stream);
+ }
+ if (speechModels.length > 0) {
+ renderSpeechTable(speechModels, verbose, stream);
+ }
+ if (textModels.length > 0) {
+ stream.write(chalk8.bold.magenta("Model Shortcuts\n"));
+ stream.write(chalk8.dim("\u2500".repeat(80)) + "\n");
+ const shortcuts = Object.entries(MODEL_ALIASES).sort((a, b) => a[0].localeCompare(b[0]));
+ for (const [shortcut, fullName] of shortcuts) {
+ stream.write(chalk8.cyan(` ${shortcut.padEnd(15)}`) + chalk8.dim(" \u2192 ") + chalk8.white(fullName) + "\n");
+ }
+ stream.write("\n");
  }
  }
- function renderTable(models, verbose, stream) {
+ function renderTextTable(models, verbose, stream) {
  const grouped = /* @__PURE__ */ new Map();
  for (const model of models) {
  const provider = model.provider;
@@ -4150,13 +4469,13 @@ function renderTable(models, verbose, stream) {
  }
  grouped.get(provider).push(model);
  }
- stream.write(chalk8.bold.cyan("\nAvailable Models\n"));
- stream.write(chalk8.cyan("=".repeat(80)) + "\n\n");
+ stream.write(chalk8.bold.blue("\u{1F4DD} Text/LLM Models\n"));
+ stream.write(chalk8.dim("\u2500".repeat(80)) + "\n\n");
  const providers = Array.from(grouped.keys()).sort();
  for (const provider of providers) {
  const providerModels = grouped.get(provider);
  const providerName = provider.charAt(0).toUpperCase() + provider.slice(1);
- stream.write(chalk8.bold.yellow(`${providerName} Models
+ stream.write(chalk8.bold.yellow(`${providerName}
  `));
  if (verbose) {
  renderVerboseTable(providerModels, stream);
@@ -4165,13 +4484,6 @@ function renderTable(models, verbose, stream) {
  }
  stream.write("\n");
  }
- stream.write(chalk8.bold.magenta("Model Shortcuts\n"));
- stream.write(chalk8.dim("\u2500".repeat(80)) + "\n");
- const shortcuts = Object.entries(MODEL_ALIASES).sort((a, b) => a[0].localeCompare(b[0]));
- for (const [shortcut, fullName] of shortcuts) {
- stream.write(chalk8.cyan(` ${shortcut.padEnd(15)}`) + chalk8.dim(" \u2192 ") + chalk8.white(fullName) + "\n");
- }
- stream.write("\n");
  }
  function renderCompactTable(models, stream) {
  const idWidth = 25;
@@ -4248,9 +4560,171 @@ function renderVerboseTable(models, stream) {
  }
  stream.write("\n");
  }
- function renderJSON(models, stream) {
- const output = {
- models: models.map((model) => ({
+ function renderImageTable(models, verbose, stream) {
+ stream.write(chalk8.bold.green("\u{1F3A8} Image Generation Models\n"));
+ stream.write(chalk8.dim("\u2500".repeat(80)) + "\n\n");
+ const grouped = /* @__PURE__ */ new Map();
+ for (const model of models) {
+ if (!grouped.has(model.provider)) {
+ grouped.set(model.provider, []);
+ }
+ grouped.get(model.provider).push(model);
+ }
+ for (const [provider, providerModels] of Array.from(grouped.entries()).sort()) {
+ const providerName = provider.charAt(0).toUpperCase() + provider.slice(1);
+ stream.write(chalk8.bold.yellow(`${providerName}
+ `));
+ if (verbose) {
+ for (const model of providerModels) {
+ stream.write(chalk8.bold.green(`
+ ${model.modelId}
+ `));
+ stream.write(chalk8.dim(" " + "\u2500".repeat(60)) + "\n");
+ stream.write(` ${chalk8.dim("Name:")} ${chalk8.white(model.displayName)}
+ `);
+ stream.write(` ${chalk8.dim("Sizes:")} ${chalk8.yellow(model.supportedSizes.join(", "))}
+ `);
+ if (model.supportedQualities) {
+ stream.write(` ${chalk8.dim("Qualities:")} ${chalk8.yellow(model.supportedQualities.join(", "))}
+ `);
+ }
+ stream.write(` ${chalk8.dim("Max Images:")} ${chalk8.yellow(model.maxImages.toString())}
+ `);
+ stream.write(` ${chalk8.dim("Pricing:")} ${chalk8.cyan(formatImagePrice(model))}
+ `);
+ if (model.features) {
+ const features = [];
+ if (model.features.textRendering) features.push("text-rendering");
+ if (model.features.transparency) features.push("transparency");
+ if (model.features.conversational) features.push("conversational");
+ if (features.length > 0) {
+ stream.write(` ${chalk8.dim("Features:")} ${chalk8.blue(features.join(", "))}
+ `);
+ }
+ }
+ }
+ } else {
+ const idWidth = 32;
+ const nameWidth = 25;
+ const sizesWidth = 20;
+ const priceWidth = 15;
+ stream.write(chalk8.dim("\u2500".repeat(idWidth + nameWidth + sizesWidth + priceWidth + 6)) + "\n");
+ stream.write(
+ chalk8.bold(
+ "Model ID".padEnd(idWidth) + " " + "Display Name".padEnd(nameWidth) + " " + "Sizes".padEnd(sizesWidth) + " " + "Price".padEnd(priceWidth)
+ ) + "\n"
+ );
+ stream.write(chalk8.dim("\u2500".repeat(idWidth + nameWidth + sizesWidth + priceWidth + 6)) + "\n");
+ for (const model of providerModels) {
+ const sizes = model.supportedSizes.length > 2 ? model.supportedSizes.slice(0, 2).join(", ") + "..." : model.supportedSizes.join(", ");
+ stream.write(
+ chalk8.green(model.modelId.padEnd(idWidth)) + " " + chalk8.white(model.displayName.substring(0, nameWidth - 1).padEnd(nameWidth)) + " " + chalk8.yellow(sizes.padEnd(sizesWidth)) + " " + chalk8.cyan(formatImagePrice(model).padEnd(priceWidth)) + "\n"
+ );
+ }
+ stream.write(chalk8.dim("\u2500".repeat(idWidth + nameWidth + sizesWidth + priceWidth + 6)) + "\n");
+ }
+ stream.write("\n");
+ }
+ }
+ function renderSpeechTable(models, verbose, stream) {
+ stream.write(chalk8.bold.magenta("\u{1F3A4} Speech (TTS) Models\n"));
+ stream.write(chalk8.dim("\u2500".repeat(80)) + "\n\n");
+ const grouped = /* @__PURE__ */ new Map();
+ for (const model of models) {
+ if (!grouped.has(model.provider)) {
+ grouped.set(model.provider, []);
+ }
+ grouped.get(model.provider).push(model);
+ }
+ for (const [provider, providerModels] of Array.from(grouped.entries()).sort()) {
+ const providerName = provider.charAt(0).toUpperCase() + provider.slice(1);
+ stream.write(chalk8.bold.yellow(`${providerName}
+ `));
+ if (verbose) {
+ for (const model of providerModels) {
+ stream.write(chalk8.bold.green(`
+ ${model.modelId}
+ `));
+ stream.write(chalk8.dim(" " + "\u2500".repeat(60)) + "\n");
+ stream.write(` ${chalk8.dim("Name:")} ${chalk8.white(model.displayName)}
+ `);
+ stream.write(` ${chalk8.dim("Voices:")} ${chalk8.yellow(model.voices.length.toString())} voices
+ `);
+ if (model.voices.length <= 6) {
+ stream.write(` ${chalk8.dim(model.voices.join(", "))}
+ `);
+ } else {
+ stream.write(` ${chalk8.dim(model.voices.slice(0, 6).join(", ") + "...")}
+ `);
+ }
+ stream.write(` ${chalk8.dim("Formats:")} ${chalk8.yellow(model.formats.join(", "))}
+ `);
+ stream.write(` ${chalk8.dim("Max Input:")} ${chalk8.yellow(model.maxInputLength.toString())} chars
+ `);
+ stream.write(` ${chalk8.dim("Pricing:")} ${chalk8.cyan(formatSpeechPrice(model))}
+ `);
+ if (model.features) {
+ const features = [];
+ if (model.features.multiSpeaker) features.push("multi-speaker");
+ if (model.features.voiceInstructions) features.push("voice-instructions");
+ if (model.features.languages) features.push(`${model.features.languages} languages`);
+ if (features.length > 0) {
+ stream.write(` ${chalk8.dim("Features:")} ${chalk8.blue(features.join(", "))}
+ `);
+ }
+ }
+ }
+ } else {
+ const idWidth = 30;
+ const nameWidth = 28;
+ const voicesWidth = 12;
+ const priceWidth = 18;
+ stream.write(chalk8.dim("\u2500".repeat(idWidth + nameWidth + voicesWidth + priceWidth + 6)) + "\n");
+ stream.write(
+ chalk8.bold(
+ "Model ID".padEnd(idWidth) + " " + "Display Name".padEnd(nameWidth) + " " + "Voices".padEnd(voicesWidth) + " " + "Price".padEnd(priceWidth)
+ ) + "\n"
+ );
+ stream.write(chalk8.dim("\u2500".repeat(idWidth + nameWidth + voicesWidth + priceWidth + 6)) + "\n");
+ for (const model of providerModels) {
+ stream.write(
+ chalk8.green(model.modelId.padEnd(idWidth)) + " " + chalk8.white(model.displayName.substring(0, nameWidth - 1).padEnd(nameWidth)) + " " + chalk8.yellow(`${model.voices.length} voices`.padEnd(voicesWidth)) + " " + chalk8.cyan(formatSpeechPrice(model).padEnd(priceWidth)) + "\n"
+ );
+ }
+ stream.write(chalk8.dim("\u2500".repeat(idWidth + nameWidth + voicesWidth + priceWidth + 6)) + "\n");
+ }
+ stream.write("\n");
+ }
+ }
+ function formatImagePrice(model) {
+ if (model.pricing.perImage !== void 0) {
+ return `$${model.pricing.perImage.toFixed(2)}/img`;
+ }
+ if (model.pricing.bySize) {
+ const prices = Object.values(model.pricing.bySize);
+ const minPrice = Math.min(...prices.flatMap((p) => typeof p === "number" ? [p] : Object.values(p)));
+ const maxPrice = Math.max(...prices.flatMap((p) => typeof p === "number" ? [p] : Object.values(p)));
+ if (minPrice === maxPrice) {
+ return `$${minPrice.toFixed(2)}/img`;
+ }
+ return `$${minPrice.toFixed(2)}-${maxPrice.toFixed(2)}`;
+ }
+ return "varies";
+ }
+ function formatSpeechPrice(model) {
+ if (model.pricing.perCharacter !== void 0) {
+ const perMillion = model.pricing.perCharacter * 1e6;
+ return `$${perMillion.toFixed(0)}/1M chars`;
+ }
+ if (model.pricing.perMinute !== void 0) {
+ return `~$${model.pricing.perMinute.toFixed(2)}/min`;
+ }
+ return "varies";
+ }
+ function renderJSON(textModels, imageModels, speechModels, stream) {
+ const output = {};
+ if (textModels.length > 0) {
+ output.textModels = textModels.map((model) => ({
  provider: model.provider,
  modelId: model.modelId,
  displayName: model.displayName,
@@ -4266,9 +4740,33 @@ function renderJSON(models, stream) {
  knowledgeCutoff: model.knowledgeCutoff,
  features: model.features,
  metadata: model.metadata
- })),
- shortcuts: MODEL_ALIASES
- };
+ }));
+ output.shortcuts = MODEL_ALIASES;
+ }
+ if (imageModels.length > 0) {
+ output.imageModels = imageModels.map((model) => ({
+ provider: model.provider,
+ modelId: model.modelId,
+ displayName: model.displayName,
+ supportedSizes: model.supportedSizes,
+ supportedQualities: model.supportedQualities,
+ maxImages: model.maxImages,
+ pricing: model.pricing,
+ features: model.features
+ }));
+ }
+ if (speechModels.length > 0) {
+ output.speechModels = speechModels.map((model) => ({
+ provider: model.provider,
+ modelId: model.modelId,
+ displayName: model.displayName,
+ voices: model.voices,
+ formats: model.formats,
+ maxInputLength: model.maxInputLength,
+ pricing: model.pricing,
+ features: model.features
+ }));
+ }
  stream.write(JSON.stringify(output, null, 2) + "\n");
  }
  function formatTokens2(count) {
@@ -4281,7 +4779,7 @@ function formatTokens2(count) {
  }
  }
  function registerModelsCommand(program, env) {
- program.command(COMMANDS.models).description("List all available LLM models with pricing and capabilities.").option("--provider <name>", "Filter by provider (openai, anthropic, gemini)").option("--format <format>", "Output format: table or json", "table").option("--verbose", "Show detailed model information", false).action(
+ program.command(COMMANDS.models).description("List available models with pricing and capabilities.").option("--provider <name>", "Filter by provider (openai, anthropic, gemini)").option("--format <format>", "Output format: table or json", "table").option("--verbose", "Show detailed model information", false).option("--text", "Show text/LLM models (default if no type specified)").option("--image", "Show image generation models").option("--speech", "Show speech/TTS models").option("--all", "Show all model types (text, image, speech)").action(
  (options) => executeAction(
  () => handleModelsCommand(options, env),
  env
@@ -4289,6 +4787,96 @@ function registerModelsCommand(program, env) {
  );
  }

+ // src/cli/speech-command.ts
+ import { writeFileSync as writeFileSync3 } from "node:fs";
+ var DEFAULT_SPEECH_MODEL = "tts-1";
+ var DEFAULT_VOICE = "nova";
+ async function executeSpeech(textArg, options, env) {
+ const text2 = await resolvePrompt(textArg, env);
+ const client = env.createClient();
+ const model = options.model;
+ const voice = options.voice ?? DEFAULT_VOICE;
+ const speed = options.speed ? Number.parseFloat(options.speed) : void 0;
+ const stderrTTY = env.stderr.isTTY === true;
+ if (!options.quiet && stderrTTY) {
+ env.stderr.write(`${SUMMARY_PREFIX} Generating speech with ${model} (voice: ${voice})...
+ `);
+ }
+ const result = await client.speech.generate({
+ model,
+ input: text2,
+ voice,
+ responseFormat: options.format,
+ speed
+ });
+ const audioBuffer = Buffer.from(result.audio);
+ if (options.output) {
+ writeFileSync3(options.output, audioBuffer);
+ if (!options.quiet) {
+ env.stderr.write(`${SUMMARY_PREFIX} Audio saved to ${options.output}
+ `);
+ }
+ } else {
+ env.stdout.write(audioBuffer);
+ }
+ if (!options.quiet && stderrTTY) {
+ const parts = [
+ `${result.usage.characterCount} characters`,
+ `format: ${result.format}`
+ ];
+ if (result.cost !== void 0) {
+ parts.push(`cost: ${formatCost(result.cost)}`);
+ }
+ env.stderr.write(`${SUMMARY_PREFIX} ${parts.join(" | ")}
+ `);
+ }
+ }
+ function registerSpeechCommand(program, env, config) {
+ program.command(COMMANDS.speech).description("Generate speech audio from text.").argument("[text]", "Text to convert to speech. If omitted, stdin is used when available.").option(
+ OPTION_FLAGS.model,
+ OPTION_DESCRIPTIONS.model,
+ config?.model ?? DEFAULT_SPEECH_MODEL
+ ).option(OPTION_FLAGS.voice, OPTION_DESCRIPTIONS.voice, config?.voice ?? DEFAULT_VOICE).option(OPTION_FLAGS.speechFormat, OPTION_DESCRIPTIONS.speechFormat, config?.format).option(OPTION_FLAGS.speechSpeed, OPTION_DESCRIPTIONS.speechSpeed, config?.speed?.toString()).option(OPTION_FLAGS.speechOutput, OPTION_DESCRIPTIONS.speechOutput, config?.output).option(OPTION_FLAGS.quiet, OPTION_DESCRIPTIONS.quiet, config?.quiet ?? false).action(
+ (text2, options) => executeAction(() => executeSpeech(text2, options, env), env)
+ );
+ }
+
+ // src/cli/vision-command.ts
+ init_model_shortcuts();
+ async function executeVision(imagePath, options, env) {
+ const client = env.createClient();
+ const model = resolveModel(options.model);
+ const imageBuffer = await readFileBuffer(imagePath);
+ const prompt = options.prompt ?? "Describe this image in detail.";
+ const stderrTTY = env.stderr.isTTY === true;
+ if (!options.quiet && stderrTTY) {
+ env.stderr.write(`${SUMMARY_PREFIX} Analyzing image with ${model}...
+ `);
+ }
+ const result = await client.vision.analyze({
+ model,
+ image: imageBuffer,
+ prompt,
+ maxTokens: options.maxTokens
+ });
+ env.stdout.write(result);
+ env.stdout.write("\n");
+ }
+ function registerVisionCommand(program, env) {
+ program.command(COMMANDS.vision ?? "vision").description("Analyze an image using vision-capable models").argument("<image>", "Path to image file to analyze").option(
+ OPTION_FLAGS.model,
+ OPTION_DESCRIPTIONS.model,
+ "gpt-4o"
+ // Default to a vision-capable model
+ ).option("-p, --prompt <prompt>", "Analysis prompt describing what to extract or describe").option(
+ OPTION_FLAGS.maxTokens,
+ OPTION_DESCRIPTIONS.maxTokens,
+ createNumericParser({ label: "Max tokens", integer: true, min: 1 })
+ ).option(OPTION_FLAGS.quiet, OPTION_DESCRIPTIONS.quiet).action(
+ (imagePath, options) => executeAction(() => executeVision(imagePath, options, env), env)
+ );
+ }
+
  // src/cli/environment.ts
  init_client();
  init_logger();
@@ -4334,7 +4922,7 @@ function createLoggerFactory(config) {
  }
  function createPromptFunction(stdin, stdout) {
  return (question) => {
- return new Promise((resolve2) => {
+ return new Promise((resolve3) => {
  const rl = readline.createInterface({
  input: stdin,
  output: stdout
@@ -4349,7 +4937,7 @@ function createPromptFunction(stdin, stdout) {
  `);
  rl.question(chalk9.green.bold("You: "), (answer) => {
  rl.close();
- resolve2(answer);
+ resolve3(answer);
  });
  });
  };
@@ -4440,6 +5028,9 @@ function createProgram(env, config) {
  });
  registerCompleteCommand(program, env, config?.complete);
  registerAgentCommand(program, env, config?.agent);
+ registerImageCommand(program, env, config?.image);
+ registerSpeechCommand(program, env, config?.speech);
+ registerVisionCommand(program, env);
  registerModelsCommand(program, env);
  registerGadgetCommand(program, env);
  if (config) {