@kajidog/mcp-tts-voicevox 0.6.1 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -429,7 +429,7 @@ var init_dist = __esm({
429
429
  });
430
430
  if (!chunk) {
431
431
  if (i === 1) {
432
- await new Promise((resolve) => setTimeout(resolve));
432
+ await new Promise((resolve2) => setTimeout(resolve2));
433
433
  maxReadCount = 3;
434
434
  continue;
435
435
  }
@@ -576,7 +576,7 @@ var init_dist = __esm({
576
576
 
577
577
  // src/index.ts
578
578
  import { readFileSync as readFileSync2 } from "fs";
579
- import { dirname as dirname2, join as join2 } from "path";
579
+ import { dirname as dirname3, join as join4 } from "path";
580
580
  import { fileURLToPath as fileURLToPath2 } from "url";
581
581
 
582
582
  // ../../packages/mcp-core/dist/config.js
@@ -620,6 +620,12 @@ function parseBaseCliArgs(argv = process.argv.slice(2)) {
620
620
  i++;
621
621
  }
622
622
  break;
623
+ case "--api-key":
624
+ if (nextArg && !nextArg.startsWith("-")) {
625
+ config2.apiKey = nextArg;
626
+ i++;
627
+ }
628
+ break;
623
629
  }
624
630
  }
625
631
  return config2;
@@ -641,6 +647,9 @@ function parseBaseEnvVars(env = process.env) {
641
647
  if (env.MCP_ALLOWED_ORIGINS) {
642
648
  config2.allowedOrigins = env.MCP_ALLOWED_ORIGINS.split(",").map((o) => o.trim());
643
649
  }
650
+ if (env.MCP_API_KEY) {
651
+ config2.apiKey = env.MCP_API_KEY;
652
+ }
644
653
  return config2;
645
654
  }
646
655
  function filterUndefined(obj) {
@@ -1648,9 +1657,9 @@ var Context = class {
1648
1657
  * })
1649
1658
  * ```
1650
1659
  */
1651
- json = (object2, arg, headers) => {
1660
+ json = (object3, arg, headers) => {
1652
1661
  return this.#newResponse(
1653
- JSON.stringify(object2),
1662
+ JSON.stringify(object3),
1654
1663
  arg,
1655
1664
  setDefaultContentType("application/json", headers)
1656
1665
  );
@@ -2805,6 +2814,13 @@ function forbiddenError(message) {
2805
2814
  id: null
2806
2815
  };
2807
2816
  }
/**
 * Builds the JSON-RPC 2.0 error envelope used to reject an unauthenticated request.
 *
 * @param {string} message - Human-readable reason, stored as `error.message`.
 * @returns {{jsonrpc: string, error: {code: number, message: string}, id: null}}
 *   Error body with code -32001 and a null request id.
 */
function unauthorizedError(message) {
  const error = { code: -32001, message };
  return { jsonrpc: "2.0", error, id: null };
}
2808
2824
  function validateOrigin(config2) {
2809
2825
  return async (c, next) => {
2810
2826
  const origin = c.req.header("Origin");
@@ -2847,6 +2863,22 @@ function validateHost(config2) {
2847
2863
  return next();
2848
2864
  };
2849
2865
  }
/**
 * Creates request middleware that enforces API-key authentication.
 *
 * The check is skipped when no key is configured (`config2.apiKey` is falsy) and for
 * `OPTIONS` requests. Otherwise the key is taken from the `X-API-Key` header, falling back
 * to the token of an `Authorization: Bearer <token>` header, and must equal the configured
 * key.
 *
 * @param {{apiKey?: string}} config2 - Configuration object holding the expected API key.
 * @returns {(c: object, next: Function) => Promise<*>} Middleware that returns `next()` when
 *   the request is allowed, or a 401 JSON response built by `unauthorizedError` otherwise.
 */
function validateApiKey(config2) {
  // Secrets must not be compared with `!==`: string comparison stops at the first differing
  // character, which leaks timing information. Hashing both sides first gives equal-length
  // buffers, so timingSafeEqual never throws on a length mismatch and the key length is not
  // revealed either.
  const keysMatch = async (provided, expected) => {
    const { createHash, timingSafeEqual } = await import("node:crypto");
    const digest = (value) => createHash("sha256").update(value, "utf8").digest();
    return timingSafeEqual(digest(provided), digest(expected));
  };
  return async (c, next) => {
    if (!config2.apiKey || c.req.method === "OPTIONS") {
      return next();
    }
    const xApiKey = c.req.header("X-API-Key");
    const authorization = c.req.header("Authorization");
    const bearerToken = authorization?.startsWith("Bearer ") ? authorization.slice(7).trim() : void 0;
    // X-API-Key wins whenever it is present, even if a Bearer token is also supplied.
    const providedKey = xApiKey ?? bearerToken;
    // Only two strings can ever match; anything else (missing header, non-string config)
    // is rejected, exactly as the strict comparison did before.
    const isAuthorized = typeof providedKey === "string" && typeof config2.apiKey === "string" && await keysMatch(providedKey, config2.apiKey);
    if (!isAuthorized) {
      console.log("Rejected request with invalid API key");
      return c.json(unauthorizedError("Unauthorized: Invalid API key"), { status: 401 });
    }
    return next();
  };
}
2850
2882
  function createHttpApp(options) {
2851
2883
  const { server: server2, config: config2, serverFactory, extraCorsHeaders = [], onSessionInitialized, onSessionClosed } = options;
2852
2884
  const transports = /* @__PURE__ */ new Map();
@@ -2912,6 +2944,8 @@ function createHttpApp(options) {
2912
2944
  "mcp-session-id",
2913
2945
  "Last-Event-ID",
2914
2946
  "mcp-protocol-version",
2947
+ "X-API-Key",
2948
+ "Authorization",
2915
2949
  ...extraCorsHeaders
2916
2950
  ];
2917
2951
  app.use("/mcp", cors({
@@ -2922,6 +2956,7 @@ function createHttpApp(options) {
2922
2956
  }));
2923
2957
  app.use("/mcp", validateOrigin(config2));
2924
2958
  app.use("/mcp", validateHost(config2));
2959
+ app.use("/mcp", validateApiKey(config2));
2925
2960
  app.all("/mcp", handleMCP);
2926
2961
  app.get("/health", handleHealth);
2927
2962
  return app;
@@ -2969,7 +3004,7 @@ async function startHttpServer(options) {
2969
3004
  console.error(`Health check: http://${info.address}:${info.port}/health`);
2970
3005
  });
2971
3006
  }
2972
- await new Promise((resolve) => setTimeout(resolve, 1e3));
3007
+ await new Promise((resolve2) => setTimeout(resolve2, 1e3));
2973
3008
  console.error("HTTP server startup completed");
2974
3009
  } catch (error) {
2975
3010
  console.error("HTTP server startup failed:", error);
@@ -3016,6 +3051,7 @@ async function launchServer(options) {
3016
3051
  }
3017
3052
 
3018
3053
  // src/config.ts
3054
+ import { join } from "path";
3019
3055
  var defaultConfig = {
3020
3056
  ...defaultBaseConfig,
3021
3057
  voicevoxUrl: "http://localhost:50021",
@@ -3028,7 +3064,15 @@ var defaultConfig = {
3028
3064
  restrictImmediate: false,
3029
3065
  restrictWaitForStart: false,
3030
3066
  restrictWaitForEnd: false,
3067
+ playerDomain: "",
3031
3068
  autoPlay: true,
3069
+ playerExportEnabled: true,
3070
+ playerExportDir: join(process.cwd(), "voicevox-player-exports"),
3071
+ playerCacheDir: join(process.cwd(), ".voicevox-player-cache"),
3072
+ playerStateFile: join(process.cwd(), ".voicevox-player-cache", "player-state.json"),
3073
+ playerAudioCacheEnabled: true,
3074
+ playerAudioCacheTtlDays: 30,
3075
+ playerAudioCacheMaxMb: 512,
3032
3076
  disabledTools: []
3033
3077
  };
3034
3078
  function parseCliArgs(argv = process.argv.slice(2)) {
@@ -3095,6 +3139,48 @@ function parseCliArgs(argv = process.argv.slice(2)) {
3095
3139
  case "--no-auto-play":
3096
3140
  config2.autoPlay = false;
3097
3141
  break;
3142
+ case "--player-export":
3143
+ config2.playerExportEnabled = true;
3144
+ break;
3145
+ case "--no-player-export":
3146
+ config2.playerExportEnabled = false;
3147
+ break;
3148
+ case "--player-export-dir":
3149
+ if (nextArg && !nextArg.startsWith("-")) {
3150
+ config2.playerExportDir = nextArg;
3151
+ i++;
3152
+ }
3153
+ break;
3154
+ case "--player-cache-dir":
3155
+ if (nextArg && !nextArg.startsWith("-")) {
3156
+ config2.playerCacheDir = nextArg;
3157
+ i++;
3158
+ }
3159
+ break;
3160
+ case "--player-state-file":
3161
+ if (nextArg && !nextArg.startsWith("-")) {
3162
+ config2.playerStateFile = nextArg;
3163
+ i++;
3164
+ }
3165
+ break;
3166
+ case "--player-audio-cache":
3167
+ config2.playerAudioCacheEnabled = true;
3168
+ break;
3169
+ case "--no-player-audio-cache":
3170
+ config2.playerAudioCacheEnabled = false;
3171
+ break;
3172
+ case "--player-audio-cache-ttl-days":
3173
+ if (nextArg && !nextArg.startsWith("-")) {
3174
+ config2.playerAudioCacheTtlDays = Number(nextArg);
3175
+ i++;
3176
+ }
3177
+ break;
3178
+ case "--player-audio-cache-max-mb":
3179
+ if (nextArg && !nextArg.startsWith("-")) {
3180
+ config2.playerAudioCacheMaxMb = Number(nextArg);
3181
+ i++;
3182
+ }
3183
+ break;
3098
3184
  case "--disable-tools":
3099
3185
  if (nextArg && !nextArg.startsWith("-")) {
3100
3186
  config2.disabledTools = nextArg.split(",").map((t) => t.trim());
@@ -3138,9 +3224,35 @@ function parseEnvVars(env = process.env) {
3138
3224
  if (env.VOICEVOX_RESTRICT_WAIT_FOR_END === "true") {
3139
3225
  config2.restrictWaitForEnd = true;
3140
3226
  }
3227
+ if (env.VOICEVOX_PLAYER_DOMAIN) {
3228
+ config2.playerDomain = env.VOICEVOX_PLAYER_DOMAIN;
3229
+ }
3141
3230
  if (env.VOICEVOX_AUTO_PLAY !== void 0) {
3142
3231
  config2.autoPlay = env.VOICEVOX_AUTO_PLAY !== "false";
3143
3232
  }
3233
+ if (env.VOICEVOX_PLAYER_EXPORT_ENABLED !== void 0) {
3234
+ config2.playerExportEnabled = env.VOICEVOX_PLAYER_EXPORT_ENABLED !== "false";
3235
+ }
3236
+ if (env.VOICEVOX_PLAYER_EXPORT_DIR) {
3237
+ config2.playerExportDir = env.VOICEVOX_PLAYER_EXPORT_DIR;
3238
+ }
3239
+ if (env.VOICEVOX_PLAYER_CACHE_DIR) {
3240
+ config2.playerCacheDir = env.VOICEVOX_PLAYER_CACHE_DIR;
3241
+ }
3242
+ if (env.VOICEVOX_PLAYER_STATE_FILE) {
3243
+ config2.playerStateFile = env.VOICEVOX_PLAYER_STATE_FILE;
3244
+ }
3245
+ if (env.VOICEVOX_PLAYER_AUDIO_CACHE_ENABLED !== void 0) {
3246
+ config2.playerAudioCacheEnabled = env.VOICEVOX_PLAYER_AUDIO_CACHE_ENABLED !== "false";
3247
+ }
3248
+ if (env.VOICEVOX_PLAYER_AUDIO_CACHE_TTL_DAYS !== void 0) {
3249
+ const ttlDays = Number(env.VOICEVOX_PLAYER_AUDIO_CACHE_TTL_DAYS);
3250
+ if (Number.isFinite(ttlDays)) config2.playerAudioCacheTtlDays = ttlDays;
3251
+ }
3252
+ if (env.VOICEVOX_PLAYER_AUDIO_CACHE_MAX_MB !== void 0) {
3253
+ const maxMb = Number(env.VOICEVOX_PLAYER_AUDIO_CACHE_MAX_MB);
3254
+ if (Number.isFinite(maxMb)) config2.playerAudioCacheMaxMb = maxMb;
3255
+ }
3144
3256
  if (env.VOICEVOX_DISABLED_TOOLS) {
3145
3257
  config2.disabledTools = env.VOICEVOX_DISABLED_TOOLS.split(",").map((t) => t.trim());
3146
3258
  }
@@ -3149,11 +3261,16 @@ function parseEnvVars(env = process.env) {
3149
3261
  function getConfig(argv, env) {
3150
3262
  const cliConfig = parseCliArgs(argv);
3151
3263
  const envConfig = parseEnvVars(env);
3152
- return {
3264
+ const merged = {
3153
3265
  ...defaultConfig,
3154
3266
  ...filterUndefined(envConfig),
3155
3267
  ...filterUndefined(cliConfig)
3156
3268
  };
3269
+ const isPlayerStateFileExplicit = envConfig.playerStateFile !== void 0 || cliConfig.playerStateFile !== void 0;
3270
+ if (!isPlayerStateFileExplicit) {
3271
+ merged.playerStateFile = join(merged.playerCacheDir, "player-state.json");
3272
+ }
3273
+ return merged;
3157
3274
  }
3158
3275
 
3159
3276
  // src/server.ts
@@ -3161,29 +3278,81 @@ import { VoicevoxClient } from "@kajidog/voicevox-client";
3161
3278
  import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
3162
3279
 
3163
3280
  // src/tools/player.ts
3164
- import { randomUUID as randomUUID2 } from "crypto";
3165
- import { readFileSync } from "fs";
3166
- import { dirname, join } from "path";
3281
+ import { createHash, randomUUID as randomUUID2 } from "crypto";
3282
+ import { mkdirSync as mkdirSync2, readFileSync } from "fs";
3283
+ import { readdir, rename, stat, unlink, writeFile as writeFile2 } from "fs/promises";
3284
+ import { basename, dirname as dirname2, join as join3 } from "path";
3167
3285
  import { fileURLToPath } from "url";
3168
3286
  import { VoicevoxApi } from "@kajidog/voicevox-client";
3169
3287
  import { RESOURCE_MIME_TYPE, registerAppResource } from "@modelcontextprotocol/ext-apps/server";
3288
+ import * as z2 from "zod/v4";
3289
+
3290
+ // src/tools/player-cache-utils.ts
/**
 * Converts the raw audio-cache settings into the limits the cache works with.
 *
 * A limit of `0` switches the disk cache off; a negative limit is reported as `null`
 * (no limit). Otherwise days are converted to milliseconds and megabytes to bytes.
 *
 * @param {{enabledFlag: boolean, ttlDays: number, maxMb: number}} input - Raw settings.
 * @returns {{isDiskCacheEnabled: boolean, ttlMs: (number|null), maxBytes: (number|null)}}
 *   Effective cache policy.
 */
function resolveAudioCachePolicy(input) {
  const { enabledFlag, ttlDays, maxMb } = input;
  let ttlMs = null;
  if (!(ttlDays < 0)) {
    // Keep the factor-by-factor multiplication so fractional day values round the same way.
    ttlMs = ttlDays * 24 * 60 * 60 * 1e3;
  }
  let maxBytes = null;
  if (!(maxMb < 0)) {
    maxBytes = maxMb * 1024 * 1024;
  }
  return {
    isDiskCacheEnabled: enabledFlag && ttlDays !== 0 && maxMb !== 0,
    ttlMs,
    maxBytes,
  };
}
/**
 * Decides which cache entries should be removed, without touching the file system.
 *
 * Two passes are applied:
 *  1. When `ttlMs` is not null, every entry whose age (`now - mtimeMs`) exceeds it is selected.
 *  2. When `maxBytes` is not null and the remaining entries still exceed it, the oldest
 *     remaining entries are selected one by one until the total fits.
 *
 * @param {{entries: Array<{path: string, size: number, mtimeMs: number}>, now: number,
 *   ttlMs: (number|null), maxBytes: (number|null)}} input - Cache snapshot and limits.
 * @returns {Set<string>} Paths of the entries to delete.
 */
function planAudioCacheCleanup(input) {
  const { entries, now, ttlMs, maxBytes } = input;
  const expired = ttlMs === null ? [] : entries.filter((file) => now - file.mtimeMs > ttlMs);
  const toDelete = new Set(expired.map((file) => file.path));
  if (maxBytes === null) {
    return toDelete;
  }
  const survivors = entries.filter((file) => !toDelete.has(file.path));
  let remainingBytes = 0;
  for (const file of survivors) {
    remainingBytes += file.size;
  }
  // Only evict when actually over budget; this comparison is also what keeps a NaN limit
  // from evicting anything.
  if (remainingBytes > maxBytes) {
    const oldestFirst = survivors.slice().sort((left, right) => left.mtimeMs - right.mtimeMs);
    for (const file of oldestFirst) {
      if (remainingBytes <= maxBytes) break;
      toDelete.add(file.path);
      remainingBytes -= file.size;
    }
  }
  return toDelete;
}
3320
+
3321
+ // src/tools/player-ui-tools.ts
3322
+ import { spawn, spawnSync } from "child_process";
3323
+ import { constants, accessSync } from "fs";
3324
+ import { mkdir, writeFile } from "fs/promises";
3325
+ import { dirname, join as join2, resolve } from "path";
3170
3326
  import * as z from "zod/v4";
3171
3327
 
3172
3328
  // src/tools/registration.ts
3173
3329
  import { registerAppTool } from "@modelcontextprotocol/ext-apps/server";
// Namespace prepended to every publicly registered tool name.
var TOOL_PREFIX = "voicevox_";

/**
 * Returns the name a tool is registered under.
 *
 * Names that start with an underscore are returned unchanged; every other name gets
 * `TOOL_PREFIX` prepended.
 *
 * @param {string} name - Short tool name.
 * @returns {string} The name to register.
 */
function addToolPrefix(name) {
  return name.startsWith("_") ? name : `${TOOL_PREFIX}${name}`;
}
/**
 * Reports whether a tool has been disabled, accepting either spelling of its name.
 *
 * The tool counts as disabled when `disabledTools` contains the name as given or the
 * name returned by `addToolPrefix`.
 *
 * @param {Set<string>} disabledTools - Names listed as disabled.
 * @param {string} name - Short tool name.
 * @returns {boolean} True when the tool must not be registered.
 */
function isToolDisabled(disabledTools, name) {
  const candidates = [name, addToolPrefix(name)];
  return candidates.some((candidate) => disabledTools.has(candidate));
}
3174
3341
  function registerToolIfEnabled(server2, disabledTools, name, definition, handler) {
3175
- if (disabledTools.has(name)) {
3176
- console.error(`Tool "${name}" is disabled via configuration`);
3342
+ const fullName = addToolPrefix(name);
3343
+ if (isToolDisabled(disabledTools, name)) {
3344
+ console.error(`Tool "${fullName}" is disabled via configuration`);
3177
3345
  return;
3178
3346
  }
3179
- server2.registerTool(name, definition, handler);
3347
+ server2.registerTool(fullName, definition, handler);
3180
3348
  }
3181
3349
  function registerAppToolIfEnabled(server2, disabledTools, name, definition, handler) {
3182
- if (disabledTools.has(name)) {
3183
- console.error(`Tool "${name}" is disabled via configuration`);
3350
+ const fullName = addToolPrefix(name);
3351
+ if (isToolDisabled(disabledTools, name)) {
3352
+ console.error(`Tool "${fullName}" is disabled via configuration`);
3184
3353
  return;
3185
3354
  }
3186
- registerAppTool(server2, name, definition, handler);
3355
+ registerAppTool(server2, fullName, definition, handler);
3187
3356
  }
3188
3357
 
3189
3358
  // src/tools/utils.ts
@@ -3191,7 +3360,7 @@ var createErrorResponse = (error) => ({
3191
3360
  content: [
3192
3361
  {
3193
3362
  type: "text",
3194
- text: `\u30A8\u30E9\u30FC: ${error instanceof Error ? error.message : String(error)}`
3363
+ text: `Error: ${error instanceof Error ? error.message : String(error)}`
3195
3364
  }
3196
3365
  ],
3197
3366
  isError: true
@@ -3239,115 +3408,157 @@ var processTextInput = async (voicevoxClient, text, speaker, speedScale, playbac
3239
3408
  });
3240
3409
  };
3241
3410
 
3242
- // src/tools/player.ts
3243
- var __dirname = typeof import.meta.dirname === "string" ? import.meta.dirname : dirname(fileURLToPath(import.meta.url));
3244
- var playerHtml;
3245
- try {
3246
- const htmlPath = join(__dirname, "mcp-app.html");
3247
- playerHtml = readFileSync(htmlPath, "utf-8");
3248
- } catch {
3411
+ // src/tools/player-ui-tools.ts
// Memoizes lookups so each command is probed at most once per process.
var commandExistsCache = /* @__PURE__ */ new Map();

/**
 * Checks whether an executable can be found, caching the answer per command.
 *
 * On Windows `explorer` is reported as available without probing. Every other command is
 * looked up by running `where` (Windows) or `which` (elsewhere); exit status 0 means found.
 *
 * @param {string} command - Executable name to look up.
 * @returns {boolean} True when the command is considered available.
 */
function commandExists(command) {
  const cached = commandExistsCache.get(command);
  if (cached !== undefined) {
    return cached;
  }
  const onWindows = process.platform === "win32";
  let exists;
  if (onWindows && command === "explorer") {
    exists = true;
  } else {
    const { status } = spawnSync(onWindows ? "where" : "which", [command], { stdio: "ignore" });
    exists = status === 0;
  }
  commandExistsCache.set(command, exists);
  return exists;
}
/**
 * Reports whether this host can open a directory in a graphical file manager.
 *
 * The launcher checked is `explorer` on Windows, `open` on macOS and `xdg-open` on Linux.
 * On Linux either `DISPLAY` or `WAYLAND_DISPLAY` must also be set. Any other platform
 * yields false.
 *
 * @returns {boolean} True when a suitable launcher is available.
 */
function canOpenExplorer() {
  switch (process.platform) {
    case "win32":
      return commandExists("explorer");
    case "darwin":
      return commandExists("open");
    case "linux": {
      const { DISPLAY, WAYLAND_DISPLAY } = process.env;
      if (!(DISPLAY || WAYLAND_DISPLAY)) {
        return false;
      }
      return commandExists("xdg-open");
    }
    default:
      return false;
  }
}
/**
 * Reports whether this platform is one of those treated as having a directory dialog.
 *
 * @returns {boolean} True on Windows (`win32`) and macOS (`darwin`), false elsewhere.
 */
function canChooseDirectoryDialog() {
  return ["win32", "darwin"].includes(process.platform);
}
/**
 * Turns free text into a fragment that is safe to embed in a file name.
 *
 * The input is trimmed, each of the characters `< > : " / \ | ? *` and each control
 * character (0x00-0x1f) becomes `_`, every run of whitespace becomes a single `_`, and the
 * result is capped at 40 UTF-16 code units.
 *
 * @param {string} input - Text to sanitize.
 * @param {string} fallback - Value returned when nothing usable remains.
 * @returns {string} The sanitized fragment, or `fallback` when it would be empty.
 */
function sanitizeFilePart(input, fallback) {
  let value = input.trim().replace(/[<>:"/\\|?*\x00-\x1f]/g, "_").replace(/\s+/g, "_").slice(0, 40);
  // slice() counts UTF-16 code units, so the cut can land between the two halves of a
  // surrogate pair (an emoji, for example). Drop the orphaned high surrogate instead of
  // letting an ill-formed string reach the file system.
  if (/[\uD800-\uDBFF]$/.test(value)) {
    value = value.slice(0, -1);
  }
  return value.length > 0 ? value : fallback;
}
3441
+ function openDirectoryInExplorer(directoryPath) {
3249
3442
  try {
3250
- const htmlPath = join(__dirname, "..", "..", "node_modules", "@kajidog", "player-ui", "dist", "mcp-app.html");
3251
- playerHtml = readFileSync(htmlPath, "utf-8");
3443
+ const child = process.platform === "win32" ? spawn("explorer", [directoryPath], { detached: true, stdio: "ignore" }) : process.platform === "darwin" ? spawn("open", [directoryPath], { detached: true, stdio: "ignore" }) : spawn("xdg-open", [directoryPath], { detached: true, stdio: "ignore" });
3444
+ child.unref();
3445
+ return true;
3252
3446
  } catch {
3253
- console.error("Warning: player-ui HTML not found. Please build @kajidog/player-ui first.");
3254
- playerHtml = "<html><body><p>Player UI not available. Please build @kajidog/player-ui.</p></body></html>";
3447
+ return false;
3255
3448
  }
3256
3449
  }
3257
- var playerResourceUri = "ui://speak-player/player.html";
3258
- var speakerCache = null;
3259
- function registerPlayerTools(deps) {
3260
- const { server: server2, config: config2, disabledTools } = deps;
3261
- const playerVoicevoxApi = new VoicevoxApi(config2.voicevoxUrl);
3262
- const getSpeakerList = async () => {
3263
- if (speakerCache) return speakerCache;
3264
- try {
3265
- const speakers = await playerVoicevoxApi.getSpeakers();
3266
- speakerCache = speakers.flatMap(
3267
- (speaker) => speaker.styles.map((style) => ({
3268
- id: style.id,
3269
- name: style.name,
3270
- characterName: speaker.name,
3271
- uuid: speaker.speaker_uuid
3272
- }))
3273
- );
3274
- return speakerCache;
3275
- } catch {
3276
- return [];
3450
+ function showDirectoryPicker(defaultPath) {
3451
+ return new Promise((resolve2) => {
3452
+ if (process.platform === "win32") {
3453
+ const defaultPathB64 = defaultPath ? Buffer.from(defaultPath).toString("base64") : "";
3454
+ const psScript = `
3455
+ Add-Type -AssemblyName System.Windows.Forms
3456
+ $form = New-Object System.Windows.Forms.Form
3457
+ $form.TopMost = $true
3458
+ $form.ShowInTaskbar = $false
3459
+ $form.WindowState = 'Minimized'
3460
+ $dialog = New-Object System.Windows.Forms.FolderBrowserDialog
3461
+ $dialog.Description = "Select Export Folder"
3462
+ ${defaultPathB64 ? `$dialog.SelectedPath = [System.Text.Encoding]::UTF8.GetString([System.Convert]::FromBase64String("${defaultPathB64}"))` : ""}
3463
+ $dialog.ShowNewFolderButton = $true
3464
+ if ($dialog.ShowDialog($form) -eq [System.Windows.Forms.DialogResult]::OK) {
3465
+ Write-Output $dialog.SelectedPath
3466
+ }
3467
+ `;
3468
+ const child = spawn("powershell", ["-NoProfile", "-Command", psScript], { stdio: ["ignore", "pipe", "ignore"] });
3469
+ let output = "";
3470
+ child.stdout.on("data", (data) => {
3471
+ output += data.toString();
3472
+ });
3473
+ child.on("close", () => {
3474
+ const path = output.trim();
3475
+ resolve2(path || null);
3476
+ });
3477
+ } else if (process.platform === "darwin") {
3478
+ const script = `on run argv
3479
+ try
3480
+ ${defaultPath ? "set defaultArg to item 1 of argv" : ""}
3481
+ return POSIX path of (choose folder with prompt "Select Export Folder" ${defaultPath ? "default location POSIX file defaultArg" : ""})
3482
+ on error
3483
+ return ""
3484
+ end try
3485
+ end run`;
3486
+ const args = ["-e", script];
3487
+ if (defaultPath) args.push(defaultPath);
3488
+ const child = spawn("osascript", args, { stdio: ["ignore", "pipe", "ignore"] });
3489
+ let output = "";
3490
+ child.stdout.on("data", (data) => {
3491
+ output += data.toString();
3492
+ });
3493
+ child.on("close", () => {
3494
+ const path = output.trim();
3495
+ resolve2(path || null);
3496
+ });
3497
+ } else {
3498
+ resolve2(null);
3277
3499
  }
3278
- };
3279
- const getSpeakerName = async (speakerId) => {
3280
- const list = await getSpeakerList();
3281
- const found = list?.find((s) => s.id === speakerId);
3282
- return found ? `${found.characterName}\uFF08${found.name}\uFF09` : `Speaker ${speakerId}`;
3283
- };
3500
+ });
3501
+ }
/**
 * Tests whether a string is non-empty and made up only of characters in the range
 * ァ-ヶ plus the long-vowel mark ー.
 *
 * @param {string} input - Text to test.
 * @returns {boolean} True when every character is in the accepted set.
 */
function isKatakana(input) {
  const katakanaOnly = /^[ァ-ヶー]+$/;
  return katakanaOnly.test(input);
}
/**
 * Derives an accent-type value from a pronunciation string by counting its characters.
 *
 * The long-vowel mark (U+30FC) and the small kana listed below are not counted; every
 * other character counts as one. The result is never less than 1.
 *
 * @param {string} pronunciation - Pronunciation text to measure.
 * @returns {number} The count of remaining characters, with a minimum of 1.
 */
function estimateAccentType(pronunciation) {
  const smallKana = new Set(["\u30E3", "\u30E5", "\u30E7", "\u30A1", "\u30A3", "\u30A5", "\u30A7", "\u30A9", "\u30EE"]);
  const counted = [...pronunciation].filter(
    (char) => char !== "\u30FC" && !smallKana.has(char)
  );
  return Math.max(1, counted.length);
}
/**
 * Flattens a user dictionary keyed by word UUID into an array of camelCase records.
 *
 * Each entry's key becomes `wordUuid`, and `accent_type` is exposed as `accentType`.
 *
 * @param {Object<string, {surface: *, pronunciation: *, accent_type: *, priority: *}>} dictionary
 *   Dictionary object keyed by word UUID.
 * @returns {Array<{wordUuid: string, surface: *, pronunciation: *, accentType: *, priority: *}>}
 *   One record per dictionary entry, in `Object.entries` order.
 */
function normalizeUserDictionaryWords(dictionary) {
  const words = [];
  for (const [wordUuid, entry] of Object.entries(dictionary)) {
    const { surface, pronunciation, accent_type: accentType, priority } = entry;
    words.push({ wordUuid, surface, pronunciation, accentType, priority });
  }
  return words;
}
// Validation schema for one mora object: its text, an optional/nullable consonant and
// consonant length, a vowel with its length, and a pitch value.
var moraSchema = z.object({
  text: z.string(),
  consonant: z.string().nullable().optional(),
  consonant_length: z.number().nullable().optional(),
  vowel: z.string(),
  vowel_length: z.number(),
  pitch: z.number(),
});

// Validation schema for one accent phrase: a list of moras, an integer accent value, and
// optional/nullable `pause_mora` and `is_interrogative` fields.
var accentPhraseSchema = z.object({
  moras: z.array(moraSchema),
  accent: z.number().int(),
  pause_mora: moraSchema.nullable().optional(),
  is_interrogative: z.boolean().nullable().optional(),
});

// Validation schema for a complete audio query: the accent phrases plus the numeric
// scale/length settings and output options. `kana` and `pauseLengthScale` are optional.
var audioQuerySchema = z.object({
  accent_phrases: z.array(accentPhraseSchema),
  speedScale: z.number(),
  pitchScale: z.number(),
  intonationScale: z.number(),
  volumeScale: z.number(),
  prePhonemeLength: z.number(),
  postPhonemeLength: z.number(),
  outputSamplingRate: z.number(),
  outputStereo: z.boolean(),
  kana: z.string().optional(),
  pauseLengthScale: z.number().optional(),
});
3551
+ function registerPlayerUITools(deps, shared) {
3552
+ const { server: server2, disabledTools, config: config2 } = deps;
3553
+ const {
3554
+ playerVoicevoxApi,
3555
+ playerResourceUri: playerResourceUri2,
3556
+ synthesizeWithCache,
3557
+ setSessionState: setSessionState2,
3558
+ getSessionState: getSessionState2,
3559
+ getSpeakerList
3560
+ } = shared;
3284
3561
  const speakerIconCache = /* @__PURE__ */ new Map();
3285
- registerAppResource(
3286
- server2,
3287
- "VOICEVOX Player",
3288
- playerResourceUri,
3289
- {
3290
- description: "Audio player UI for VOICEVOX TTS",
3291
- mimeType: RESOURCE_MIME_TYPE
3292
- },
3293
- async () => ({
3294
- contents: [{ uri: playerResourceUri, mimeType: RESOURCE_MIME_TYPE, text: playerHtml }]
3295
- })
3296
- );
3297
- registerAppToolIfEnabled(
3298
- server2,
3299
- disabledTools,
3300
- "speak_player",
3301
- {
3302
- title: "Speak Player",
3303
- description: 'Convert text to speech and display an audio player in the UI. Audio is played in the browser, not on the server. Does not use the playback queue. Supports multi-speaker dialogue: prefix each line with speaker ID like "1:Hello\\n2:World".',
3304
- inputSchema: {
3305
- text: z.string().describe(
3306
- 'Text to convert to speech. Supports multi-speaker dialogue format with speaker ID prefix per line: "1:Hello\\n2:World". Each line is synthesized with the specified speaker and played sequentially.'
3307
- ),
3308
- speaker: z.number().optional().describe("Speaker ID (optional)"),
3309
- speedScale: z.number().optional().describe("Playback speed (optional, default from environment)"),
3310
- autoPlay: z.boolean().optional().describe("Auto-play audio when loaded (default: true)")
3311
- },
3312
- annotations: {
3313
- readOnlyHint: true,
3314
- destructiveHint: false,
3315
- idempotentHint: false,
3316
- openWorldHint: true
3317
- },
3318
- _meta: { ui: { resourceUri: playerResourceUri } }
3319
- },
3320
- async ({
3321
- text,
3322
- speaker,
3323
- speedScale,
3324
- autoPlay
3325
- }, extra) => {
3326
- try {
3327
- const effectiveSpeaker = getEffectiveSpeaker(speaker, extra.sessionId) ?? config2.defaultSpeaker;
3328
- const speed = speedScale ?? config2.defaultSpeedScale;
3329
- const segments = parseStringInput(text);
3330
- const firstSegment = segments[0];
3331
- if (!firstSegment) {
3332
- throw new Error("\u30C6\u30AD\u30B9\u30C8\u304C\u7A7A\u3067\u3059");
3333
- }
3334
- const speakerId = firstSegment.speaker ?? effectiveSpeaker;
3335
- const speakerName = await getSpeakerName(speakerId);
3336
- const fullText = segments.map((s) => s.text).join(" ");
3337
- return {
3338
- content: [
3339
- {
3340
- type: "text",
3341
- text: `Voicevox Player started: ${speakerName} \u300C${fullText.slice(0, 50)}${fullText.length > 50 ? "..." : ""}\u300D`
3342
- }
3343
- ],
3344
- _meta: { viewUUID: randomUUID2() }
3345
- };
3346
- } catch (error) {
3347
- return createErrorResponse(error);
3348
- }
3349
- }
3350
- );
3351
3562
  registerAppToolIfEnabled(
3352
3563
  server2,
3353
3564
  disabledTools,
@@ -3357,7 +3568,7 @@ function registerPlayerTools(deps) {
3357
3568
  description: "Get speaker list for the player UI. This tool is only callable from the app UI.",
3358
3569
  _meta: {
3359
3570
  ui: {
3360
- resourceUri: playerResourceUri,
3571
+ resourceUri: playerResourceUri2,
3361
3572
  visibility: ["app"]
3362
3573
  }
3363
3574
  }
@@ -3383,7 +3594,7 @@ function registerPlayerTools(deps) {
3383
3594
  },
3384
3595
  _meta: {
3385
3596
  ui: {
3386
- resourceUri: playerResourceUri,
3597
+ resourceUri: playerResourceUri2,
3387
3598
  visibility: ["app"]
3388
3599
  }
3389
3600
  }
@@ -3406,87 +3617,234 @@ function registerPlayerTools(deps) {
3406
3617
  }
3407
3618
  }
3408
3619
  );
3620
+ registerAppToolIfEnabled(
3621
+ server2,
3622
+ disabledTools,
3623
+ "_save_player_state_for_player",
3624
+ {
3625
+ title: "Save Player State (Player)",
3626
+ description: "Persist current player segments to server state without synthesizing audio. Only callable from the app UI.",
3627
+ inputSchema: {
3628
+ viewUUID: z.string().optional().describe("Player instance ID to associate this state with"),
3629
+ segments: z.array(
3630
+ z.object({
3631
+ text: z.string(),
3632
+ speaker: z.number(),
3633
+ speedScale: z.number().optional(),
3634
+ intonationScale: z.number().optional(),
3635
+ volumeScale: z.number().optional(),
3636
+ prePhonemeLength: z.number().optional(),
3637
+ postPhonemeLength: z.number().optional(),
3638
+ pauseLengthScale: z.number().optional(),
3639
+ audioQuery: audioQuerySchema.optional(),
3640
+ accentPhrases: z.array(accentPhraseSchema).optional()
3641
+ })
3642
+ ).describe("Full current player segment list to persist")
3643
+ },
3644
+ _meta: {
3645
+ ui: {
3646
+ resourceUri: playerResourceUri2,
3647
+ visibility: ["app"]
3648
+ }
3649
+ }
3650
+ },
3651
+ async ({
3652
+ viewUUID,
3653
+ segments
3654
+ }, extra) => {
3655
+ try {
3656
+ if (!segments || segments.length === 0) {
3657
+ throw new Error("segments is required");
3658
+ }
3659
+ const stateKey = viewUUID ?? extra?.sessionId ?? "global";
3660
+ const effectiveDefaultSpeaker = config2.defaultSpeaker;
3661
+ const effectiveSpeed = config2.defaultSpeedScale;
3662
+ const list = await getSpeakerList();
3663
+ const speakerNameMap = /* @__PURE__ */ new Map();
3664
+ for (const speakerId of [...new Set(segments.map((seg) => seg.speaker ?? effectiveDefaultSpeaker))]) {
3665
+ const found = list.find((entry) => entry.id === speakerId);
3666
+ speakerNameMap.set(speakerId, found ? `${found.characterName}\uFF08${found.name}\uFF09` : `Speaker ${speakerId}`);
3667
+ }
3668
+ setSessionState2(stateKey, {
3669
+ segments: segments.map((seg) => {
3670
+ const speakerId = seg.speaker ?? effectiveDefaultSpeaker;
3671
+ return {
3672
+ text: seg.text,
3673
+ speaker: speakerId,
3674
+ speakerName: speakerNameMap.get(speakerId) ?? `Speaker ${speakerId}`,
3675
+ kana: seg.audioQuery?.kana,
3676
+ speedScale: seg.speedScale ?? effectiveSpeed,
3677
+ intonationScale: seg.intonationScale,
3678
+ volumeScale: seg.volumeScale,
3679
+ prePhonemeLength: seg.prePhonemeLength,
3680
+ postPhonemeLength: seg.postPhonemeLength,
3681
+ pauseLengthScale: seg.pauseLengthScale,
3682
+ audioQuery: seg.audioQuery,
3683
+ accentPhrases: seg.audioQuery?.accent_phrases ?? seg.accentPhrases
3684
+ };
3685
+ }),
3686
+ updatedAt: Date.now()
3687
+ });
3688
+ return {
3689
+ content: [{ type: "text", text: JSON.stringify({ ok: true, viewUUID: stateKey, count: segments.length }) }]
3690
+ };
3691
+ } catch (error) {
3692
+ return createErrorResponse(error);
3693
+ }
3694
+ }
3695
+ );
3409
3696
  registerAppToolIfEnabled(
3410
3697
  server2,
3411
3698
  disabledTools,
3412
3699
  "_resynthesize_for_player",
3413
3700
  {
3414
3701
  title: "Resynthesize (Player)",
3415
- description: "Re-synthesize audio with a different speaker. Only callable from the app UI.",
3702
+ description: "Re-synthesize audio with a different speaker or updated parameters. Only callable from the app UI.",
3416
3703
  inputSchema: {
3704
+ viewUUID: z.string().optional().describe("Player instance ID to associate this synthesis with"),
3417
3705
  text: z.string().describe("Text to re-synthesize"),
3418
3706
  speaker: z.number().optional().describe("Speaker ID (uses server default if omitted)"),
3707
+ audioQuery: audioQuerySchema.optional().describe("Audio query to synthesize from (preferred over text parameters)"),
3419
3708
  speedScale: z.number().optional().describe("Playback speed (uses server default if omitted)"),
3709
+ intonationScale: z.number().optional().describe("Intonation scale \u6291\u63DA (optional)"),
3710
+ volumeScale: z.number().optional().describe("Volume scale \u97F3\u91CF (optional)"),
3711
+ prePhonemeLength: z.number().optional().describe("Pre-phoneme silence length in seconds (optional)"),
3712
+ postPhonemeLength: z.number().optional().describe("Post-phoneme silence length in seconds (optional)"),
3713
+ pauseLengthScale: z.number().optional().describe("Pause length scale between phrases \u9593\u306E\u9577\u3055 (optional)"),
3714
+ accentPhrases: z.array(accentPhraseSchema).optional().describe("Accent phrases override"),
3420
3715
  autoPlay: z.boolean().optional().describe("Auto-play audio when loaded (uses server config if omitted)"),
3716
+ segmentIndex: z.number().int().min(0).optional().describe("Segment index for single-segment state update"),
3717
+ persistState: z.boolean().optional().describe("Persist player state to server store (default: true)"),
3421
3718
  segments: z.array(
3422
3719
  z.object({
3423
3720
  text: z.string(),
3424
- speaker: z.number()
3721
+ speaker: z.number(),
3722
+ speedScale: z.number().optional(),
3723
+ intonationScale: z.number().optional(),
3724
+ volumeScale: z.number().optional(),
3725
+ prePhonemeLength: z.number().optional(),
3726
+ postPhonemeLength: z.number().optional(),
3727
+ pauseLengthScale: z.number().optional(),
3728
+ audioQuery: audioQuerySchema.optional(),
3729
+ accentPhrases: z.array(accentPhraseSchema).optional()
3425
3730
  })
3426
- ).optional().describe("Multi-speaker segments to synthesize individually")
3731
+ ).optional().describe("All current player segments \u2014 pass the full list to update server state")
3427
3732
  },
3428
3733
  _meta: {
3429
3734
  ui: {
3430
- resourceUri: playerResourceUri,
3735
+ resourceUri: playerResourceUri2,
3431
3736
  visibility: ["app"]
3432
3737
  }
3433
3738
  }
3434
3739
  },
3435
3740
  async ({
3741
+ viewUUID,
3436
3742
  text,
3437
3743
  speaker,
3744
+ audioQuery,
3438
3745
  speedScale,
3746
+ intonationScale,
3747
+ volumeScale,
3748
+ prePhonemeLength,
3749
+ postPhonemeLength,
3750
+ pauseLengthScale,
3751
+ accentPhrases,
3439
3752
  autoPlay,
3753
+ segmentIndex,
3754
+ persistState,
3440
3755
  segments
3441
- }) => {
3756
+ }, extra) => {
3442
3757
  try {
3443
3758
  const effectiveSpeed = speedScale ?? config2.defaultSpeedScale;
3444
3759
  const effectiveAutoPlay = autoPlay ?? config2.autoPlay;
3760
+ const shouldPersistState = persistState !== false;
3445
3761
  const effectiveDefaultSpeaker = speaker ?? config2.defaultSpeaker;
3446
- if (segments && segments.length > 0) {
3447
- const results = await Promise.all(
3448
- segments.map(async (seg) => {
3449
- const segSpeaker = seg.speaker ?? effectiveDefaultSpeaker;
3450
- const audioQuery2 = await playerVoicevoxApi.generateQuery(seg.text, segSpeaker);
3451
- audioQuery2.speedScale = effectiveSpeed;
3452
- const audioData2 = await playerVoicevoxApi.synthesize(audioQuery2, segSpeaker);
3453
- const base64Audio2 = Buffer.from(audioData2).toString("base64");
3454
- const segSpeakerName = await getSpeakerName(segSpeaker);
3762
+ const stateKey = viewUUID ?? extra?.sessionId ?? "global";
3763
+ if (segments && segments.length > 0 && shouldPersistState) {
3764
+ const list = await getSpeakerList();
3765
+ const speakerNameMap = /* @__PURE__ */ new Map();
3766
+ for (const speakerId of [...new Set(segments.map((seg) => seg.speaker ?? effectiveDefaultSpeaker))]) {
3767
+ const found = list.find((entry) => entry.id === speakerId);
3768
+ speakerNameMap.set(speakerId, found ? `${found.characterName}\uFF08${found.name}\uFF09` : `Speaker ${speakerId}`);
3769
+ }
3770
+ setSessionState2(stateKey, {
3771
+ segments: segments.map((seg) => {
3772
+ const speakerId = seg.speaker ?? effectiveDefaultSpeaker;
3455
3773
  return {
3456
- audioBase64: base64Audio2,
3457
3774
  text: seg.text,
3458
- speaker: segSpeaker,
3459
- speakerName: segSpeakerName
3775
+ speaker: speakerId,
3776
+ speakerName: speakerNameMap.get(speakerId) ?? `Speaker ${speakerId}`,
3777
+ kana: seg.audioQuery?.kana,
3778
+ speedScale: seg.speedScale ?? effectiveSpeed,
3779
+ intonationScale: seg.intonationScale,
3780
+ volumeScale: seg.volumeScale,
3781
+ prePhonemeLength: seg.prePhonemeLength,
3782
+ postPhonemeLength: seg.postPhonemeLength,
3783
+ pauseLengthScale: seg.pauseLengthScale,
3784
+ audioQuery: seg.audioQuery,
3785
+ accentPhrases: seg.audioQuery?.accent_phrases ?? seg.accentPhrases
3460
3786
  };
3461
- })
3462
- );
3463
- return {
3464
- content: [
3465
- {
3466
- type: "text",
3467
- text: JSON.stringify({
3468
- segments: results,
3469
- autoPlay: effectiveAutoPlay
3470
- })
3471
- }
3472
- ]
3473
- };
3787
+ }),
3788
+ updatedAt: Date.now()
3789
+ });
3790
+ }
3791
+ const result = await synthesizeWithCache({
3792
+ text,
3793
+ speaker: effectiveDefaultSpeaker,
3794
+ audioQuery,
3795
+ speedScale: effectiveSpeed,
3796
+ intonationScale,
3797
+ volumeScale,
3798
+ prePhonemeLength,
3799
+ postPhonemeLength,
3800
+ pauseLengthScale,
3801
+ accentPhrases
3802
+ });
3803
+ if (shouldPersistState && segmentIndex !== void 0) {
3804
+ const prev = getSessionState2(stateKey);
3805
+ if (prev?.segments[segmentIndex]) {
3806
+ const nextSegments = prev.segments.slice();
3807
+ nextSegments[segmentIndex] = {
3808
+ ...nextSegments[segmentIndex],
3809
+ text: result.text,
3810
+ speaker: result.speaker,
3811
+ speakerName: result.speakerName,
3812
+ kana: result.kana,
3813
+ audioQuery: result.audioQuery,
3814
+ accentPhrases: result.accentPhrases,
3815
+ speedScale: result.speedScale,
3816
+ intonationScale: result.intonationScale,
3817
+ volumeScale: result.volumeScale,
3818
+ prePhonemeLength: result.prePhonemeLength,
3819
+ postPhonemeLength: result.postPhonemeLength,
3820
+ pauseLengthScale: result.pauseLengthScale
3821
+ };
3822
+ setSessionState2(stateKey, {
3823
+ segments: nextSegments,
3824
+ updatedAt: Date.now()
3825
+ });
3826
+ }
3474
3827
  }
3475
- const audioQuery = await playerVoicevoxApi.generateQuery(text, effectiveDefaultSpeaker);
3476
- audioQuery.speedScale = effectiveSpeed;
3477
- const audioData = await playerVoicevoxApi.synthesize(audioQuery, effectiveDefaultSpeaker);
3478
- const base64Audio = Buffer.from(audioData).toString("base64");
3479
- const speakerName = await getSpeakerName(effectiveDefaultSpeaker);
3480
3828
  return {
3481
3829
  content: [
3482
3830
  {
3483
3831
  type: "text",
3484
3832
  text: JSON.stringify({
3485
- audioBase64: base64Audio,
3486
- text,
3487
- speaker: effectiveDefaultSpeaker,
3488
- speakerName,
3489
- autoPlay: effectiveAutoPlay
3833
+ audioBase64: result.audioBase64,
3834
+ text: result.text,
3835
+ speaker: result.speaker,
3836
+ speakerName: result.speakerName,
3837
+ kana: result.kana,
3838
+ audioQuery: result.audioQuery,
3839
+ accentPhrases: result.accentPhrases,
3840
+ speedScale: result.speedScale,
3841
+ intonationScale: result.intonationScale,
3842
+ volumeScale: result.volumeScale,
3843
+ prePhonemeLength: result.prePhonemeLength,
3844
+ postPhonemeLength: result.postPhonemeLength,
3845
+ pauseLengthScale: result.pauseLengthScale,
3846
+ autoPlay: effectiveAutoPlay,
3847
+ viewUUID
3490
3848
  })
3491
3849
  }
3492
3850
  ]
@@ -3496,53 +3854,1172 @@ function registerPlayerTools(deps) {
3496
3854
  }
3497
3855
  }
3498
3856
  );
3499
- }
3500
-
3501
- // src/tools/speak.ts
3502
- import * as z2 from "zod/v4";
3503
- function buildSpeakInputSchema(restrictions) {
3504
- const schema = {
3505
- text: z2.string().describe(
3506
- 'Text split by line breaks (\\n). IMPORTANT: Each line = one speech unit (processed and played separately). Keep the FIRST LINE SHORT for quick playback start - audio begins as soon as the first line is synthesized. Example: "Hi!\\nThis is a longer explanation that follows." Optional speaker prefix per line: "1:Hello\\n2:World"'
3507
- ),
3508
- query: z2.string().optional().describe("Voice synthesis query"),
3509
- speaker: z2.number().optional().describe("Default speaker ID (optional)"),
3510
- speedScale: z2.number().optional().describe("Playback speed (optional, default from environment)")
3511
- };
3512
- if (!restrictions.immediate) {
3513
- schema.immediate = z2.boolean().optional().describe(
3514
- "If true, stops current playback and plays new audio immediately. If false, waits for current playback to finish. Default depends on environment variable."
3515
- );
3516
- }
3517
- if (!restrictions.waitForStart) {
3518
- schema.waitForStart = z2.boolean().optional().describe("Wait for playback to start (optional, default: false)");
3519
- }
3520
- if (!restrictions.waitForEnd) {
3521
- schema.waitForEnd = z2.boolean().optional().describe("Wait for playback to end (optional, default: false)");
3522
- }
3523
- return schema;
3524
- }
3525
- function registerSpeakTool(deps) {
3526
- const { server: server2, voicevoxClient, config: config2, disabledTools, restrictions } = deps;
3527
- registerToolIfEnabled(
3857
+ registerAppToolIfEnabled(
3528
3858
  server2,
3529
3859
  disabledTools,
3530
- "speak",
3860
+ "_get_user_dictionary_for_player",
3531
3861
  {
3532
- title: "Speak",
3533
- description: "Convert text to speech and play it. Text is split by line breaks (\\n) into separate speech units. Each line is processed as an independent audio segment.",
3534
- inputSchema: buildSpeakInputSchema(restrictions),
3535
- annotations: {
3536
- readOnlyHint: false,
3537
- destructiveHint: false,
3538
- idempotentHint: false,
3539
- openWorldHint: true
3862
+ title: "Get User Dictionary (Player)",
3863
+ description: "Get VOICEVOX user dictionary words for the dictionary manager UI.",
3864
+ _meta: {
3865
+ ui: {
3866
+ resourceUri: playerResourceUri2,
3867
+ visibility: ["app"]
3868
+ }
3540
3869
  }
3541
3870
  },
3542
- async ({
3543
- text,
3544
- speaker,
3545
- query,
3871
+ async () => {
3872
+ try {
3873
+ const dictionary = await playerVoicevoxApi.getUserDictionary();
3874
+ return {
3875
+ content: [{ type: "text", text: JSON.stringify({ words: normalizeUserDictionaryWords(dictionary) }) }]
3876
+ };
3877
+ } catch (error) {
3878
+ return createErrorResponse(error);
3879
+ }
3880
+ }
3881
+ );
3882
+ registerAppToolIfEnabled(
3883
+ server2,
3884
+ disabledTools,
3885
+ "_add_user_dictionary_word_for_player",
3886
+ {
3887
+ title: "Add User Dictionary Word (Player)",
3888
+ description: "Add a word to VOICEVOX user dictionary.",
3889
+ inputSchema: {
3890
+ surface: z.string().describe("Word surface form"),
3891
+ pronunciation: z.string().describe("Katakana reading"),
3892
+ priority: z.number().int().min(0).max(10).optional().describe("Priority 0-10")
3893
+ },
3894
+ _meta: {
3895
+ ui: {
3896
+ resourceUri: playerResourceUri2,
3897
+ visibility: ["app"]
3898
+ }
3899
+ }
3900
+ },
3901
+ async ({
3902
+ surface,
3903
+ pronunciation,
3904
+ priority
3905
+ }) => {
3906
+ try {
3907
+ const normalizedSurface = surface.trim();
3908
+ const normalizedPronunciation = pronunciation.trim();
3909
+ if (!normalizedSurface) throw new Error("surface is required");
3910
+ if (!normalizedPronunciation) throw new Error("pronunciation is required");
3911
+ if (!isKatakana(normalizedPronunciation)) throw new Error("pronunciation must be Katakana");
3912
+ await playerVoicevoxApi.addUserDictionaryWord({
3913
+ surface: normalizedSurface,
3914
+ pronunciation: normalizedPronunciation,
3915
+ accentType: estimateAccentType(normalizedPronunciation),
3916
+ priority: priority ?? 5
3917
+ });
3918
+ const dictionary = await playerVoicevoxApi.getUserDictionary();
3919
+ return {
3920
+ content: [{ type: "text", text: JSON.stringify({ words: normalizeUserDictionaryWords(dictionary) }) }]
3921
+ };
3922
+ } catch (error) {
3923
+ return createErrorResponse(error);
3924
+ }
3925
+ }
3926
+ );
3927
+ registerAppToolIfEnabled(
3928
+ server2,
3929
+ disabledTools,
3930
+ "_update_user_dictionary_word_for_player",
3931
+ {
3932
+ title: "Update User Dictionary Word (Player)",
3933
+ description: "Update a VOICEVOX user dictionary word.",
3934
+ inputSchema: {
3935
+ wordUuid: z.string().describe("Dictionary word UUID"),
3936
+ surface: z.string().describe("Word surface form"),
3937
+ pronunciation: z.string().describe("Katakana reading"),
3938
+ priority: z.number().int().min(0).max(10).optional().describe("Priority 0-10")
3939
+ },
3940
+ _meta: {
3941
+ ui: {
3942
+ resourceUri: playerResourceUri2,
3943
+ visibility: ["app"]
3944
+ }
3945
+ }
3946
+ },
3947
+ async ({
3948
+ wordUuid,
3949
+ surface,
3950
+ pronunciation,
3951
+ priority
3952
+ }) => {
3953
+ try {
3954
+ const normalizedSurface = surface.trim();
3955
+ const normalizedPronunciation = pronunciation.trim();
3956
+ if (!wordUuid.trim()) throw new Error("wordUuid is required");
3957
+ if (!normalizedSurface) throw new Error("surface is required");
3958
+ if (!normalizedPronunciation) throw new Error("pronunciation is required");
3959
+ if (!isKatakana(normalizedPronunciation)) throw new Error("pronunciation must be Katakana");
3960
+ await playerVoicevoxApi.updateUserDictionaryWord({
3961
+ wordUuid: wordUuid.trim(),
3962
+ surface: normalizedSurface,
3963
+ pronunciation: normalizedPronunciation,
3964
+ accentType: estimateAccentType(normalizedPronunciation),
3965
+ priority: priority ?? 5
3966
+ });
3967
+ const dictionary = await playerVoicevoxApi.getUserDictionary();
3968
+ return {
3969
+ content: [{ type: "text", text: JSON.stringify({ words: normalizeUserDictionaryWords(dictionary) }) }]
3970
+ };
3971
+ } catch (error) {
3972
+ return createErrorResponse(error);
3973
+ }
3974
+ }
3975
+ );
3976
+ registerAppToolIfEnabled(
3977
+ server2,
3978
+ disabledTools,
3979
+ "_delete_user_dictionary_word_for_player",
3980
+ {
3981
+ title: "Delete User Dictionary Word (Player)",
3982
+ description: "Delete a VOICEVOX user dictionary word.",
3983
+ inputSchema: {
3984
+ wordUuid: z.string().describe("Dictionary word UUID")
3985
+ },
3986
+ _meta: {
3987
+ ui: {
3988
+ resourceUri: playerResourceUri2,
3989
+ visibility: ["app"]
3990
+ }
3991
+ }
3992
+ },
3993
+ async ({ wordUuid }) => {
3994
+ try {
3995
+ const normalizedWordUuid = wordUuid.trim();
3996
+ if (!normalizedWordUuid) throw new Error("wordUuid is required");
3997
+ await playerVoicevoxApi.deleteUserDictionaryWord(normalizedWordUuid);
3998
+ const dictionary = await playerVoicevoxApi.getUserDictionary();
3999
+ return {
4000
+ content: [{ type: "text", text: JSON.stringify({ words: normalizeUserDictionaryWords(dictionary) }) }]
4001
+ };
4002
+ } catch (error) {
4003
+ return createErrorResponse(error);
4004
+ }
4005
+ }
4006
+ );
4007
+ registerAppToolIfEnabled(
4008
+ server2,
4009
+ disabledTools,
4010
+ "_preview_dictionary_word_for_player",
4011
+ {
4012
+ title: "Preview Dictionary Word (Player)",
4013
+ description: "Preview pronunciation with a random speaker.",
4014
+ inputSchema: {
4015
+ text: z.string().describe("Text to preview")
4016
+ },
4017
+ _meta: {
4018
+ ui: {
4019
+ resourceUri: playerResourceUri2,
4020
+ visibility: ["app"]
4021
+ }
4022
+ }
4023
+ },
4024
+ async ({ text }) => {
4025
+ try {
4026
+ const normalizedText = text.trim();
4027
+ if (!normalizedText) throw new Error("text is required");
4028
+ const speakers = await getSpeakerList();
4029
+ if (speakers.length === 0) throw new Error("No speakers available");
4030
+ const randomSpeaker = speakers[Math.floor(Math.random() * speakers.length)];
4031
+ const result = await synthesizeWithCache({
4032
+ text: normalizedText,
4033
+ speaker: randomSpeaker.id,
4034
+ speedScale: config2.defaultSpeedScale
4035
+ });
4036
+ return {
4037
+ content: [
4038
+ {
4039
+ type: "text",
4040
+ text: JSON.stringify({
4041
+ audioBase64: result.audioBase64,
4042
+ speaker: result.speaker,
4043
+ speakerName: result.speakerName,
4044
+ kana: result.kana
4045
+ })
4046
+ }
4047
+ ]
4048
+ };
4049
+ } catch (error) {
4050
+ return createErrorResponse(error);
4051
+ }
4052
+ }
4053
+ );
4054
+ registerAppToolIfEnabled(
4055
+ server2,
4056
+ disabledTools,
4057
+ "_get_export_capability_for_player",
4058
+ {
4059
+ title: "Get Export Capability (Player)",
4060
+ description: "Return whether track export + folder open is available for player UI.",
4061
+ _meta: {
4062
+ ui: {
4063
+ resourceUri: playerResourceUri2,
4064
+ visibility: ["app"]
4065
+ }
4066
+ }
4067
+ },
4068
+ async () => {
4069
+ const canExport = config2.playerExportEnabled;
4070
+ const canChooseDirectory = canExport && canChooseDirectoryDialog();
4071
+ const canOpenDirectory = canExport && canOpenExplorer();
4072
+ return {
4073
+ content: [
4074
+ {
4075
+ type: "text",
4076
+ text: JSON.stringify({
4077
+ available: canExport,
4078
+ canChooseDirectory,
4079
+ canOpenDirectory,
4080
+ defaultOutputDir: config2.playerExportDir
4081
+ })
4082
+ }
4083
+ ]
4084
+ };
4085
+ }
4086
+ );
4087
+ registerAppToolIfEnabled(
4088
+ server2,
4089
+ disabledTools,
4090
+ "_select_directory_for_player",
4091
+ {
4092
+ title: "Select Export Directory (Player)",
4093
+ description: "Open a native OS directory picker dialog, to be called from the player UI.",
4094
+ inputSchema: {
4095
+ defaultPath: z.string().optional().describe("Default directory path to show")
4096
+ },
4097
+ _meta: {
4098
+ ui: {
4099
+ resourceUri: playerResourceUri2,
4100
+ visibility: ["app"]
4101
+ }
4102
+ }
4103
+ },
4104
+ async ({ defaultPath }) => {
4105
+ try {
4106
+ const selected = await showDirectoryPicker(defaultPath || config2.playerExportDir);
4107
+ return {
4108
+ content: [
4109
+ {
4110
+ type: "text",
4111
+ text: JSON.stringify({ path: selected })
4112
+ }
4113
+ ]
4114
+ };
4115
+ } catch (error) {
4116
+ return createErrorResponse(error);
4117
+ }
4118
+ }
4119
+ );
4120
+ registerAppToolIfEnabled(
4121
+ server2,
4122
+ disabledTools,
4123
+ "_export_tracks_for_player",
4124
+ {
4125
+ title: "Export Tracks (Player)",
4126
+ description: "Save player tracks as wav files and open the target folder in file explorer.",
4127
+ inputSchema: {
4128
+ outputDir: z.string().optional().describe("Output directory path (optional)"),
4129
+ segments: z.array(
4130
+ z.object({
4131
+ audioBase64: z.string().describe("WAV data in base64"),
4132
+ text: z.string().describe("Segment text"),
4133
+ speaker: z.number().describe("Speaker ID"),
4134
+ speakerName: z.string().describe("Speaker display name")
4135
+ })
4136
+ ).describe("Tracks to export")
4137
+ },
4138
+ _meta: {
4139
+ ui: {
4140
+ resourceUri: playerResourceUri2,
4141
+ visibility: ["app"]
4142
+ }
4143
+ }
4144
+ },
4145
+ async ({
4146
+ outputDir,
4147
+ segments
4148
+ }) => {
4149
+ try {
4150
+ if (!config2.playerExportEnabled) {
4151
+ throw new Error("Track export is disabled by VOICEVOX_PLAYER_EXPORT_ENABLED=false");
4152
+ }
4153
+ if (!segments || segments.length === 0) {
4154
+ throw new Error("No tracks to export");
4155
+ }
4156
+ const rawTarget = outputDir?.trim() || config2.playerExportDir;
4157
+ const targetDir = resolve(rawTarget);
4158
+ const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
4159
+ const sessionDir = join2(targetDir, `voicevox-${timestamp}`);
4160
+ await mkdir(sessionDir, { recursive: true });
4161
+ const files = [];
4162
+ for (let i = 0; i < segments.length; i++) {
4163
+ const seg = segments[i];
4164
+ const indexPart = String(i + 1).padStart(2, "0");
4165
+ const speakerPart = sanitizeFilePart(seg.speakerName || `speaker-${seg.speaker}`, `speaker-${seg.speaker}`);
4166
+ const textPart = sanitizeFilePart(seg.text, `segment-${i + 1}`);
4167
+ const fileName = `${indexPart}-${speakerPart}-${textPart}.wav`;
4168
+ const filePath = join2(sessionDir, fileName);
4169
+ await writeFile(filePath, Buffer.from(seg.audioBase64, "base64"));
4170
+ files.push(filePath);
4171
+ }
4172
+ let warning;
4173
+ let openedDirectory = false;
4174
+ if (canOpenExplorer()) {
4175
+ if (process.platform === "win32") {
4176
+ try {
4177
+ const child = spawn("explorer.exe", [sessionDir], { detached: true, stdio: "ignore" });
4178
+ child.unref();
4179
+ openedDirectory = true;
4180
+ } catch (e) {
4181
+ console.error("Failed to open explorer:", e);
4182
+ warning = `WAV\u30D5\u30A1\u30A4\u30EB\u306F\u4FDD\u5B58\u3055\u308C\u307E\u3057\u305F\u304C\u3001\u30D5\u30A9\u30EB\u30C0\u3092\u958B\u3051\u307E\u305B\u3093\u3067\u3057\u305F: ${sessionDir}`;
4183
+ }
4184
+ } else if (openDirectoryInExplorer(sessionDir)) {
4185
+ openedDirectory = true;
4186
+ } else {
4187
+ warning = `WAV\u30D5\u30A1\u30A4\u30EB\u306F\u4FDD\u5B58\u3055\u308C\u307E\u3057\u305F\u304C\u3001\u30D5\u30A9\u30EB\u30C0\u3092\u958B\u3051\u307E\u305B\u3093\u3067\u3057\u305F: ${sessionDir}`;
4188
+ }
4189
+ } else {
4190
+ warning = `WAV\u30D5\u30A1\u30A4\u30EB\u306F\u4FDD\u5B58\u3055\u308C\u307E\u3057\u305F\u3002\u73FE\u5728\u306E\u74B0\u5883\u3067\u306F\u30D5\u30A9\u30EB\u30C0\u81EA\u52D5\u30AA\u30FC\u30D7\u30F3\u306B\u5BFE\u5FDC\u3057\u3066\u3044\u307E\u305B\u3093: ${sessionDir}`;
4191
+ }
4192
+ return {
4193
+ content: [
4194
+ {
4195
+ type: "text",
4196
+ text: JSON.stringify({
4197
+ ok: true,
4198
+ outputDir: sessionDir,
4199
+ count: files.length,
4200
+ files,
4201
+ openedDirectory,
4202
+ warning
4203
+ })
4204
+ }
4205
+ ]
4206
+ };
4207
+ } catch (error) {
4208
+ return createErrorResponse(error);
4209
+ }
4210
+ }
4211
+ );
4212
+ }
4213
+
4214
+ // src/tools/player.ts
4215
// Directory of this module: `import.meta.dirname` when the runtime provides it,
// otherwise derived from `import.meta.url`.
var __dirname = typeof import.meta.dirname === "string" ? import.meta.dirname : dirname2(fileURLToPath(import.meta.url));
// Player UI HTML. Candidate locations are tried in order; the first file that
// can be read wins, and a placeholder page is used when none can be read.
var playerHtml;
for (const relativeParts of [
  ["mcp-app.html"],
  ["..", "..", "node_modules", "@kajidog", "player-ui", "dist", "mcp-app.html"]
]) {
  try {
    playerHtml = readFileSync(join3(__dirname, ...relativeParts), "utf-8");
    break;
  } catch {
    // This candidate could not be read; fall through to the next one.
  }
}
if (playerHtml === undefined) {
  console.error("Warning: player-ui HTML not found. Please build @kajidog/player-ui first.");
  playerHtml = "<html><body><p>Player UI not available. Please build @kajidog/player-ui.</p></body></html>";
}
4229
// Resource URI under which the player UI is exposed.
var playerResourceUri = "ui://speak-player/player.html";
// Flattened speaker/style list; stays null until it has been fetched once.
var speakerCache = null;
// Guards initializePlayerStorage so it only runs once.
var playerStorageInitialized = false;

// --- Audio cache: location and in-memory layer ---
var audioCacheDir = join3(process.cwd(), ".voicevox-player-cache");
// cache key -> base64-encoded audio
var audioCacheMem = /* @__PURE__ */ new Map();
// On-disk cache files are named "<64 lowercase hex chars>.txt".
var AUDIO_CACHE_FILE_PATTERN = /^[a-f0-9]{64}\.txt$/;

// --- Audio cache: defaults ---
var DEFAULT_AUDIO_CACHE_TTL_DAYS = 30;
var DEFAULT_AUDIO_CACHE_MAX_MB = 512;
// A cleanup pass is requested after this many cache writes.
var AUDIO_CACHE_CLEANUP_EVERY_WRITES = 20;

// --- Audio cache: effective settings (reassigned by initializePlayerStorage) ---
var audioCacheEnabledFlag = true;
var audioCacheTtlDays = DEFAULT_AUDIO_CACHE_TTL_DAYS;
var audioCacheMaxMb = DEFAULT_AUDIO_CACHE_MAX_MB;
// A TTL or size limit of exactly 0 turns the disk cache off.
var isAudioDiskCacheEnabled = audioCacheEnabledFlag && !(audioCacheTtlDays === 0 || audioCacheMaxMb === 0);
// Negative settings map to null instead of a numeric limit.
var audioCacheTtlMs = audioCacheTtlDays < 0 ? null : audioCacheTtlDays * 24 * 60 * 60 * 1000;
var audioCacheMaxBytes = audioCacheMaxMb < 0 ? null : audioCacheMaxMb * 1024 * 1024;

// --- Audio cache: cleanup scheduling state ---
var isAudioCacheCleanupRunning = false;
var pendingAudioCacheCleanup = false;
var writesSinceLastAudioCleanup = 0;
4247
/**
 * Deletes the on-disk audio cache files selected by `planAudioCacheCleanup`
 * and drops the matching keys from the in-memory cache.
 *
 * Does nothing when the disk cache is disabled. Failures are logged as a
 * warning and never propagated to the caller.
 *
 * @returns {Promise<void>}
 */
async function cleanupAudioCacheFiles() {
  if (!isAudioDiskCacheEnabled) return;
  // A file that cannot be stat'ed is simply left out of the cleanup plan.
  const statOrNull = async (target) => {
    try {
      return await stat(target);
    } catch {
      return null;
    }
  };
  try {
    const dirEntries = await readdir(audioCacheDir, { withFileTypes: true });
    const now = Date.now();
    const candidates = [];
    for (const dirEntry of dirEntries) {
      // Only regular files that follow the cache naming scheme are considered.
      const isCacheFile = dirEntry.isFile() && AUDIO_CACHE_FILE_PATTERN.test(dirEntry.name);
      if (!isCacheFile) continue;
      const fullPath = join3(audioCacheDir, dirEntry.name);
      const info = await statOrNull(fullPath);
      if (info === null) continue;
      candidates.push({
        name: dirEntry.name,
        path: fullPath,
        size: info.size,
        mtimeMs: info.mtimeMs
      });
    }
    const doomedPaths = planAudioCacheCleanup({
      entries: candidates,
      now,
      ttlMs: audioCacheTtlMs,
      maxBytes: audioCacheMaxBytes
    });
    if (doomedPaths.size === 0) return;
    for (const doomedPath of doomedPaths) {
      try {
        await unlink(doomedPath);
      } catch {
        // Best effort: a failed delete does not stop the remaining deletions.
      }
      // The in-memory key is the file name without its ".txt" suffix.
      const fileName = basename(doomedPath);
      if (fileName.endsWith(".txt")) {
        audioCacheMem.delete(fileName.slice(0, -4));
      }
    }
  } catch (error) {
    console.warn("Warning: failed to cleanup VOICEVOX player audio cache:", error);
  }
}
4285
/**
 * Requests a background cleanup pass over the on-disk audio cache.
 *
 * Unforced calls are counted, and only every AUDIO_CACHE_CLEANUP_EVERY_WRITES-th
 * call starts a pass. If a pass is already running, the request is remembered
 * and one more pass is started once the current one settles.
 *
 * @param {boolean} [force=false] Skip the write counter and request a pass now.
 * @returns {void}
 */
function scheduleAudioCacheCleanup(force = false) {
  if (!isAudioDiskCacheEnabled) return;
  if (!force) {
    writesSinceLastAudioCleanup += 1;
    const belowThreshold = writesSinceLastAudioCleanup < AUDIO_CACHE_CLEANUP_EVERY_WRITES;
    if (belowThreshold) return;
  }
  writesSinceLastAudioCleanup = 0;
  if (isAudioCacheCleanupRunning) {
    // Coalesce: any number of requests during a pass lead to one follow-up pass.
    pendingAudioCacheCleanup = true;
    return;
  }
  isAudioCacheCleanupRunning = true;
  const reportFailure = (error) => console.warn("Warning: failed to cleanup VOICEVOX player audio cache:", error);
  const onSettled = () => {
    isAudioCacheCleanupRunning = false;
    if (pendingAudioCacheCleanup) {
      pendingAudioCacheCleanup = false;
      scheduleAudioCacheCleanup(true);
    }
  };
  // Fire-and-forget: the caller never waits for the cleanup to finish.
  void cleanupAudioCacheFiles().catch(reportFailure).finally(onSettled);
}
4305
// In-memory player state, keyed by view UUID / session id / "global".
var playerSessionState = /* @__PURE__ */ new Map();
// NOTE(review): the next three limits are not used in the visible part of the
// file; presumably they bound tool payload size and state paging - confirm.
var MAX_TOOL_CONTENT_BYTES = 1024 * 1024;
var DEFAULT_STATE_PAGE_LIMIT = 100;
var MAX_STATE_PAGE_LIMIT = 1000;
// At most this many states are kept when the store is written to disk.
var MAX_PERSISTED_STATES = 500;
// States older than this (30 days, in milliseconds) are dropped on save and load.
var MAX_STATE_AGE_MS = 30 * 24 * 60 * 60 * 1000;
// Default location of the persisted state file (reassigned by initializePlayerStorage).
var stateFilePath = join3(audioCacheDir, "player-state.json");
4312
/**
 * Builds the cache key for a synthesis request: the SHA-256 hex digest of a
 * JSON description of the request.
 *
 * When `input.audioQuery` is present the key covers speaker, text and the
 * whole query. Otherwise it covers speaker, text, the individual scale/length
 * parameters (rounded to 4 decimal places) and the accent phrases.
 *
 * A missing `speedScale` is now encoded as `null`, like the other optional
 * numeric fields, instead of throwing a TypeError. Keys for inputs that
 * already worked are unchanged.
 *
 * @param {object} input
 * @param {number} input.speaker
 * @param {string} input.text
 * @param {object} [input.audioQuery]
 * @param {number} [input.speedScale]
 * @param {number} [input.intonationScale]
 * @param {number} [input.volumeScale]
 * @param {number} [input.prePhonemeLength]
 * @param {number} [input.postPhonemeLength]
 * @param {number} [input.pauseLengthScale]
 * @param {Array} [input.accentPhrases]
 * @returns {string} 64-character lowercase hex digest.
 */
function createAudioCacheKey(input) {
  // Round to 4 decimals so insignificant float noise does not change the key;
  // absent values are encoded as null so they stay part of the key shape.
  const roundOrNull = (value) => value == null ? null : Number(value.toFixed(4));
  // Property order is part of the key: JSON.stringify emits keys in insertion order.
  const keyInput = input.audioQuery ? JSON.stringify({
    speaker: input.speaker,
    text: input.text,
    audioQuery: input.audioQuery
  }) : JSON.stringify({
    speaker: input.speaker,
    text: input.text,
    speedScale: roundOrNull(input.speedScale),
    intonationScale: roundOrNull(input.intonationScale),
    volumeScale: roundOrNull(input.volumeScale),
    prePhonemeLength: roundOrNull(input.prePhonemeLength),
    postPhonemeLength: roundOrNull(input.postPhonemeLength),
    pauseLengthScale: roundOrNull(input.pauseLengthScale),
    accentPhrases: input.accentPhrases ?? null
  });
  return createHash("sha256").update(keyInput).digest("hex");
}
4330
/**
 * Looks up cached base64 audio for a cache key.
 *
 * The in-memory cache is consulted first. On a miss, and only when the disk
 * cache is enabled, `<audioCacheDir>/<cacheKey2>.txt` is read; a non-empty
 * result is stored in the in-memory cache before it is returned.
 *
 * @param {string} cacheKey2 Cache key.
 * @returns {string|null} Base64 audio, or null when nothing usable is cached.
 */
function readCachedAudioBase64(cacheKey2) {
  const memoryHit = audioCacheMem.get(cacheKey2);
  if (memoryHit) return memoryHit;
  if (!isAudioDiskCacheEnabled) return null;
  const filePath = join3(audioCacheDir, `${cacheKey2}.txt`);
  let fromDisk;
  try {
    fromDisk = readFileSync(filePath, "utf-8").trim();
  } catch {
    // An unreadable file (most commonly: not cached yet) counts as a miss.
    return null;
  }
  if (fromDisk.length === 0) return null;
  audioCacheMem.set(cacheKey2, fromDisk);
  return fromDisk;
}
4345
/**
 * Stores base64 audio under a cache key.
 *
 * The in-memory cache is always updated. When the disk cache is enabled the
 * value is also written to `<audioCacheDir>/<cacheKey2>.txt` and a cleanup
 * pass is requested; a failure there is logged as a warning and not thrown.
 *
 * @param {string} cacheKey2 Cache key.
 * @param {string} base64 Base64-encoded audio.
 * @returns {Promise<void>}
 */
async function writeCachedAudioBase64(cacheKey2, base64) {
  audioCacheMem.set(cacheKey2, base64);
  if (isAudioDiskCacheEnabled) {
    const target = join3(audioCacheDir, `${cacheKey2}.txt`);
    try {
      await writeFile2(target, base64, "utf-8");
      scheduleAudioCacheCleanup();
    } catch (error) {
      console.warn("Warning: failed to write VOICEVOX player cache:", error);
    }
  }
}
4356
/**
 * Prunes the in-memory session store and writes it to `stateFilePath`.
 *
 * Entries older than MAX_STATE_AGE_MS are dropped, and only the
 * MAX_PERSISTED_STATES most recently updated entries are kept. The in-memory
 * map is rebuilt from the retained entries, newest first. The payload is
 * written to a ".tmp" sibling and then renamed over the real file. Failures
 * are logged as a warning and not thrown.
 *
 * @returns {Promise<void>}
 */
async function saveSessionStateToDisk() {
  try {
    const now = Date.now();
    const fresh = [];
    for (const pair of playerSessionState.entries()) {
      if (now - pair[1].updatedAt <= MAX_STATE_AGE_MS) fresh.push(pair);
    }
    fresh.sort((a, b) => b[1].updatedAt - a[1].updatedAt);
    const retained = fresh.slice(0, MAX_PERSISTED_STATES);
    // Rebuild the map so memory matches exactly what is persisted.
    playerSessionState.clear();
    retained.forEach(([key, state]) => {
      playerSessionState.set(key, state);
    });
    const payload = JSON.stringify({
      version: 1,
      savedAt: now,
      entries: retained
    });
    // Write-then-rename so a partially written payload never sits at stateFilePath.
    const tempPath = `${stateFilePath}.tmp`;
    await writeFile2(tempPath, payload, "utf-8");
    await rename(tempPath, stateFilePath);
  } catch (error) {
    console.warn("Warning: failed to persist player state:", error);
  }
}
4376
// Handle of the pending debounced save, or null when none is scheduled.
var saveDebounceTimer = null;
/**
 * Schedules a write of the session store to disk 300 ms from now. A call made
 * while a save is still pending cancels it and restarts the delay.
 *
 * @returns {void}
 */
function scheduleStateSave() {
  const flush = () => {
    saveDebounceTimer = null;
    saveSessionStateToDisk().catch((e) => console.warn("Warning: failed to persist player state:", e));
  };
  if (saveDebounceTimer !== null) {
    clearTimeout(saveDebounceTimer);
  }
  saveDebounceTimer = setTimeout(flush, 300);
}
4384
/**
 * Loads persisted session states from `stateFilePath` into the in-memory store.
 *
 * Only well-formed `[key, state]` pairs are accepted: a non-empty string key
 * and a state with a numeric `updatedAt` and an array `segments`. States older
 * than MAX_STATE_AGE_MS are skipped. A missing or unparsable file is ignored.
 *
 * @returns {void}
 */
function loadSessionStateFromDisk() {
  try {
    const parsed = JSON.parse(readFileSync(stateFilePath, "utf-8"));
    if (!Array.isArray(parsed.entries)) return;
    const now = Date.now();
    const isUsable = (entry) => {
      if (!Array.isArray(entry) || entry.length !== 2) return false;
      const [key, state] = entry;
      const hasValidKey = Boolean(key) && typeof key === "string";
      if (!hasValidKey) return false;
      const hasValidState = Boolean(state) && typeof state.updatedAt === "number" && Array.isArray(state.segments);
      if (!hasValidState) return false;
      return now - state.updatedAt <= MAX_STATE_AGE_MS;
    };
    for (const entry of parsed.entries) {
      if (isUsable(entry)) {
        playerSessionState.set(entry[0], entry[1]);
      }
    }
  } catch {
    // Best effort: without a readable state file the store is left as it is.
  }
}
4401
/**
 * Stores a player state under `key` in the in-memory store and schedules a
 * debounced write of the store to disk.
 *
 * @param {string} key Store key.
 * @param {object} state Player state to store.
 * @returns {void}
 */
function setSessionState(key, state) {
  // Update memory first so the scheduled save sees the new state.
  playerSessionState.set(key, state);
  scheduleStateSave();
}
4405
/**
 * Resolves the stored player state for a view.
 *
 * Lookup order: the state stored under `viewUUID` (when given and present),
 * then the state stored under `sessionId`, or under "global" when `sessionId`
 * is null or undefined.
 *
 * @param {string} [viewUUID] View identifier.
 * @param {string} [sessionId] Session identifier.
 * @returns {object|undefined} The stored state, or undefined when none exists.
 */
function getSessionState(viewUUID, sessionId) {
  const byView = viewUUID ? playerSessionState.get(viewUUID) : undefined;
  if (byView) return byView;
  const bySession = playerSessionState.get(sessionId ?? "global");
  return bySession ? bySession : undefined;
}
4415
/**
 * One-time setup of the player's storage: applies cache settings from the
 * config, creates the cache and state directories, requests an initial cache
 * cleanup when the disk cache is enabled, and loads persisted session state.
 * Calls after the first one return immediately.
 *
 * @param {object} config2 Server configuration. Reads `playerCacheDir`,
 *   `playerStateFile`, `playerAudioCacheEnabled`, `playerAudioCacheTtlDays`
 *   and `playerAudioCacheMaxMb`.
 * @returns {void}
 */
function initializePlayerStorage(config2) {
  if (playerStorageInitialized) return;
  playerStorageInitialized = true;

  const finiteOr = (value, fallback) => Number.isFinite(value) ? value : fallback;

  // Empty or missing paths fall back to the defaults; the state file defaults
  // to living inside the (possibly overridden) cache directory.
  audioCacheDir = config2.playerCacheDir || audioCacheDir;
  stateFilePath = config2.playerStateFile || join3(audioCacheDir, "player-state.json");
  // The cache counts as enabled unless it is explicitly set to false.
  audioCacheEnabledFlag = config2.playerAudioCacheEnabled !== false;
  audioCacheTtlDays = finiteOr(config2.playerAudioCacheTtlDays, DEFAULT_AUDIO_CACHE_TTL_DAYS);
  audioCacheMaxMb = finiteOr(config2.playerAudioCacheMaxMb, DEFAULT_AUDIO_CACHE_MAX_MB);

  const { isDiskCacheEnabled, ttlMs, maxBytes } = resolveAudioCachePolicy({
    enabledFlag: audioCacheEnabledFlag,
    ttlDays: audioCacheTtlDays,
    maxMb: audioCacheMaxMb
  });
  isAudioDiskCacheEnabled = isDiskCacheEnabled;
  audioCacheTtlMs = ttlMs;
  audioCacheMaxBytes = maxBytes;

  try {
    mkdirSync2(audioCacheDir, { recursive: true });
    if (isAudioDiskCacheEnabled) {
      scheduleAudioCacheCleanup(true);
    }
  } catch (error) {
    console.warn("Warning: failed to create VOICEVOX player cache directory:", error);
  }

  try {
    const stateDir = dirname2(stateFilePath);
    mkdirSync2(stateDir, { recursive: true });
  } catch (error) {
    console.warn("Warning: failed to prepare player state directory:", error);
  }

  loadSessionStateFromDisk();
}
4446
+ function registerPlayerTools(deps) {
4447
+ const { server: server2, config: config2, disabledTools } = deps;
4448
+ initializePlayerStorage(config2);
4449
+ const playerVoicevoxApi = new VoicevoxApi(config2.voicevoxUrl);
4450
+ const getSpeakerList = async () => {
4451
+ if (speakerCache) return speakerCache;
4452
+ try {
4453
+ const speakers = await playerVoicevoxApi.getSpeakers();
4454
+ speakerCache = speakers.flatMap(
4455
+ (speaker) => speaker.styles.map((style) => ({
4456
+ id: style.id,
4457
+ name: style.name,
4458
+ characterName: speaker.name,
4459
+ uuid: speaker.speaker_uuid
4460
+ }))
4461
+ );
4462
+ return speakerCache;
4463
+ } catch {
4464
+ return [];
4465
+ }
4466
+ };
4467
+ const getSpeakerName = async (speakerId) => {
4468
+ const list = await getSpeakerList();
4469
+ const found = list?.find((s) => s.id === speakerId);
4470
+ return found ? `${found.characterName}\uFF08${found.name}\uFF09` : `Speaker ${speakerId}`;
4471
+ };
4472
+ const resolveSpeakerNames = async (speakerIds) => {
4473
+ const uniqueSpeakerIds = [...new Set(speakerIds)];
4474
+ const entries = await Promise.all(uniqueSpeakerIds.map(async (id) => [id, await getSpeakerName(id)]));
4475
+ return new Map(entries);
4476
+ };
4477
+ const getUserDictionaryWords = async () => {
4478
+ const dictionary = await playerVoicevoxApi.getUserDictionary();
4479
+ return Object.entries(dictionary).map(([wordUuid, word]) => ({
4480
+ wordUuid,
4481
+ surface: word.surface,
4482
+ pronunciation: word.pronunciation,
4483
+ accentType: word.accent_type,
4484
+ priority: word.priority
4485
+ }));
4486
+ };
4487
+ const synthesizeWithCache = async ({
4488
+ text,
4489
+ speaker,
4490
+ audioQuery,
4491
+ speedScale,
4492
+ intonationScale,
4493
+ volumeScale,
4494
+ prePhonemeLength,
4495
+ postPhonemeLength,
4496
+ pauseLengthScale,
4497
+ accentPhrases
4498
+ }) => {
4499
+ const speakerName = await getSpeakerName(speaker);
4500
+ let effectiveAudioQuery = audioQuery;
4501
+ if (audioQuery && accentPhrases && accentPhrases.length > 0 && audioQuery.accent_phrases?.length > 0) {
4502
+ try {
4503
+ const updated = await playerVoicevoxApi.updateMoraData(audioQuery.accent_phrases, speaker);
4504
+ effectiveAudioQuery = { ...audioQuery, accent_phrases: updated };
4505
+ } catch (e) {
4506
+ console.warn("[synthesizeWithCache] /mora_data \u518D\u8A08\u7B97\u5931\u6557\u3001\u5143\u306E\u30D4\u30C3\u30C1\u5024\u3092\u4F7F\u7528:", e);
4507
+ }
4508
+ }
4509
+ const cacheKey2 = createAudioCacheKey({
4510
+ text,
4511
+ speaker,
4512
+ audioQuery: effectiveAudioQuery,
4513
+ speedScale,
4514
+ intonationScale,
4515
+ volumeScale,
4516
+ prePhonemeLength,
4517
+ postPhonemeLength,
4518
+ pauseLengthScale,
4519
+ accentPhrases
4520
+ });
4521
+ const cachedBase64 = readCachedAudioBase64(cacheKey2);
4522
+ if (cachedBase64) {
4523
+ let cachedQuery = effectiveAudioQuery;
4524
+ if (!cachedQuery) {
4525
+ const generated = await playerVoicevoxApi.generateQuery(text, speaker);
4526
+ if (accentPhrases) generated.accent_phrases = accentPhrases;
4527
+ generated.speedScale = speedScale;
4528
+ if (intonationScale !== void 0) generated.intonationScale = intonationScale;
4529
+ if (volumeScale !== void 0) generated.volumeScale = volumeScale;
4530
+ if (prePhonemeLength !== void 0) generated.prePhonemeLength = prePhonemeLength;
4531
+ if (postPhonemeLength !== void 0) generated.postPhonemeLength = postPhonemeLength;
4532
+ if (pauseLengthScale !== void 0) generated.pauseLengthScale = pauseLengthScale;
4533
+ cachedQuery = generated;
4534
+ }
4535
+ return {
4536
+ audioBase64: cachedBase64,
4537
+ text,
4538
+ speaker,
4539
+ speakerName,
4540
+ kana: cachedQuery?.kana,
4541
+ audioQuery: cachedQuery,
4542
+ speedScale: cachedQuery?.speedScale ?? speedScale,
4543
+ intonationScale: cachedQuery?.intonationScale ?? intonationScale,
4544
+ volumeScale: cachedQuery?.volumeScale ?? volumeScale,
4545
+ prePhonemeLength: cachedQuery?.prePhonemeLength ?? prePhonemeLength,
4546
+ postPhonemeLength: cachedQuery?.postPhonemeLength ?? postPhonemeLength,
4547
+ pauseLengthScale: cachedQuery?.pauseLengthScale ?? pauseLengthScale,
4548
+ accentPhrases: cachedQuery?.accent_phrases ?? accentPhrases
4549
+ };
4550
+ }
4551
+ const resolvedQuery = effectiveAudioQuery ? { ...effectiveAudioQuery } : await playerVoicevoxApi.generateQuery(text, speaker);
4552
+ if (!effectiveAudioQuery && accentPhrases) resolvedQuery.accent_phrases = accentPhrases;
4553
+ if (!effectiveAudioQuery) {
4554
+ resolvedQuery.speedScale = speedScale;
4555
+ if (intonationScale !== void 0) resolvedQuery.intonationScale = intonationScale;
4556
+ if (volumeScale !== void 0) resolvedQuery.volumeScale = volumeScale;
4557
+ if (prePhonemeLength !== void 0) resolvedQuery.prePhonemeLength = prePhonemeLength;
4558
+ if (postPhonemeLength !== void 0) resolvedQuery.postPhonemeLength = postPhonemeLength;
4559
+ if (pauseLengthScale !== void 0) resolvedQuery.pauseLengthScale = pauseLengthScale;
4560
+ }
4561
+ const audioData = await playerVoicevoxApi.synthesize(resolvedQuery, speaker);
4562
+ const base64Audio = Buffer.from(audioData).toString("base64");
4563
+ await writeCachedAudioBase64(cacheKey2, base64Audio);
4564
+ return {
4565
+ audioBase64: base64Audio,
4566
+ text,
4567
+ speaker,
4568
+ speakerName,
4569
+ kana: resolvedQuery.kana,
4570
+ audioQuery: resolvedQuery,
4571
+ accentPhrases: resolvedQuery.accent_phrases,
4572
+ speedScale: resolvedQuery.speedScale,
4573
+ intonationScale: resolvedQuery.intonationScale,
4574
+ volumeScale: resolvedQuery.volumeScale,
4575
+ prePhonemeLength: resolvedQuery.prePhonemeLength,
4576
+ postPhonemeLength: resolvedQuery.postPhonemeLength,
4577
+ pauseLengthScale: resolvedQuery.pauseLengthScale
4578
+ };
4579
+ };
4580
+ registerAppResource(
4581
+ server2,
4582
+ "VOICEVOX Player",
4583
+ playerResourceUri,
4584
+ {
4585
+ description: "Audio player UI for VOICEVOX TTS",
4586
+ mimeType: RESOURCE_MIME_TYPE
4587
+ },
4588
+ async () => ({
4589
+ contents: [
4590
+ {
4591
+ uri: playerResourceUri,
4592
+ mimeType: RESOURCE_MIME_TYPE,
4593
+ text: playerHtml,
4594
+ _meta: {
4595
+ ui: {
4596
+ csp: {},
4597
+ ...config2.playerDomain ? { domain: config2.playerDomain } : {}
4598
+ }
4599
+ }
4600
+ }
4601
+ ]
4602
+ })
4603
+ );
4604
+ registerAppToolIfEnabled(
4605
+ server2,
4606
+ disabledTools,
4607
+ "open_dictionary_ui",
4608
+ {
4609
+ title: "Open Dictionary UI",
4610
+ description: "Open the user dictionary manager UI for VOICEVOX.",
4611
+ annotations: {
4612
+ readOnlyHint: false,
4613
+ destructiveHint: false,
4614
+ idempotentHint: true,
4615
+ openWorldHint: true
4616
+ },
4617
+ _meta: { ui: { resourceUri: playerResourceUri } }
4618
+ },
4619
+ async () => {
4620
+ try {
4621
+ const words = await getUserDictionaryWords();
4622
+ const notice = "\u8F9E\u66F8\u5909\u66F4\u306F\u65E2\u5B58\u30C8\u30E9\u30C3\u30AF\u306B\u81EA\u52D5\u53CD\u6620\u3055\u308C\u307E\u305B\u3093\u3002Player\u3067\u518D\u751F\u6210\u3059\u308B\u3068\u53CD\u6620\u3055\u308C\u307E\u3059\u3002";
4623
+ return {
4624
+ content: [{ type: "text", text: `Dictionary manager opened. ${words.length} word(s).` }],
4625
+ structuredContent: {
4626
+ mode: "dictionary",
4627
+ dictionaryWords: words,
4628
+ dictionaryNotice: notice
4629
+ },
4630
+ _meta: {
4631
+ mode: "dictionary",
4632
+ dictionaryWords: words,
4633
+ dictionaryNotice: notice
4634
+ }
4635
+ };
4636
+ } catch (error) {
4637
+ return createErrorResponse(error);
4638
+ }
4639
+ }
4640
+ );
4641
+ registerAppToolIfEnabled(
4642
+ server2,
4643
+ disabledTools,
4644
+ "speak_player",
4645
+ {
4646
+ title: "Speak Player",
4647
+ description: 'Create a VOICEVOX player session and display the UI. Returns viewUUID \u2014 save it and pass to resynthesize_player / get_player_state for subsequent operations. Multi-speaker format: "1:Hello\\n2:World". Audio synthesis is performed by the player UI when needed.',
4648
+ inputSchema: {
4649
+ text: z2.string().describe('Text to synthesize. Multi-speaker format: "1:Hello\\n2:World" (speaker ID prefix per line).'),
4650
+ speaker: z2.number().optional().describe("Default speaker ID (optional)"),
4651
+ speedScale: z2.number().optional().describe("Playback speed (optional, default from environment)")
4652
+ },
4653
+ annotations: {
4654
+ readOnlyHint: false,
4655
+ destructiveHint: false,
4656
+ idempotentHint: false,
4657
+ openWorldHint: true
4658
+ },
4659
+ _meta: { ui: { resourceUri: playerResourceUri } }
4660
+ },
4661
+ async ({
4662
+ text,
4663
+ speaker,
4664
+ speedScale
4665
+ }, extra) => {
4666
+ try {
4667
+ if (!text?.trim()) {
4668
+ throw new Error("text is required");
4669
+ }
4670
+ const parsedSegments = parseStringInput(text);
4671
+ if (parsedSegments.length === 0) {
4672
+ throw new Error("Text is empty");
4673
+ }
4674
+ const effectiveSpeaker = getEffectiveSpeaker(speaker, extra.sessionId) ?? config2.defaultSpeaker;
4675
+ const effectiveSpeed = speedScale ?? config2.defaultSpeedScale;
4676
+ const baseSegments = parsedSegments.map((s) => ({
4677
+ text: s.text,
4678
+ speaker: s.speaker ?? effectiveSpeaker,
4679
+ speedScale: effectiveSpeed
4680
+ }));
4681
+ const speakerNameMap = await resolveSpeakerNames(baseSegments.map((s) => s.speaker));
4682
+ const viewUUID = randomUUID2();
4683
+ setSessionState(viewUUID, {
4684
+ segments: baseSegments.map((s) => ({
4685
+ text: s.text,
4686
+ speaker: s.speaker,
4687
+ speakerName: speakerNameMap.get(s.speaker),
4688
+ speedScale: s.speedScale
4689
+ })),
4690
+ updatedAt: Date.now()
4691
+ });
4692
+ const fullText = parsedSegments.map((s) => s.text).join(" ");
4693
+ const textPreview = fullText.slice(0, 60) + (fullText.length > 60 ? "..." : "");
4694
+ const uiSegments = baseSegments.map((s) => ({
4695
+ text: s.text,
4696
+ speaker: s.speaker,
4697
+ speakerName: speakerNameMap.get(s.speaker),
4698
+ speedScale: s.speedScale
4699
+ }));
4700
+ return {
4701
+ content: [
4702
+ {
4703
+ type: "text",
4704
+ text: `Voicevox Player started. viewUUID: ${viewUUID} \u300C${textPreview}\u300D`
4705
+ }
4706
+ ],
4707
+ structuredContent: {
4708
+ viewUUID,
4709
+ autoPlay: config2.autoPlay,
4710
+ segments: uiSegments
4711
+ },
4712
+ _meta: {
4713
+ viewUUID,
4714
+ autoPlay: config2.autoPlay,
4715
+ segments: uiSegments
4716
+ }
4717
+ };
4718
+ } catch (error) {
4719
+ return createErrorResponse(error);
4720
+ }
4721
+ }
4722
+ );
4723
+ registerAppToolIfEnabled(
4724
+ server2,
4725
+ disabledTools,
4726
+ "resynthesize_player",
4727
+ {
4728
+ title: "Resynthesize Player",
4729
+ description: "Update player segments for a new player instance (new viewUUID every call). Typical loop: get_player_state (fetch additional pages if hasMore) -> edit segment parameters -> resynthesize_player -> use returned viewUUID for the next loop. Audio synthesis is performed by the player UI when needed.",
4730
+ inputSchema: {
4731
+ segments: z2.array(
4732
+ z2.object({
4733
+ text: z2.string().describe("Segment text"),
4734
+ speaker: z2.number().optional().describe("Speaker ID"),
4735
+ speedScale: z2.number().optional().describe("Playback speed"),
4736
+ intonationScale: z2.number().optional().describe("Intonation scale (\u6291\u63DA)"),
4737
+ volumeScale: z2.number().optional().describe("Volume scale (\u97F3\u91CF)"),
4738
+ prePhonemeLength: z2.number().optional().describe("Pre-phoneme silence in seconds"),
4739
+ postPhonemeLength: z2.number().optional().describe("Post-phoneme silence in seconds"),
4740
+ pauseLengthScale: z2.number().optional().describe("Pause length scale between phrases (\u9593\u306E\u9577\u3055)"),
4741
+ accentPhrases: z2.array(
4742
+ z2.object({
4743
+ moras: z2.array(
4744
+ z2.object({
4745
+ text: z2.string(),
4746
+ consonant: z2.string().nullable().optional(),
4747
+ consonant_length: z2.number().nullable().optional(),
4748
+ vowel: z2.string(),
4749
+ vowel_length: z2.number(),
4750
+ pitch: z2.number()
4751
+ })
4752
+ ),
4753
+ accent: z2.number().int(),
4754
+ pause_mora: z2.object({
4755
+ text: z2.string(),
4756
+ consonant: z2.string().nullable().optional(),
4757
+ consonant_length: z2.number().nullable().optional(),
4758
+ vowel: z2.string(),
4759
+ vowel_length: z2.number(),
4760
+ pitch: z2.number()
4761
+ }).nullable().optional(),
4762
+ is_interrogative: z2.boolean().nullable().optional()
4763
+ })
4764
+ ).optional().describe("Accent phrases")
4765
+ })
4766
+ ).describe(
4767
+ "Full segment list to update. Start from get_player_state.segments, edit needed fields, and send the complete array."
4768
+ ),
4769
+ autoPlay: z2.boolean().optional().describe("Auto-play when loaded (default: true)")
4770
+ },
4771
+ annotations: {
4772
+ readOnlyHint: false,
4773
+ destructiveHint: false,
4774
+ idempotentHint: false,
4775
+ openWorldHint: true
4776
+ },
4777
+ _meta: { ui: { resourceUri: playerResourceUri } }
4778
+ },
4779
+ async ({
4780
+ segments,
4781
+ autoPlay
4782
+ }, extra) => {
4783
+ try {
4784
+ if (!segments || segments.length === 0) {
4785
+ throw new Error("segments is required");
4786
+ }
4787
+ const effectiveDefaultSpeaker = getEffectiveSpeaker(void 0, extra.sessionId) ?? config2.defaultSpeaker;
4788
+ const effectiveSpeed = config2.defaultSpeedScale;
4789
+ const effectiveAutoPlay = autoPlay ?? config2.autoPlay;
4790
+ const viewUUID = randomUUID2();
4791
+ const normalizedSegments = segments.map((seg) => ({
4792
+ text: seg.text,
4793
+ speaker: seg.speaker ?? effectiveDefaultSpeaker,
4794
+ speedScale: seg.speedScale ?? effectiveSpeed,
4795
+ intonationScale: seg.intonationScale,
4796
+ volumeScale: seg.volumeScale,
4797
+ prePhonemeLength: seg.prePhonemeLength,
4798
+ postPhonemeLength: seg.postPhonemeLength,
4799
+ pauseLengthScale: seg.pauseLengthScale,
4800
+ accentPhrases: seg.accentPhrases
4801
+ }));
4802
+ const speakerNameMap = await resolveSpeakerNames(normalizedSegments.map((seg) => seg.speaker));
4803
+ setSessionState(viewUUID, {
4804
+ segments: normalizedSegments.map((seg) => ({
4805
+ text: seg.text,
4806
+ speaker: seg.speaker,
4807
+ speakerName: speakerNameMap.get(seg.speaker),
4808
+ speedScale: seg.speedScale,
4809
+ intonationScale: seg.intonationScale,
4810
+ volumeScale: seg.volumeScale,
4811
+ prePhonemeLength: seg.prePhonemeLength,
4812
+ postPhonemeLength: seg.postPhonemeLength,
4813
+ pauseLengthScale: seg.pauseLengthScale,
4814
+ accentPhrases: seg.accentPhrases
4815
+ })),
4816
+ updatedAt: Date.now()
4817
+ });
4818
+ const uiSegments = normalizedSegments.map((seg) => ({
4819
+ text: seg.text,
4820
+ speaker: seg.speaker,
4821
+ speakerName: speakerNameMap.get(seg.speaker),
4822
+ speedScale: seg.speedScale,
4823
+ intonationScale: seg.intonationScale,
4824
+ volumeScale: seg.volumeScale,
4825
+ prePhonemeLength: seg.prePhonemeLength,
4826
+ postPhonemeLength: seg.postPhonemeLength,
4827
+ pauseLengthScale: seg.pauseLengthScale,
4828
+ accentPhrases: seg.accentPhrases
4829
+ }));
4830
+ return {
4831
+ content: [
4832
+ {
4833
+ type: "text",
4834
+ text: `Voicevox Player updated. viewUUID: ${viewUUID} (${segments.length} segment(s))`
4835
+ }
4836
+ ],
4837
+ structuredContent: {
4838
+ viewUUID,
4839
+ autoPlay: effectiveAutoPlay,
4840
+ segments: uiSegments
4841
+ },
4842
+ _meta: {
4843
+ viewUUID,
4844
+ autoPlay: effectiveAutoPlay,
4845
+ segments: uiSegments
4846
+ }
4847
+ };
4848
+ } catch (error) {
4849
+ return createErrorResponse(error);
4850
+ }
4851
+ }
4852
+ );
4853
+ registerPlayerUITools(deps, {
4854
+ playerVoicevoxApi,
4855
+ playerResourceUri,
4856
+ synthesizeWithCache,
4857
+ setSessionState,
4858
+ getSessionState: (key) => playerSessionState.get(key),
4859
+ getSpeakerList
4860
+ });
4861
+ registerToolIfEnabled(
4862
+ server2,
4863
+ disabledTools,
4864
+ "get_player_state",
4865
+ {
4866
+ title: "Get VOICEVOX Player State",
4867
+ description: "Returns paged editable player state for AI tuning. Use the latest viewUUID from speak_player/resynthesize_player. If hasMore is true, call again with nextCursor to continue.",
4868
+ inputSchema: {
4869
+ viewUUID: z2.string().optional().describe("Player instance ID from speak_player/resynthesize_player. Always pass the latest viewUUID."),
4870
+ cursor: z2.number().int().min(0).optional().describe("Start index in segments array (default: 0)"),
4871
+ limit: z2.number().int().min(1).max(MAX_STATE_PAGE_LIMIT).optional().describe(
4872
+ `Max segments per page (default: ${DEFAULT_STATE_PAGE_LIMIT}, max: ${MAX_STATE_PAGE_LIMIT}). Server may return fewer segments when needed.`
4873
+ )
4874
+ },
4875
+ annotations: {
4876
+ readOnlyHint: true,
4877
+ destructiveHint: false,
4878
+ idempotentHint: true,
4879
+ openWorldHint: false
4880
+ }
4881
+ },
4882
+ async ({ viewUUID, cursor, limit }, extra) => {
4883
+ try {
4884
+ const state = getSessionState(viewUUID, extra?.sessionId);
4885
+ if (!state) {
4886
+ return {
4887
+ content: [
4888
+ {
4889
+ type: "text",
4890
+ text: JSON.stringify({
4891
+ segments: [],
4892
+ updatedAt: 0,
4893
+ total: 0,
4894
+ cursor: 0,
4895
+ limit: limit ?? DEFAULT_STATE_PAGE_LIMIT,
4896
+ hasMore: false,
4897
+ nextCursor: null,
4898
+ message: "No player state available. Play something first."
4899
+ })
4900
+ }
4901
+ ]
4902
+ };
4903
+ }
4904
+ const total = state.segments.length;
4905
+ const effectiveCursor = Math.min(cursor ?? 0, total);
4906
+ const requestedLimit = limit ?? DEFAULT_STATE_PAGE_LIMIT;
4907
+ const effectiveLimit = Math.min(requestedLimit, MAX_STATE_PAGE_LIMIT);
4908
+ let pageEnd = Math.min(total, effectiveCursor + effectiveLimit);
4909
+ let pageSegments = state.segments.slice(effectiveCursor, pageEnd);
4910
+ const buildPayload = () => {
4911
+ const hasMore = pageEnd < total;
4912
+ return {
4913
+ segments: pageSegments,
4914
+ updatedAt: state.updatedAt,
4915
+ total,
4916
+ cursor: effectiveCursor,
4917
+ limit: effectiveLimit,
4918
+ hasMore,
4919
+ nextCursor: hasMore ? pageEnd : null
4920
+ };
4921
+ };
4922
+ let payload = buildPayload();
4923
+ let payloadText = JSON.stringify(payload);
4924
+ while (Buffer.byteLength(payloadText, "utf8") > MAX_TOOL_CONTENT_BYTES && pageSegments.length > 0) {
4925
+ pageEnd -= 1;
4926
+ pageSegments = state.segments.slice(effectiveCursor, pageEnd);
4927
+ payload = buildPayload();
4928
+ payloadText = JSON.stringify(payload);
4929
+ }
4930
+ if (Buffer.byteLength(payloadText, "utf8") > MAX_TOOL_CONTENT_BYTES) {
4931
+ return {
4932
+ content: [
4933
+ {
4934
+ type: "text",
4935
+ text: JSON.stringify({
4936
+ segments: [],
4937
+ updatedAt: state.updatedAt,
4938
+ total,
4939
+ cursor: effectiveCursor,
4940
+ limit: effectiveLimit,
4941
+ hasMore: effectiveCursor < total,
4942
+ nextCursor: effectiveCursor < total ? effectiveCursor : null,
4943
+ message: "Player state is too large for this request. Request a later cursor or reduce source text size."
4944
+ })
4945
+ }
4946
+ ]
4947
+ };
4948
+ }
4949
+ if (pageSegments.length === 0 && effectiveCursor < total) {
4950
+ return {
4951
+ content: [
4952
+ {
4953
+ type: "text",
4954
+ text: JSON.stringify({
4955
+ segments: [],
4956
+ updatedAt: state.updatedAt,
4957
+ total,
4958
+ cursor: effectiveCursor,
4959
+ limit: effectiveLimit,
4960
+ hasMore: true,
4961
+ nextCursor: effectiveCursor,
4962
+ message: "Current segment is too large to include. Advance cursor or reduce segment text size."
4963
+ })
4964
+ }
4965
+ ]
4966
+ };
4967
+ }
4968
+ return {
4969
+ content: [{ type: "text", text: payloadText }]
4970
+ };
4971
+ } catch (error) {
4972
+ return createErrorResponse(error);
4973
+ }
4974
+ }
4975
+ );
4976
+ }
4977
+
4978
+ // src/tools/speak.ts
4979
+ import * as z3 from "zod/v4";
4980
+ function buildSpeakInputSchema(restrictions) {
4981
+ const schema = {
4982
+ text: z3.string().describe(
4983
+ 'Text split by line breaks (\\n). IMPORTANT: Each line = one speech unit (processed and played separately). Keep the FIRST LINE SHORT for quick playback start - audio begins as soon as the first line is synthesized. Example: "Hi!\\nThis is a longer explanation that follows." Optional speaker prefix per line: "1:Hello\\n2:World"'
4984
+ ),
4985
+ query: z3.string().optional().describe("Voice synthesis query"),
4986
+ speaker: z3.number().optional().describe("Default speaker ID (optional)"),
4987
+ speedScale: z3.number().optional().describe("Playback speed (optional, default from environment)")
4988
+ };
4989
+ if (!restrictions.immediate) {
4990
+ schema.immediate = z3.boolean().optional().describe(
4991
+ "If true, stops current playback and plays new audio immediately. If false, waits for current playback to finish. Default depends on environment variable."
4992
+ );
4993
+ }
4994
+ if (!restrictions.waitForStart) {
4995
+ schema.waitForStart = z3.boolean().optional().describe("Wait for playback to start (optional, default: false)");
4996
+ }
4997
+ if (!restrictions.waitForEnd) {
4998
+ schema.waitForEnd = z3.boolean().optional().describe("Wait for playback to end (optional, default: false)");
4999
+ }
5000
+ return schema;
5001
+ }
5002
+ function registerSpeakTool(deps) {
5003
+ const { server: server2, voicevoxClient, config: config2, disabledTools, restrictions } = deps;
5004
+ registerToolIfEnabled(
5005
+ server2,
5006
+ disabledTools,
5007
+ "speak",
5008
+ {
5009
+ title: "Speak",
5010
+ description: "Convert text to speech and play it. Text is split by line breaks (\\n) into separate speech units. Each line is processed as an independent audio segment.",
5011
+ inputSchema: buildSpeakInputSchema(restrictions),
5012
+ annotations: {
5013
+ readOnlyHint: false,
5014
+ destructiveHint: false,
5015
+ idempotentHint: false,
5016
+ openWorldHint: true
5017
+ }
5018
+ },
5019
+ async ({
5020
+ text,
5021
+ speaker,
5022
+ query,
3546
5023
  speedScale,
3547
5024
  immediate,
3548
5025
  waitForStart,
@@ -3580,7 +5057,7 @@ function registerSpeakerTools(deps) {
3580
5057
  registerToolIfEnabled(
3581
5058
  server2,
3582
5059
  disabledTools,
3583
- "ping_voicevox",
5060
+ "ping",
3584
5061
  {
3585
5062
  title: "Ping VOICEVOX",
3586
5063
  description: "Check if VOICEVOX Engine is running and reachable",
@@ -3622,7 +5099,7 @@ function registerSpeakerTools(deps) {
3622
5099
  async () => {
3623
5100
  try {
3624
5101
  await voicevoxClient.clearQueue();
3625
- return createSuccessResponse("\u30B9\u30D4\u30FC\u30AB\u30FC\u3092\u505C\u6B62\u3057\u307E\u3057\u305F");
5102
+ return createSuccessResponse("Speaker stopped successfully");
3626
5103
  } catch (error) {
3627
5104
  return createErrorResponse(error);
3628
5105
  }
@@ -3661,7 +5138,7 @@ function registerSpeakerTools(deps) {
3661
5138
  }
3662
5139
 
3663
5140
  // src/tools/synthesize.ts
3664
- import * as z3 from "zod/v4";
5141
+ import * as z4 from "zod/v4";
3665
5142
  function registerSynthesizeTool(deps) {
3666
5143
  const { server: server2, voicevoxClient, disabledTools } = deps;
3667
5144
  registerToolIfEnabled(
@@ -3678,11 +5155,11 @@ function registerSynthesizeTool(deps) {
3678
5155
  openWorldHint: true
3679
5156
  },
3680
5157
  inputSchema: {
3681
- text: z3.string().optional().describe("Text for voice synthesis (if both query and text provided, query takes precedence)"),
3682
- query: z3.string().optional().describe("Voice synthesis query"),
3683
- output: z3.string().describe("Output path for the audio file"),
3684
- speaker: z3.number().optional().describe("Default speaker ID (optional)"),
3685
- speedScale: z3.number().optional().describe("Playback speed (optional, default from environment)")
5158
+ text: z4.string().optional().describe("Text for voice synthesis (if both query and text provided, query takes precedence)"),
5159
+ query: z4.string().optional().describe("Voice synthesis query"),
5160
+ output: z4.string().describe("Output path for the audio file"),
5161
+ speaker: z4.number().optional().describe("Default speaker ID (optional)"),
5162
+ speedScale: z4.number().optional().describe("Playback speed (optional, default from environment)")
3686
5163
  }
3687
5164
  },
3688
5165
  async ({
@@ -3703,7 +5180,7 @@ function registerSynthesizeTool(deps) {
3703
5180
  const filePath = await voicevoxClient.generateAudioFile(text, output, effectiveSpeaker, speedScale);
3704
5181
  return createSuccessResponse(filePath);
3705
5182
  }
3706
- throw new Error("query\u30D1\u30E9\u30E1\u30FC\u30BF\u3068text\u30D1\u30E9\u30E1\u30FC\u30BF\u306E\u3069\u3061\u3089\u304B\u3092\u6307\u5B9A\u3057\u3066\u304F\u3060\u3055\u3044");
5183
+ throw new Error('Either "query" or "text" parameter must be specified');
3707
5184
  } catch (error) {
3708
5185
  return createErrorResponse(error);
3709
5186
  }
@@ -3715,8 +5192,8 @@ function registerSynthesizeTool(deps) {
3715
5192
  var config = getConfig();
3716
5193
  function createServer() {
3717
5194
  const server2 = new McpServer({
3718
- name: "MCP TTS Voicevox",
3719
- version: "0.6.1",
5195
+ name: "mcp-tts-voicevox",
5196
+ version: "0.7.1",
3720
5197
  description: "A Voicevox server that converts text to speech for playback and saving."
3721
5198
  });
3722
5199
  const voicevoxClient = new VoicevoxClient({
@@ -3745,7 +5222,7 @@ function createServer() {
3745
5222
  var server = createServer();
3746
5223
 
3747
5224
  // src/index.ts
3748
- var __dirname2 = dirname2(fileURLToPath2(import.meta.url));
5225
+ var __dirname2 = dirname3(fileURLToPath2(import.meta.url));
3749
5226
  function isCLI() {
3750
5227
  if (!isNodejs() || !process.argv) return false;
3751
5228
  const isNpmStart = process.env?.npm_lifecycle_event === "start";
@@ -3799,12 +5276,22 @@ Options:
3799
5276
 
3800
5277
  Tool Options:
3801
5278
  --disable-tools <tools> Comma-separated list of tools to disable
3802
- (Allowed: speak, speak_player, ping_voicevox,
3803
- synthesize_file, stop_speaker, get_speakers)
5279
+ (e.g.: speak, speak_player, ping, synthesize_file,
5280
+ stop_speaker, get_speakers)
5281
+ The "voicevox_" prefix is added automatically.
3804
5282
 
3805
5283
  UI Player Options:
3806
5284
  --auto-play Auto-play audio in UI player (default)
3807
5285
  --no-auto-play Require manual play in UI player
5286
+ --player-export Enable track export(download) in UI player (default)
5287
+ --no-player-export Disable track export(download) in UI player
5288
+ --player-export-dir <dir> Default output directory for exported tracks
5289
+ --player-cache-dir <dir> Player cache directory
5290
+ --player-state-file <path> Persisted player state file path
5291
+ --player-audio-cache Enable disk audio cache for player (default)
5292
+ --no-player-audio-cache Disable disk audio cache for player
5293
+ --player-audio-cache-ttl-days <days> Audio cache retention days (0 disables, -1 unlimited)
5294
+ --player-audio-cache-max-mb <mb> Audio cache size cap in MB (0 disables, -1 unlimited)
3808
5295
 
3809
5296
  Server Options:
3810
5297
  --http Enable HTTP server mode (remote MCP)
@@ -3812,6 +5299,7 @@ Options:
3812
5299
  --host <host> HTTP server host (default: 0.0.0.0)
3813
5300
  --allowed-hosts <hosts> Comma-separated list of allowed hosts (default: localhost,127.0.0.1,[::1])
3814
5301
  --allowed-origins <origins> Comma-separated list of allowed origins
5302
+ --api-key <key> Require matching API key via X-API-Key or Authorization: Bearer
3815
5303
 
3816
5304
  Examples:
3817
5305
  npx @kajidog/mcp-tts-voicevox --url http://192.168.1.50:50021 --speaker 3
@@ -3828,7 +5316,7 @@ async function startMCPServer() {
3828
5316
  process.exit(0);
3829
5317
  }
3830
5318
  if (process.argv.includes("--version") || process.argv.includes("-v")) {
3831
- const pkg = JSON.parse(readFileSync2(join2(__dirname2, "../package.json"), "utf-8"));
5319
+ const pkg = JSON.parse(readFileSync2(join4(__dirname2, "../package.json"), "utf-8"));
3832
5320
  console.log(`@kajidog/mcp-tts-voicevox v${pkg.version}`);
3833
5321
  process.exit(0);
3834
5322
  }