@kajidog/mcp-tts-voicevox 0.6.1 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +1706 -218
- package/dist/index.js.map +1 -1
- package/dist/mcp-app.html +56 -56
- package/dist/stdio.js +1674 -223
- package/dist/stdio.js.map +1 -1
- package/package.json +6 -5
- package/README.md +0 -479
package/dist/index.js
CHANGED
|
@@ -429,7 +429,7 @@ var init_dist = __esm({
|
|
|
429
429
|
});
|
|
430
430
|
if (!chunk) {
|
|
431
431
|
if (i === 1) {
|
|
432
|
-
await new Promise((
|
|
432
|
+
await new Promise((resolve2) => setTimeout(resolve2));
|
|
433
433
|
maxReadCount = 3;
|
|
434
434
|
continue;
|
|
435
435
|
}
|
|
@@ -576,7 +576,7 @@ var init_dist = __esm({
|
|
|
576
576
|
|
|
577
577
|
// src/index.ts
|
|
578
578
|
import { readFileSync as readFileSync2 } from "fs";
|
|
579
|
-
import { dirname as
|
|
579
|
+
import { dirname as dirname3, join as join4 } from "path";
|
|
580
580
|
import { fileURLToPath as fileURLToPath2 } from "url";
|
|
581
581
|
|
|
582
582
|
// ../../packages/mcp-core/dist/config.js
|
|
@@ -620,6 +620,12 @@ function parseBaseCliArgs(argv = process.argv.slice(2)) {
|
|
|
620
620
|
i++;
|
|
621
621
|
}
|
|
622
622
|
break;
|
|
623
|
+
case "--api-key":
|
|
624
|
+
if (nextArg && !nextArg.startsWith("-")) {
|
|
625
|
+
config2.apiKey = nextArg;
|
|
626
|
+
i++;
|
|
627
|
+
}
|
|
628
|
+
break;
|
|
623
629
|
}
|
|
624
630
|
}
|
|
625
631
|
return config2;
|
|
@@ -641,6 +647,9 @@ function parseBaseEnvVars(env = process.env) {
|
|
|
641
647
|
if (env.MCP_ALLOWED_ORIGINS) {
|
|
642
648
|
config2.allowedOrigins = env.MCP_ALLOWED_ORIGINS.split(",").map((o) => o.trim());
|
|
643
649
|
}
|
|
650
|
+
if (env.MCP_API_KEY) {
|
|
651
|
+
config2.apiKey = env.MCP_API_KEY;
|
|
652
|
+
}
|
|
644
653
|
return config2;
|
|
645
654
|
}
|
|
646
655
|
function filterUndefined(obj) {
|
|
@@ -1648,9 +1657,9 @@ var Context = class {
|
|
|
1648
1657
|
* })
|
|
1649
1658
|
* ```
|
|
1650
1659
|
*/
|
|
1651
|
-
json = (
|
|
1660
|
+
json = (object3, arg, headers) => {
|
|
1652
1661
|
return this.#newResponse(
|
|
1653
|
-
JSON.stringify(
|
|
1662
|
+
JSON.stringify(object3),
|
|
1654
1663
|
arg,
|
|
1655
1664
|
setDefaultContentType("application/json", headers)
|
|
1656
1665
|
);
|
|
@@ -2805,6 +2814,13 @@ function forbiddenError(message) {
|
|
|
2805
2814
|
id: null
|
|
2806
2815
|
};
|
|
2807
2816
|
}
|
|
2817
|
+
function unauthorizedError(message) {
|
|
2818
|
+
return {
|
|
2819
|
+
jsonrpc: "2.0",
|
|
2820
|
+
error: { code: -32001, message },
|
|
2821
|
+
id: null
|
|
2822
|
+
};
|
|
2823
|
+
}
|
|
2808
2824
|
function validateOrigin(config2) {
|
|
2809
2825
|
return async (c, next) => {
|
|
2810
2826
|
const origin = c.req.header("Origin");
|
|
@@ -2847,6 +2863,22 @@ function validateHost(config2) {
|
|
|
2847
2863
|
return next();
|
|
2848
2864
|
};
|
|
2849
2865
|
}
|
|
2866
|
+
function validateApiKey(config2) {
|
|
2867
|
+
return async (c, next) => {
|
|
2868
|
+
if (!config2.apiKey || c.req.method === "OPTIONS") {
|
|
2869
|
+
return next();
|
|
2870
|
+
}
|
|
2871
|
+
const xApiKey = c.req.header("X-API-Key");
|
|
2872
|
+
const authorization = c.req.header("Authorization");
|
|
2873
|
+
const bearerToken = authorization?.startsWith("Bearer ") ? authorization.slice(7).trim() : void 0;
|
|
2874
|
+
const providedKey = xApiKey ?? bearerToken;
|
|
2875
|
+
if (providedKey !== config2.apiKey) {
|
|
2876
|
+
console.log("Rejected request with invalid API key");
|
|
2877
|
+
return c.json(unauthorizedError("Unauthorized: Invalid API key"), { status: 401 });
|
|
2878
|
+
}
|
|
2879
|
+
return next();
|
|
2880
|
+
};
|
|
2881
|
+
}
|
|
2850
2882
|
function createHttpApp(options) {
|
|
2851
2883
|
const { server: server2, config: config2, serverFactory, extraCorsHeaders = [], onSessionInitialized, onSessionClosed } = options;
|
|
2852
2884
|
const transports = /* @__PURE__ */ new Map();
|
|
@@ -2912,6 +2944,8 @@ function createHttpApp(options) {
|
|
|
2912
2944
|
"mcp-session-id",
|
|
2913
2945
|
"Last-Event-ID",
|
|
2914
2946
|
"mcp-protocol-version",
|
|
2947
|
+
"X-API-Key",
|
|
2948
|
+
"Authorization",
|
|
2915
2949
|
...extraCorsHeaders
|
|
2916
2950
|
];
|
|
2917
2951
|
app.use("/mcp", cors({
|
|
@@ -2922,6 +2956,7 @@ function createHttpApp(options) {
|
|
|
2922
2956
|
}));
|
|
2923
2957
|
app.use("/mcp", validateOrigin(config2));
|
|
2924
2958
|
app.use("/mcp", validateHost(config2));
|
|
2959
|
+
app.use("/mcp", validateApiKey(config2));
|
|
2925
2960
|
app.all("/mcp", handleMCP);
|
|
2926
2961
|
app.get("/health", handleHealth);
|
|
2927
2962
|
return app;
|
|
@@ -2969,7 +3004,7 @@ async function startHttpServer(options) {
|
|
|
2969
3004
|
console.error(`Health check: http://${info.address}:${info.port}/health`);
|
|
2970
3005
|
});
|
|
2971
3006
|
}
|
|
2972
|
-
await new Promise((
|
|
3007
|
+
await new Promise((resolve2) => setTimeout(resolve2, 1e3));
|
|
2973
3008
|
console.error("HTTP server startup completed");
|
|
2974
3009
|
} catch (error) {
|
|
2975
3010
|
console.error("HTTP server startup failed:", error);
|
|
@@ -3016,6 +3051,7 @@ async function launchServer(options) {
|
|
|
3016
3051
|
}
|
|
3017
3052
|
|
|
3018
3053
|
// src/config.ts
|
|
3054
|
+
import { join } from "path";
|
|
3019
3055
|
var defaultConfig = {
|
|
3020
3056
|
...defaultBaseConfig,
|
|
3021
3057
|
voicevoxUrl: "http://localhost:50021",
|
|
@@ -3028,7 +3064,15 @@ var defaultConfig = {
|
|
|
3028
3064
|
restrictImmediate: false,
|
|
3029
3065
|
restrictWaitForStart: false,
|
|
3030
3066
|
restrictWaitForEnd: false,
|
|
3067
|
+
playerDomain: "",
|
|
3031
3068
|
autoPlay: true,
|
|
3069
|
+
playerExportEnabled: true,
|
|
3070
|
+
playerExportDir: join(process.cwd(), "voicevox-player-exports"),
|
|
3071
|
+
playerCacheDir: join(process.cwd(), ".voicevox-player-cache"),
|
|
3072
|
+
playerStateFile: join(process.cwd(), ".voicevox-player-cache", "player-state.json"),
|
|
3073
|
+
playerAudioCacheEnabled: true,
|
|
3074
|
+
playerAudioCacheTtlDays: 30,
|
|
3075
|
+
playerAudioCacheMaxMb: 512,
|
|
3032
3076
|
disabledTools: []
|
|
3033
3077
|
};
|
|
3034
3078
|
function parseCliArgs(argv = process.argv.slice(2)) {
|
|
@@ -3095,6 +3139,48 @@ function parseCliArgs(argv = process.argv.slice(2)) {
|
|
|
3095
3139
|
case "--no-auto-play":
|
|
3096
3140
|
config2.autoPlay = false;
|
|
3097
3141
|
break;
|
|
3142
|
+
case "--player-export":
|
|
3143
|
+
config2.playerExportEnabled = true;
|
|
3144
|
+
break;
|
|
3145
|
+
case "--no-player-export":
|
|
3146
|
+
config2.playerExportEnabled = false;
|
|
3147
|
+
break;
|
|
3148
|
+
case "--player-export-dir":
|
|
3149
|
+
if (nextArg && !nextArg.startsWith("-")) {
|
|
3150
|
+
config2.playerExportDir = nextArg;
|
|
3151
|
+
i++;
|
|
3152
|
+
}
|
|
3153
|
+
break;
|
|
3154
|
+
case "--player-cache-dir":
|
|
3155
|
+
if (nextArg && !nextArg.startsWith("-")) {
|
|
3156
|
+
config2.playerCacheDir = nextArg;
|
|
3157
|
+
i++;
|
|
3158
|
+
}
|
|
3159
|
+
break;
|
|
3160
|
+
case "--player-state-file":
|
|
3161
|
+
if (nextArg && !nextArg.startsWith("-")) {
|
|
3162
|
+
config2.playerStateFile = nextArg;
|
|
3163
|
+
i++;
|
|
3164
|
+
}
|
|
3165
|
+
break;
|
|
3166
|
+
case "--player-audio-cache":
|
|
3167
|
+
config2.playerAudioCacheEnabled = true;
|
|
3168
|
+
break;
|
|
3169
|
+
case "--no-player-audio-cache":
|
|
3170
|
+
config2.playerAudioCacheEnabled = false;
|
|
3171
|
+
break;
|
|
3172
|
+
case "--player-audio-cache-ttl-days":
|
|
3173
|
+
if (nextArg && !nextArg.startsWith("-")) {
|
|
3174
|
+
config2.playerAudioCacheTtlDays = Number(nextArg);
|
|
3175
|
+
i++;
|
|
3176
|
+
}
|
|
3177
|
+
break;
|
|
3178
|
+
case "--player-audio-cache-max-mb":
|
|
3179
|
+
if (nextArg && !nextArg.startsWith("-")) {
|
|
3180
|
+
config2.playerAudioCacheMaxMb = Number(nextArg);
|
|
3181
|
+
i++;
|
|
3182
|
+
}
|
|
3183
|
+
break;
|
|
3098
3184
|
case "--disable-tools":
|
|
3099
3185
|
if (nextArg && !nextArg.startsWith("-")) {
|
|
3100
3186
|
config2.disabledTools = nextArg.split(",").map((t) => t.trim());
|
|
@@ -3138,9 +3224,35 @@ function parseEnvVars(env = process.env) {
|
|
|
3138
3224
|
if (env.VOICEVOX_RESTRICT_WAIT_FOR_END === "true") {
|
|
3139
3225
|
config2.restrictWaitForEnd = true;
|
|
3140
3226
|
}
|
|
3227
|
+
if (env.VOICEVOX_PLAYER_DOMAIN) {
|
|
3228
|
+
config2.playerDomain = env.VOICEVOX_PLAYER_DOMAIN;
|
|
3229
|
+
}
|
|
3141
3230
|
if (env.VOICEVOX_AUTO_PLAY !== void 0) {
|
|
3142
3231
|
config2.autoPlay = env.VOICEVOX_AUTO_PLAY !== "false";
|
|
3143
3232
|
}
|
|
3233
|
+
if (env.VOICEVOX_PLAYER_EXPORT_ENABLED !== void 0) {
|
|
3234
|
+
config2.playerExportEnabled = env.VOICEVOX_PLAYER_EXPORT_ENABLED !== "false";
|
|
3235
|
+
}
|
|
3236
|
+
if (env.VOICEVOX_PLAYER_EXPORT_DIR) {
|
|
3237
|
+
config2.playerExportDir = env.VOICEVOX_PLAYER_EXPORT_DIR;
|
|
3238
|
+
}
|
|
3239
|
+
if (env.VOICEVOX_PLAYER_CACHE_DIR) {
|
|
3240
|
+
config2.playerCacheDir = env.VOICEVOX_PLAYER_CACHE_DIR;
|
|
3241
|
+
}
|
|
3242
|
+
if (env.VOICEVOX_PLAYER_STATE_FILE) {
|
|
3243
|
+
config2.playerStateFile = env.VOICEVOX_PLAYER_STATE_FILE;
|
|
3244
|
+
}
|
|
3245
|
+
if (env.VOICEVOX_PLAYER_AUDIO_CACHE_ENABLED !== void 0) {
|
|
3246
|
+
config2.playerAudioCacheEnabled = env.VOICEVOX_PLAYER_AUDIO_CACHE_ENABLED !== "false";
|
|
3247
|
+
}
|
|
3248
|
+
if (env.VOICEVOX_PLAYER_AUDIO_CACHE_TTL_DAYS !== void 0) {
|
|
3249
|
+
const ttlDays = Number(env.VOICEVOX_PLAYER_AUDIO_CACHE_TTL_DAYS);
|
|
3250
|
+
if (Number.isFinite(ttlDays)) config2.playerAudioCacheTtlDays = ttlDays;
|
|
3251
|
+
}
|
|
3252
|
+
if (env.VOICEVOX_PLAYER_AUDIO_CACHE_MAX_MB !== void 0) {
|
|
3253
|
+
const maxMb = Number(env.VOICEVOX_PLAYER_AUDIO_CACHE_MAX_MB);
|
|
3254
|
+
if (Number.isFinite(maxMb)) config2.playerAudioCacheMaxMb = maxMb;
|
|
3255
|
+
}
|
|
3144
3256
|
if (env.VOICEVOX_DISABLED_TOOLS) {
|
|
3145
3257
|
config2.disabledTools = env.VOICEVOX_DISABLED_TOOLS.split(",").map((t) => t.trim());
|
|
3146
3258
|
}
|
|
@@ -3149,11 +3261,16 @@ function parseEnvVars(env = process.env) {
|
|
|
3149
3261
|
function getConfig(argv, env) {
|
|
3150
3262
|
const cliConfig = parseCliArgs(argv);
|
|
3151
3263
|
const envConfig = parseEnvVars(env);
|
|
3152
|
-
|
|
3264
|
+
const merged = {
|
|
3153
3265
|
...defaultConfig,
|
|
3154
3266
|
...filterUndefined(envConfig),
|
|
3155
3267
|
...filterUndefined(cliConfig)
|
|
3156
3268
|
};
|
|
3269
|
+
const isPlayerStateFileExplicit = envConfig.playerStateFile !== void 0 || cliConfig.playerStateFile !== void 0;
|
|
3270
|
+
if (!isPlayerStateFileExplicit) {
|
|
3271
|
+
merged.playerStateFile = join(merged.playerCacheDir, "player-state.json");
|
|
3272
|
+
}
|
|
3273
|
+
return merged;
|
|
3157
3274
|
}
|
|
3158
3275
|
|
|
3159
3276
|
// src/server.ts
|
|
@@ -3161,29 +3278,81 @@ import { VoicevoxClient } from "@kajidog/voicevox-client";
|
|
|
3161
3278
|
import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
|
|
3162
3279
|
|
|
3163
3280
|
// src/tools/player.ts
|
|
3164
|
-
import { randomUUID as randomUUID2 } from "crypto";
|
|
3165
|
-
import { readFileSync } from "fs";
|
|
3166
|
-
import {
|
|
3281
|
+
import { createHash, randomUUID as randomUUID2 } from "crypto";
|
|
3282
|
+
import { mkdirSync as mkdirSync2, readFileSync } from "fs";
|
|
3283
|
+
import { readdir, rename, stat, unlink, writeFile as writeFile2 } from "fs/promises";
|
|
3284
|
+
import { basename, dirname as dirname2, join as join3 } from "path";
|
|
3167
3285
|
import { fileURLToPath } from "url";
|
|
3168
3286
|
import { VoicevoxApi } from "@kajidog/voicevox-client";
|
|
3169
3287
|
import { RESOURCE_MIME_TYPE, registerAppResource } from "@modelcontextprotocol/ext-apps/server";
|
|
3288
|
+
import * as z2 from "zod/v4";
|
|
3289
|
+
|
|
3290
|
+
// src/tools/player-cache-utils.ts
|
|
3291
|
+
function resolveAudioCachePolicy(input) {
|
|
3292
|
+
const isDiskCacheEnabled = input.enabledFlag && input.ttlDays !== 0 && input.maxMb !== 0;
|
|
3293
|
+
const ttlMs = input.ttlDays < 0 ? null : input.ttlDays * 24 * 60 * 60 * 1e3;
|
|
3294
|
+
const maxBytes = input.maxMb < 0 ? null : input.maxMb * 1024 * 1024;
|
|
3295
|
+
return { isDiskCacheEnabled, ttlMs, maxBytes };
|
|
3296
|
+
}
|
|
3297
|
+
function planAudioCacheCleanup(input) {
|
|
3298
|
+
const toDelete = /* @__PURE__ */ new Set();
|
|
3299
|
+
if (input.ttlMs !== null) {
|
|
3300
|
+
for (const entry of input.entries) {
|
|
3301
|
+
if (input.now - entry.mtimeMs > input.ttlMs) {
|
|
3302
|
+
toDelete.add(entry.path);
|
|
3303
|
+
}
|
|
3304
|
+
}
|
|
3305
|
+
}
|
|
3306
|
+
if (input.maxBytes !== null) {
|
|
3307
|
+
const kept = input.entries.filter((entry) => !toDelete.has(entry.path));
|
|
3308
|
+
let totalBytes = kept.reduce((sum, entry) => sum + entry.size, 0);
|
|
3309
|
+
if (totalBytes > input.maxBytes) {
|
|
3310
|
+
const byOldestFirst = [...kept].sort((a, b) => a.mtimeMs - b.mtimeMs);
|
|
3311
|
+
for (const entry of byOldestFirst) {
|
|
3312
|
+
if (totalBytes <= input.maxBytes) break;
|
|
3313
|
+
toDelete.add(entry.path);
|
|
3314
|
+
totalBytes -= entry.size;
|
|
3315
|
+
}
|
|
3316
|
+
}
|
|
3317
|
+
}
|
|
3318
|
+
return toDelete;
|
|
3319
|
+
}
|
|
3320
|
+
|
|
3321
|
+
// src/tools/player-ui-tools.ts
|
|
3322
|
+
import { spawn, spawnSync } from "child_process";
|
|
3323
|
+
import { constants, accessSync } from "fs";
|
|
3324
|
+
import { mkdir, writeFile } from "fs/promises";
|
|
3325
|
+
import { dirname, join as join2, resolve } from "path";
|
|
3170
3326
|
import * as z from "zod/v4";
|
|
3171
3327
|
|
|
3172
3328
|
// src/tools/registration.ts
|
|
3173
3329
|
import { registerAppTool } from "@modelcontextprotocol/ext-apps/server";
|
|
3330
|
+
var TOOL_PREFIX = "voicevox_";
|
|
3331
|
+
function addToolPrefix(name) {
|
|
3332
|
+
if (name.startsWith("_")) {
|
|
3333
|
+
return name;
|
|
3334
|
+
}
|
|
3335
|
+
return `${TOOL_PREFIX}${name}`;
|
|
3336
|
+
}
|
|
3337
|
+
function isToolDisabled(disabledTools, name) {
|
|
3338
|
+
const fullName = addToolPrefix(name);
|
|
3339
|
+
return disabledTools.has(name) || disabledTools.has(fullName);
|
|
3340
|
+
}
|
|
3174
3341
|
function registerToolIfEnabled(server2, disabledTools, name, definition, handler) {
|
|
3175
|
-
|
|
3176
|
-
|
|
3342
|
+
const fullName = addToolPrefix(name);
|
|
3343
|
+
if (isToolDisabled(disabledTools, name)) {
|
|
3344
|
+
console.error(`Tool "${fullName}" is disabled via configuration`);
|
|
3177
3345
|
return;
|
|
3178
3346
|
}
|
|
3179
|
-
server2.registerTool(
|
|
3347
|
+
server2.registerTool(fullName, definition, handler);
|
|
3180
3348
|
}
|
|
3181
3349
|
function registerAppToolIfEnabled(server2, disabledTools, name, definition, handler) {
|
|
3182
|
-
|
|
3183
|
-
|
|
3350
|
+
const fullName = addToolPrefix(name);
|
|
3351
|
+
if (isToolDisabled(disabledTools, name)) {
|
|
3352
|
+
console.error(`Tool "${fullName}" is disabled via configuration`);
|
|
3184
3353
|
return;
|
|
3185
3354
|
}
|
|
3186
|
-
registerAppTool(server2,
|
|
3355
|
+
registerAppTool(server2, fullName, definition, handler);
|
|
3187
3356
|
}
|
|
3188
3357
|
|
|
3189
3358
|
// src/tools/utils.ts
|
|
@@ -3191,7 +3360,7 @@ var createErrorResponse = (error) => ({
|
|
|
3191
3360
|
content: [
|
|
3192
3361
|
{
|
|
3193
3362
|
type: "text",
|
|
3194
|
-
text:
|
|
3363
|
+
text: `Error: ${error instanceof Error ? error.message : String(error)}`
|
|
3195
3364
|
}
|
|
3196
3365
|
],
|
|
3197
3366
|
isError: true
|
|
@@ -3239,115 +3408,157 @@ var processTextInput = async (voicevoxClient, text, speaker, speedScale, playbac
|
|
|
3239
3408
|
});
|
|
3240
3409
|
};
|
|
3241
3410
|
|
|
3242
|
-
// src/tools/player.ts
|
|
3243
|
-
var
|
|
3244
|
-
|
|
3245
|
-
|
|
3246
|
-
|
|
3247
|
-
|
|
3248
|
-
|
|
3411
|
+
// src/tools/player-ui-tools.ts
|
|
3412
|
+
var commandExistsCache = /* @__PURE__ */ new Map();
|
|
3413
|
+
function commandExists(command) {
|
|
3414
|
+
if (commandExistsCache.has(command)) return commandExistsCache.get(command);
|
|
3415
|
+
if (process.platform === "win32" && command === "explorer") {
|
|
3416
|
+
commandExistsCache.set(command, true);
|
|
3417
|
+
return true;
|
|
3418
|
+
}
|
|
3419
|
+
const checkCmd = process.platform === "win32" ? "where" : "which";
|
|
3420
|
+
const result = spawnSync(checkCmd, [command], { stdio: "ignore" });
|
|
3421
|
+
const exists = result.status === 0;
|
|
3422
|
+
commandExistsCache.set(command, exists);
|
|
3423
|
+
return exists;
|
|
3424
|
+
}
|
|
3425
|
+
function canOpenExplorer() {
|
|
3426
|
+
if (process.platform === "win32") return commandExists("explorer");
|
|
3427
|
+
if (process.platform === "darwin") return commandExists("open");
|
|
3428
|
+
if (process.platform === "linux") {
|
|
3429
|
+
const hasDisplay = Boolean(process.env.DISPLAY || process.env.WAYLAND_DISPLAY);
|
|
3430
|
+
return hasDisplay && commandExists("xdg-open");
|
|
3431
|
+
}
|
|
3432
|
+
return false;
|
|
3433
|
+
}
|
|
3434
|
+
function canChooseDirectoryDialog() {
|
|
3435
|
+
return process.platform === "win32" || process.platform === "darwin";
|
|
3436
|
+
}
|
|
3437
|
+
function sanitizeFilePart(input, fallback) {
|
|
3438
|
+
const value = input.trim().replace(/[<>:"/\\|?*\x00-\x1f]/g, "_").replace(/\s+/g, "_").slice(0, 40);
|
|
3439
|
+
return value.length > 0 ? value : fallback;
|
|
3440
|
+
}
|
|
3441
|
+
function openDirectoryInExplorer(directoryPath) {
|
|
3249
3442
|
try {
|
|
3250
|
-
const
|
|
3251
|
-
|
|
3443
|
+
const child = process.platform === "win32" ? spawn("explorer", [directoryPath], { detached: true, stdio: "ignore" }) : process.platform === "darwin" ? spawn("open", [directoryPath], { detached: true, stdio: "ignore" }) : spawn("xdg-open", [directoryPath], { detached: true, stdio: "ignore" });
|
|
3444
|
+
child.unref();
|
|
3445
|
+
return true;
|
|
3252
3446
|
} catch {
|
|
3253
|
-
|
|
3254
|
-
playerHtml = "<html><body><p>Player UI not available. Please build @kajidog/player-ui.</p></body></html>";
|
|
3447
|
+
return false;
|
|
3255
3448
|
}
|
|
3256
3449
|
}
|
|
3257
|
-
|
|
3258
|
-
|
|
3259
|
-
|
|
3260
|
-
|
|
3261
|
-
|
|
3262
|
-
|
|
3263
|
-
|
|
3264
|
-
|
|
3265
|
-
|
|
3266
|
-
|
|
3267
|
-
|
|
3268
|
-
|
|
3269
|
-
|
|
3270
|
-
|
|
3271
|
-
|
|
3272
|
-
|
|
3273
|
-
|
|
3274
|
-
|
|
3275
|
-
|
|
3276
|
-
|
|
3450
|
+
function showDirectoryPicker(defaultPath) {
|
|
3451
|
+
return new Promise((resolve2) => {
|
|
3452
|
+
if (process.platform === "win32") {
|
|
3453
|
+
const defaultPathB64 = defaultPath ? Buffer.from(defaultPath).toString("base64") : "";
|
|
3454
|
+
const psScript = `
|
|
3455
|
+
Add-Type -AssemblyName System.Windows.Forms
|
|
3456
|
+
$form = New-Object System.Windows.Forms.Form
|
|
3457
|
+
$form.TopMost = $true
|
|
3458
|
+
$form.ShowInTaskbar = $false
|
|
3459
|
+
$form.WindowState = 'Minimized'
|
|
3460
|
+
$dialog = New-Object System.Windows.Forms.FolderBrowserDialog
|
|
3461
|
+
$dialog.Description = "Select Export Folder"
|
|
3462
|
+
${defaultPathB64 ? `$dialog.SelectedPath = [System.Text.Encoding]::UTF8.GetString([System.Convert]::FromBase64String("${defaultPathB64}"))` : ""}
|
|
3463
|
+
$dialog.ShowNewFolderButton = $true
|
|
3464
|
+
if ($dialog.ShowDialog($form) -eq [System.Windows.Forms.DialogResult]::OK) {
|
|
3465
|
+
Write-Output $dialog.SelectedPath
|
|
3466
|
+
}
|
|
3467
|
+
`;
|
|
3468
|
+
const child = spawn("powershell", ["-NoProfile", "-Command", psScript], { stdio: ["ignore", "pipe", "ignore"] });
|
|
3469
|
+
let output = "";
|
|
3470
|
+
child.stdout.on("data", (data) => {
|
|
3471
|
+
output += data.toString();
|
|
3472
|
+
});
|
|
3473
|
+
child.on("close", () => {
|
|
3474
|
+
const path = output.trim();
|
|
3475
|
+
resolve2(path || null);
|
|
3476
|
+
});
|
|
3477
|
+
} else if (process.platform === "darwin") {
|
|
3478
|
+
const script = `on run argv
|
|
3479
|
+
try
|
|
3480
|
+
${defaultPath ? "set defaultArg to item 1 of argv" : ""}
|
|
3481
|
+
return POSIX path of (choose folder with prompt "Select Export Folder" ${defaultPath ? "default location POSIX file defaultArg" : ""})
|
|
3482
|
+
on error
|
|
3483
|
+
return ""
|
|
3484
|
+
end try
|
|
3485
|
+
end run`;
|
|
3486
|
+
const args = ["-e", script];
|
|
3487
|
+
if (defaultPath) args.push(defaultPath);
|
|
3488
|
+
const child = spawn("osascript", args, { stdio: ["ignore", "pipe", "ignore"] });
|
|
3489
|
+
let output = "";
|
|
3490
|
+
child.stdout.on("data", (data) => {
|
|
3491
|
+
output += data.toString();
|
|
3492
|
+
});
|
|
3493
|
+
child.on("close", () => {
|
|
3494
|
+
const path = output.trim();
|
|
3495
|
+
resolve2(path || null);
|
|
3496
|
+
});
|
|
3497
|
+
} else {
|
|
3498
|
+
resolve2(null);
|
|
3277
3499
|
}
|
|
3278
|
-
};
|
|
3279
|
-
|
|
3280
|
-
|
|
3281
|
-
|
|
3282
|
-
|
|
3283
|
-
|
|
3500
|
+
});
|
|
3501
|
+
}
|
|
3502
|
+
function isKatakana(input) {
|
|
3503
|
+
return /^[ァ-ヶー]+$/.test(input);
|
|
3504
|
+
}
|
|
3505
|
+
function estimateAccentType(pronunciation) {
|
|
3506
|
+
const smallKana = /* @__PURE__ */ new Set(["\u30E3", "\u30E5", "\u30E7", "\u30A1", "\u30A3", "\u30A5", "\u30A7", "\u30A9", "\u30EE"]);
|
|
3507
|
+
let moraCount = 0;
|
|
3508
|
+
for (const char of pronunciation) {
|
|
3509
|
+
if (char === "\u30FC") continue;
|
|
3510
|
+
if (smallKana.has(char)) continue;
|
|
3511
|
+
moraCount += 1;
|
|
3512
|
+
}
|
|
3513
|
+
return Math.max(1, moraCount);
|
|
3514
|
+
}
|
|
3515
|
+
function normalizeUserDictionaryWords(dictionary) {
|
|
3516
|
+
return Object.entries(dictionary).map(([wordUuid, word]) => ({
|
|
3517
|
+
wordUuid,
|
|
3518
|
+
surface: word.surface,
|
|
3519
|
+
pronunciation: word.pronunciation,
|
|
3520
|
+
accentType: word.accent_type,
|
|
3521
|
+
priority: word.priority
|
|
3522
|
+
}));
|
|
3523
|
+
}
|
|
3524
|
+
var moraSchema = z.object({
|
|
3525
|
+
text: z.string(),
|
|
3526
|
+
consonant: z.string().nullable().optional(),
|
|
3527
|
+
consonant_length: z.number().nullable().optional(),
|
|
3528
|
+
vowel: z.string(),
|
|
3529
|
+
vowel_length: z.number(),
|
|
3530
|
+
pitch: z.number()
|
|
3531
|
+
});
|
|
3532
|
+
var accentPhraseSchema = z.object({
|
|
3533
|
+
moras: z.array(moraSchema),
|
|
3534
|
+
accent: z.number().int(),
|
|
3535
|
+
pause_mora: moraSchema.nullable().optional(),
|
|
3536
|
+
is_interrogative: z.boolean().nullable().optional()
|
|
3537
|
+
});
|
|
3538
|
+
var audioQuerySchema = z.object({
|
|
3539
|
+
accent_phrases: z.array(accentPhraseSchema),
|
|
3540
|
+
speedScale: z.number(),
|
|
3541
|
+
pitchScale: z.number(),
|
|
3542
|
+
intonationScale: z.number(),
|
|
3543
|
+
volumeScale: z.number(),
|
|
3544
|
+
prePhonemeLength: z.number(),
|
|
3545
|
+
postPhonemeLength: z.number(),
|
|
3546
|
+
outputSamplingRate: z.number(),
|
|
3547
|
+
outputStereo: z.boolean(),
|
|
3548
|
+
kana: z.string().optional(),
|
|
3549
|
+
pauseLengthScale: z.number().optional()
|
|
3550
|
+
});
|
|
3551
|
+
function registerPlayerUITools(deps, shared) {
|
|
3552
|
+
const { server: server2, disabledTools, config: config2 } = deps;
|
|
3553
|
+
const {
|
|
3554
|
+
playerVoicevoxApi,
|
|
3555
|
+
playerResourceUri: playerResourceUri2,
|
|
3556
|
+
synthesizeWithCache,
|
|
3557
|
+
setSessionState: setSessionState2,
|
|
3558
|
+
getSessionState: getSessionState2,
|
|
3559
|
+
getSpeakerList
|
|
3560
|
+
} = shared;
|
|
3284
3561
|
const speakerIconCache = /* @__PURE__ */ new Map();
|
|
3285
|
-
registerAppResource(
|
|
3286
|
-
server2,
|
|
3287
|
-
"VOICEVOX Player",
|
|
3288
|
-
playerResourceUri,
|
|
3289
|
-
{
|
|
3290
|
-
description: "Audio player UI for VOICEVOX TTS",
|
|
3291
|
-
mimeType: RESOURCE_MIME_TYPE
|
|
3292
|
-
},
|
|
3293
|
-
async () => ({
|
|
3294
|
-
contents: [{ uri: playerResourceUri, mimeType: RESOURCE_MIME_TYPE, text: playerHtml }]
|
|
3295
|
-
})
|
|
3296
|
-
);
|
|
3297
|
-
registerAppToolIfEnabled(
|
|
3298
|
-
server2,
|
|
3299
|
-
disabledTools,
|
|
3300
|
-
"speak_player",
|
|
3301
|
-
{
|
|
3302
|
-
title: "Speak Player",
|
|
3303
|
-
description: 'Convert text to speech and display an audio player in the UI. Audio is played in the browser, not on the server. Does not use the playback queue. Supports multi-speaker dialogue: prefix each line with speaker ID like "1:Hello\\n2:World".',
|
|
3304
|
-
inputSchema: {
|
|
3305
|
-
text: z.string().describe(
|
|
3306
|
-
'Text to convert to speech. Supports multi-speaker dialogue format with speaker ID prefix per line: "1:Hello\\n2:World". Each line is synthesized with the specified speaker and played sequentially.'
|
|
3307
|
-
),
|
|
3308
|
-
speaker: z.number().optional().describe("Speaker ID (optional)"),
|
|
3309
|
-
speedScale: z.number().optional().describe("Playback speed (optional, default from environment)"),
|
|
3310
|
-
autoPlay: z.boolean().optional().describe("Auto-play audio when loaded (default: true)")
|
|
3311
|
-
},
|
|
3312
|
-
annotations: {
|
|
3313
|
-
readOnlyHint: true,
|
|
3314
|
-
destructiveHint: false,
|
|
3315
|
-
idempotentHint: false,
|
|
3316
|
-
openWorldHint: true
|
|
3317
|
-
},
|
|
3318
|
-
_meta: { ui: { resourceUri: playerResourceUri } }
|
|
3319
|
-
},
|
|
3320
|
-
async ({
|
|
3321
|
-
text,
|
|
3322
|
-
speaker,
|
|
3323
|
-
speedScale,
|
|
3324
|
-
autoPlay
|
|
3325
|
-
}, extra) => {
|
|
3326
|
-
try {
|
|
3327
|
-
const effectiveSpeaker = getEffectiveSpeaker(speaker, extra.sessionId) ?? config2.defaultSpeaker;
|
|
3328
|
-
const speed = speedScale ?? config2.defaultSpeedScale;
|
|
3329
|
-
const segments = parseStringInput(text);
|
|
3330
|
-
const firstSegment = segments[0];
|
|
3331
|
-
if (!firstSegment) {
|
|
3332
|
-
throw new Error("\u30C6\u30AD\u30B9\u30C8\u304C\u7A7A\u3067\u3059");
|
|
3333
|
-
}
|
|
3334
|
-
const speakerId = firstSegment.speaker ?? effectiveSpeaker;
|
|
3335
|
-
const speakerName = await getSpeakerName(speakerId);
|
|
3336
|
-
const fullText = segments.map((s) => s.text).join(" ");
|
|
3337
|
-
return {
|
|
3338
|
-
content: [
|
|
3339
|
-
{
|
|
3340
|
-
type: "text",
|
|
3341
|
-
text: `Voicevox Player started: ${speakerName} \u300C${fullText.slice(0, 50)}${fullText.length > 50 ? "..." : ""}\u300D`
|
|
3342
|
-
}
|
|
3343
|
-
],
|
|
3344
|
-
_meta: { viewUUID: randomUUID2() }
|
|
3345
|
-
};
|
|
3346
|
-
} catch (error) {
|
|
3347
|
-
return createErrorResponse(error);
|
|
3348
|
-
}
|
|
3349
|
-
}
|
|
3350
|
-
);
|
|
3351
3562
|
registerAppToolIfEnabled(
|
|
3352
3563
|
server2,
|
|
3353
3564
|
disabledTools,
|
|
@@ -3357,7 +3568,7 @@ function registerPlayerTools(deps) {
|
|
|
3357
3568
|
description: "Get speaker list for the player UI. This tool is only callable from the app UI.",
|
|
3358
3569
|
_meta: {
|
|
3359
3570
|
ui: {
|
|
3360
|
-
resourceUri:
|
|
3571
|
+
resourceUri: playerResourceUri2,
|
|
3361
3572
|
visibility: ["app"]
|
|
3362
3573
|
}
|
|
3363
3574
|
}
|
|
@@ -3383,7 +3594,7 @@ function registerPlayerTools(deps) {
|
|
|
3383
3594
|
},
|
|
3384
3595
|
_meta: {
|
|
3385
3596
|
ui: {
|
|
3386
|
-
resourceUri:
|
|
3597
|
+
resourceUri: playerResourceUri2,
|
|
3387
3598
|
visibility: ["app"]
|
|
3388
3599
|
}
|
|
3389
3600
|
}
|
|
@@ -3406,87 +3617,234 @@ function registerPlayerTools(deps) {
|
|
|
3406
3617
|
}
|
|
3407
3618
|
}
|
|
3408
3619
|
);
|
|
3620
|
+
registerAppToolIfEnabled(
|
|
3621
|
+
server2,
|
|
3622
|
+
disabledTools,
|
|
3623
|
+
"_save_player_state_for_player",
|
|
3624
|
+
{
|
|
3625
|
+
title: "Save Player State (Player)",
|
|
3626
|
+
description: "Persist current player segments to server state without synthesizing audio. Only callable from the app UI.",
|
|
3627
|
+
inputSchema: {
|
|
3628
|
+
viewUUID: z.string().optional().describe("Player instance ID to associate this state with"),
|
|
3629
|
+
segments: z.array(
|
|
3630
|
+
z.object({
|
|
3631
|
+
text: z.string(),
|
|
3632
|
+
speaker: z.number(),
|
|
3633
|
+
speedScale: z.number().optional(),
|
|
3634
|
+
intonationScale: z.number().optional(),
|
|
3635
|
+
volumeScale: z.number().optional(),
|
|
3636
|
+
prePhonemeLength: z.number().optional(),
|
|
3637
|
+
postPhonemeLength: z.number().optional(),
|
|
3638
|
+
pauseLengthScale: z.number().optional(),
|
|
3639
|
+
audioQuery: audioQuerySchema.optional(),
|
|
3640
|
+
accentPhrases: z.array(accentPhraseSchema).optional()
|
|
3641
|
+
})
|
|
3642
|
+
).describe("Full current player segment list to persist")
|
|
3643
|
+
},
|
|
3644
|
+
_meta: {
|
|
3645
|
+
ui: {
|
|
3646
|
+
resourceUri: playerResourceUri2,
|
|
3647
|
+
visibility: ["app"]
|
|
3648
|
+
}
|
|
3649
|
+
}
|
|
3650
|
+
},
|
|
3651
|
+
async ({
|
|
3652
|
+
viewUUID,
|
|
3653
|
+
segments
|
|
3654
|
+
}, extra) => {
|
|
3655
|
+
try {
|
|
3656
|
+
if (!segments || segments.length === 0) {
|
|
3657
|
+
throw new Error("segments is required");
|
|
3658
|
+
}
|
|
3659
|
+
const stateKey = viewUUID ?? extra?.sessionId ?? "global";
|
|
3660
|
+
const effectiveDefaultSpeaker = config2.defaultSpeaker;
|
|
3661
|
+
const effectiveSpeed = config2.defaultSpeedScale;
|
|
3662
|
+
const list = await getSpeakerList();
|
|
3663
|
+
const speakerNameMap = /* @__PURE__ */ new Map();
|
|
3664
|
+
for (const speakerId of [...new Set(segments.map((seg) => seg.speaker ?? effectiveDefaultSpeaker))]) {
|
|
3665
|
+
const found = list.find((entry) => entry.id === speakerId);
|
|
3666
|
+
speakerNameMap.set(speakerId, found ? `${found.characterName}\uFF08${found.name}\uFF09` : `Speaker ${speakerId}`);
|
|
3667
|
+
}
|
|
3668
|
+
setSessionState2(stateKey, {
|
|
3669
|
+
segments: segments.map((seg) => {
|
|
3670
|
+
const speakerId = seg.speaker ?? effectiveDefaultSpeaker;
|
|
3671
|
+
return {
|
|
3672
|
+
text: seg.text,
|
|
3673
|
+
speaker: speakerId,
|
|
3674
|
+
speakerName: speakerNameMap.get(speakerId) ?? `Speaker ${speakerId}`,
|
|
3675
|
+
kana: seg.audioQuery?.kana,
|
|
3676
|
+
speedScale: seg.speedScale ?? effectiveSpeed,
|
|
3677
|
+
intonationScale: seg.intonationScale,
|
|
3678
|
+
volumeScale: seg.volumeScale,
|
|
3679
|
+
prePhonemeLength: seg.prePhonemeLength,
|
|
3680
|
+
postPhonemeLength: seg.postPhonemeLength,
|
|
3681
|
+
pauseLengthScale: seg.pauseLengthScale,
|
|
3682
|
+
audioQuery: seg.audioQuery,
|
|
3683
|
+
accentPhrases: seg.audioQuery?.accent_phrases ?? seg.accentPhrases
|
|
3684
|
+
};
|
|
3685
|
+
}),
|
|
3686
|
+
updatedAt: Date.now()
|
|
3687
|
+
});
|
|
3688
|
+
return {
|
|
3689
|
+
content: [{ type: "text", text: JSON.stringify({ ok: true, viewUUID: stateKey, count: segments.length }) }]
|
|
3690
|
+
};
|
|
3691
|
+
} catch (error) {
|
|
3692
|
+
return createErrorResponse(error);
|
|
3693
|
+
}
|
|
3694
|
+
}
|
|
3695
|
+
);
|
|
3409
3696
|
registerAppToolIfEnabled(
|
|
3410
3697
|
server2,
|
|
3411
3698
|
disabledTools,
|
|
3412
3699
|
"_resynthesize_for_player",
|
|
3413
3700
|
{
|
|
3414
3701
|
title: "Resynthesize (Player)",
|
|
3415
|
-
description: "Re-synthesize audio with a different speaker. Only callable from the app UI.",
|
|
3702
|
+
description: "Re-synthesize audio with a different speaker or updated parameters. Only callable from the app UI.",
|
|
3416
3703
|
inputSchema: {
|
|
3704
|
+
viewUUID: z.string().optional().describe("Player instance ID to associate this synthesis with"),
|
|
3417
3705
|
text: z.string().describe("Text to re-synthesize"),
|
|
3418
3706
|
speaker: z.number().optional().describe("Speaker ID (uses server default if omitted)"),
|
|
3707
|
+
audioQuery: audioQuerySchema.optional().describe("Audio query to synthesize from (preferred over text parameters)"),
|
|
3419
3708
|
speedScale: z.number().optional().describe("Playback speed (uses server default if omitted)"),
|
|
3709
|
+
intonationScale: z.number().optional().describe("Intonation scale \u6291\u63DA (optional)"),
|
|
3710
|
+
volumeScale: z.number().optional().describe("Volume scale \u97F3\u91CF (optional)"),
|
|
3711
|
+
prePhonemeLength: z.number().optional().describe("Pre-phoneme silence length in seconds (optional)"),
|
|
3712
|
+
postPhonemeLength: z.number().optional().describe("Post-phoneme silence length in seconds (optional)"),
|
|
3713
|
+
pauseLengthScale: z.number().optional().describe("Pause length scale between phrases \u9593\u306E\u9577\u3055 (optional)"),
|
|
3714
|
+
accentPhrases: z.array(accentPhraseSchema).optional().describe("Accent phrases override"),
|
|
3420
3715
|
autoPlay: z.boolean().optional().describe("Auto-play audio when loaded (uses server config if omitted)"),
|
|
3716
|
+
segmentIndex: z.number().int().min(0).optional().describe("Segment index for single-segment state update"),
|
|
3717
|
+
persistState: z.boolean().optional().describe("Persist player state to server store (default: true)"),
|
|
3421
3718
|
segments: z.array(
|
|
3422
3719
|
z.object({
|
|
3423
3720
|
text: z.string(),
|
|
3424
|
-
speaker: z.number()
|
|
3721
|
+
speaker: z.number(),
|
|
3722
|
+
speedScale: z.number().optional(),
|
|
3723
|
+
intonationScale: z.number().optional(),
|
|
3724
|
+
volumeScale: z.number().optional(),
|
|
3725
|
+
prePhonemeLength: z.number().optional(),
|
|
3726
|
+
postPhonemeLength: z.number().optional(),
|
|
3727
|
+
pauseLengthScale: z.number().optional(),
|
|
3728
|
+
audioQuery: audioQuerySchema.optional(),
|
|
3729
|
+
accentPhrases: z.array(accentPhraseSchema).optional()
|
|
3425
3730
|
})
|
|
3426
|
-
).optional().describe("
|
|
3731
|
+
).optional().describe("All current player segments \u2014 pass the full list to update server state")
|
|
3427
3732
|
},
|
|
3428
3733
|
_meta: {
|
|
3429
3734
|
ui: {
|
|
3430
|
-
resourceUri:
|
|
3735
|
+
resourceUri: playerResourceUri2,
|
|
3431
3736
|
visibility: ["app"]
|
|
3432
3737
|
}
|
|
3433
3738
|
}
|
|
3434
3739
|
},
|
|
3435
3740
|
async ({
|
|
3741
|
+
viewUUID,
|
|
3436
3742
|
text,
|
|
3437
3743
|
speaker,
|
|
3744
|
+
audioQuery,
|
|
3438
3745
|
speedScale,
|
|
3746
|
+
intonationScale,
|
|
3747
|
+
volumeScale,
|
|
3748
|
+
prePhonemeLength,
|
|
3749
|
+
postPhonemeLength,
|
|
3750
|
+
pauseLengthScale,
|
|
3751
|
+
accentPhrases,
|
|
3439
3752
|
autoPlay,
|
|
3753
|
+
segmentIndex,
|
|
3754
|
+
persistState,
|
|
3440
3755
|
segments
|
|
3441
|
-
}) => {
|
|
3756
|
+
}, extra) => {
|
|
3442
3757
|
try {
|
|
3443
3758
|
const effectiveSpeed = speedScale ?? config2.defaultSpeedScale;
|
|
3444
3759
|
const effectiveAutoPlay = autoPlay ?? config2.autoPlay;
|
|
3760
|
+
const shouldPersistState = persistState !== false;
|
|
3445
3761
|
const effectiveDefaultSpeaker = speaker ?? config2.defaultSpeaker;
|
|
3446
|
-
|
|
3447
|
-
|
|
3448
|
-
|
|
3449
|
-
|
|
3450
|
-
|
|
3451
|
-
|
|
3452
|
-
|
|
3453
|
-
|
|
3454
|
-
|
|
3762
|
+
const stateKey = viewUUID ?? extra?.sessionId ?? "global";
|
|
3763
|
+
if (segments && segments.length > 0 && shouldPersistState) {
|
|
3764
|
+
const list = await getSpeakerList();
|
|
3765
|
+
const speakerNameMap = /* @__PURE__ */ new Map();
|
|
3766
|
+
for (const speakerId of [...new Set(segments.map((seg) => seg.speaker ?? effectiveDefaultSpeaker))]) {
|
|
3767
|
+
const found = list.find((entry) => entry.id === speakerId);
|
|
3768
|
+
speakerNameMap.set(speakerId, found ? `${found.characterName}\uFF08${found.name}\uFF09` : `Speaker ${speakerId}`);
|
|
3769
|
+
}
|
|
3770
|
+
setSessionState2(stateKey, {
|
|
3771
|
+
segments: segments.map((seg) => {
|
|
3772
|
+
const speakerId = seg.speaker ?? effectiveDefaultSpeaker;
|
|
3455
3773
|
return {
|
|
3456
|
-
audioBase64: base64Audio2,
|
|
3457
3774
|
text: seg.text,
|
|
3458
|
-
speaker:
|
|
3459
|
-
speakerName:
|
|
3775
|
+
speaker: speakerId,
|
|
3776
|
+
speakerName: speakerNameMap.get(speakerId) ?? `Speaker ${speakerId}`,
|
|
3777
|
+
kana: seg.audioQuery?.kana,
|
|
3778
|
+
speedScale: seg.speedScale ?? effectiveSpeed,
|
|
3779
|
+
intonationScale: seg.intonationScale,
|
|
3780
|
+
volumeScale: seg.volumeScale,
|
|
3781
|
+
prePhonemeLength: seg.prePhonemeLength,
|
|
3782
|
+
postPhonemeLength: seg.postPhonemeLength,
|
|
3783
|
+
pauseLengthScale: seg.pauseLengthScale,
|
|
3784
|
+
audioQuery: seg.audioQuery,
|
|
3785
|
+
accentPhrases: seg.audioQuery?.accent_phrases ?? seg.accentPhrases
|
|
3460
3786
|
};
|
|
3461
|
-
})
|
|
3462
|
-
|
|
3463
|
-
|
|
3464
|
-
|
|
3465
|
-
|
|
3466
|
-
|
|
3467
|
-
|
|
3468
|
-
|
|
3469
|
-
|
|
3470
|
-
|
|
3471
|
-
|
|
3472
|
-
|
|
3473
|
-
|
|
3787
|
+
}),
|
|
3788
|
+
updatedAt: Date.now()
|
|
3789
|
+
});
|
|
3790
|
+
}
|
|
3791
|
+
const result = await synthesizeWithCache({
|
|
3792
|
+
text,
|
|
3793
|
+
speaker: effectiveDefaultSpeaker,
|
|
3794
|
+
audioQuery,
|
|
3795
|
+
speedScale: effectiveSpeed,
|
|
3796
|
+
intonationScale,
|
|
3797
|
+
volumeScale,
|
|
3798
|
+
prePhonemeLength,
|
|
3799
|
+
postPhonemeLength,
|
|
3800
|
+
pauseLengthScale,
|
|
3801
|
+
accentPhrases
|
|
3802
|
+
});
|
|
3803
|
+
if (shouldPersistState && segmentIndex !== void 0) {
|
|
3804
|
+
const prev = getSessionState2(stateKey);
|
|
3805
|
+
if (prev?.segments[segmentIndex]) {
|
|
3806
|
+
const nextSegments = prev.segments.slice();
|
|
3807
|
+
nextSegments[segmentIndex] = {
|
|
3808
|
+
...nextSegments[segmentIndex],
|
|
3809
|
+
text: result.text,
|
|
3810
|
+
speaker: result.speaker,
|
|
3811
|
+
speakerName: result.speakerName,
|
|
3812
|
+
kana: result.kana,
|
|
3813
|
+
audioQuery: result.audioQuery,
|
|
3814
|
+
accentPhrases: result.accentPhrases,
|
|
3815
|
+
speedScale: result.speedScale,
|
|
3816
|
+
intonationScale: result.intonationScale,
|
|
3817
|
+
volumeScale: result.volumeScale,
|
|
3818
|
+
prePhonemeLength: result.prePhonemeLength,
|
|
3819
|
+
postPhonemeLength: result.postPhonemeLength,
|
|
3820
|
+
pauseLengthScale: result.pauseLengthScale
|
|
3821
|
+
};
|
|
3822
|
+
setSessionState2(stateKey, {
|
|
3823
|
+
segments: nextSegments,
|
|
3824
|
+
updatedAt: Date.now()
|
|
3825
|
+
});
|
|
3826
|
+
}
|
|
3474
3827
|
}
|
|
3475
|
-
const audioQuery = await playerVoicevoxApi.generateQuery(text, effectiveDefaultSpeaker);
|
|
3476
|
-
audioQuery.speedScale = effectiveSpeed;
|
|
3477
|
-
const audioData = await playerVoicevoxApi.synthesize(audioQuery, effectiveDefaultSpeaker);
|
|
3478
|
-
const base64Audio = Buffer.from(audioData).toString("base64");
|
|
3479
|
-
const speakerName = await getSpeakerName(effectiveDefaultSpeaker);
|
|
3480
3828
|
return {
|
|
3481
3829
|
content: [
|
|
3482
3830
|
{
|
|
3483
3831
|
type: "text",
|
|
3484
3832
|
text: JSON.stringify({
|
|
3485
|
-
audioBase64:
|
|
3486
|
-
text,
|
|
3487
|
-
speaker:
|
|
3488
|
-
speakerName,
|
|
3489
|
-
|
|
3833
|
+
audioBase64: result.audioBase64,
|
|
3834
|
+
text: result.text,
|
|
3835
|
+
speaker: result.speaker,
|
|
3836
|
+
speakerName: result.speakerName,
|
|
3837
|
+
kana: result.kana,
|
|
3838
|
+
audioQuery: result.audioQuery,
|
|
3839
|
+
accentPhrases: result.accentPhrases,
|
|
3840
|
+
speedScale: result.speedScale,
|
|
3841
|
+
intonationScale: result.intonationScale,
|
|
3842
|
+
volumeScale: result.volumeScale,
|
|
3843
|
+
prePhonemeLength: result.prePhonemeLength,
|
|
3844
|
+
postPhonemeLength: result.postPhonemeLength,
|
|
3845
|
+
pauseLengthScale: result.pauseLengthScale,
|
|
3846
|
+
autoPlay: effectiveAutoPlay,
|
|
3847
|
+
viewUUID
|
|
3490
3848
|
})
|
|
3491
3849
|
}
|
|
3492
3850
|
]
|
|
@@ -3496,53 +3854,1172 @@ function registerPlayerTools(deps) {
|
|
|
3496
3854
|
}
|
|
3497
3855
|
}
|
|
3498
3856
|
);
|
|
3499
|
-
|
|
3500
|
-
|
|
3501
|
-
// src/tools/speak.ts
|
|
3502
|
-
import * as z2 from "zod/v4";
|
|
3503
|
-
function buildSpeakInputSchema(restrictions) {
|
|
3504
|
-
const schema = {
|
|
3505
|
-
text: z2.string().describe(
|
|
3506
|
-
'Text split by line breaks (\\n). IMPORTANT: Each line = one speech unit (processed and played separately). Keep the FIRST LINE SHORT for quick playback start - audio begins as soon as the first line is synthesized. Example: "Hi!\\nThis is a longer explanation that follows." Optional speaker prefix per line: "1:Hello\\n2:World"'
|
|
3507
|
-
),
|
|
3508
|
-
query: z2.string().optional().describe("Voice synthesis query"),
|
|
3509
|
-
speaker: z2.number().optional().describe("Default speaker ID (optional)"),
|
|
3510
|
-
speedScale: z2.number().optional().describe("Playback speed (optional, default from environment)")
|
|
3511
|
-
};
|
|
3512
|
-
if (!restrictions.immediate) {
|
|
3513
|
-
schema.immediate = z2.boolean().optional().describe(
|
|
3514
|
-
"If true, stops current playback and plays new audio immediately. If false, waits for current playback to finish. Default depends on environment variable."
|
|
3515
|
-
);
|
|
3516
|
-
}
|
|
3517
|
-
if (!restrictions.waitForStart) {
|
|
3518
|
-
schema.waitForStart = z2.boolean().optional().describe("Wait for playback to start (optional, default: false)");
|
|
3519
|
-
}
|
|
3520
|
-
if (!restrictions.waitForEnd) {
|
|
3521
|
-
schema.waitForEnd = z2.boolean().optional().describe("Wait for playback to end (optional, default: false)");
|
|
3522
|
-
}
|
|
3523
|
-
return schema;
|
|
3524
|
-
}
|
|
3525
|
-
function registerSpeakTool(deps) {
|
|
3526
|
-
const { server: server2, voicevoxClient, config: config2, disabledTools, restrictions } = deps;
|
|
3527
|
-
registerToolIfEnabled(
|
|
3857
|
+
registerAppToolIfEnabled(
|
|
3528
3858
|
server2,
|
|
3529
3859
|
disabledTools,
|
|
3530
|
-
"
|
|
3860
|
+
"_get_user_dictionary_for_player",
|
|
3531
3861
|
{
|
|
3532
|
-
title: "
|
|
3533
|
-
description: "
|
|
3534
|
-
|
|
3535
|
-
|
|
3536
|
-
|
|
3537
|
-
|
|
3538
|
-
|
|
3539
|
-
openWorldHint: true
|
|
3862
|
+
title: "Get User Dictionary (Player)",
|
|
3863
|
+
description: "Get VOICEVOX user dictionary words for the dictionary manager UI.",
|
|
3864
|
+
_meta: {
|
|
3865
|
+
ui: {
|
|
3866
|
+
resourceUri: playerResourceUri2,
|
|
3867
|
+
visibility: ["app"]
|
|
3868
|
+
}
|
|
3540
3869
|
}
|
|
3541
3870
|
},
|
|
3542
|
-
async ({
|
|
3543
|
-
|
|
3544
|
-
|
|
3545
|
-
|
|
3871
|
+
async () => {
|
|
3872
|
+
try {
|
|
3873
|
+
const dictionary = await playerVoicevoxApi.getUserDictionary();
|
|
3874
|
+
return {
|
|
3875
|
+
content: [{ type: "text", text: JSON.stringify({ words: normalizeUserDictionaryWords(dictionary) }) }]
|
|
3876
|
+
};
|
|
3877
|
+
} catch (error) {
|
|
3878
|
+
return createErrorResponse(error);
|
|
3879
|
+
}
|
|
3880
|
+
}
|
|
3881
|
+
);
|
|
3882
|
+
registerAppToolIfEnabled(
|
|
3883
|
+
server2,
|
|
3884
|
+
disabledTools,
|
|
3885
|
+
"_add_user_dictionary_word_for_player",
|
|
3886
|
+
{
|
|
3887
|
+
title: "Add User Dictionary Word (Player)",
|
|
3888
|
+
description: "Add a word to VOICEVOX user dictionary.",
|
|
3889
|
+
inputSchema: {
|
|
3890
|
+
surface: z.string().describe("Word surface form"),
|
|
3891
|
+
pronunciation: z.string().describe("Katakana reading"),
|
|
3892
|
+
priority: z.number().int().min(0).max(10).optional().describe("Priority 0-10")
|
|
3893
|
+
},
|
|
3894
|
+
_meta: {
|
|
3895
|
+
ui: {
|
|
3896
|
+
resourceUri: playerResourceUri2,
|
|
3897
|
+
visibility: ["app"]
|
|
3898
|
+
}
|
|
3899
|
+
}
|
|
3900
|
+
},
|
|
3901
|
+
async ({
|
|
3902
|
+
surface,
|
|
3903
|
+
pronunciation,
|
|
3904
|
+
priority
|
|
3905
|
+
}) => {
|
|
3906
|
+
try {
|
|
3907
|
+
const normalizedSurface = surface.trim();
|
|
3908
|
+
const normalizedPronunciation = pronunciation.trim();
|
|
3909
|
+
if (!normalizedSurface) throw new Error("surface is required");
|
|
3910
|
+
if (!normalizedPronunciation) throw new Error("pronunciation is required");
|
|
3911
|
+
if (!isKatakana(normalizedPronunciation)) throw new Error("pronunciation must be Katakana");
|
|
3912
|
+
await playerVoicevoxApi.addUserDictionaryWord({
|
|
3913
|
+
surface: normalizedSurface,
|
|
3914
|
+
pronunciation: normalizedPronunciation,
|
|
3915
|
+
accentType: estimateAccentType(normalizedPronunciation),
|
|
3916
|
+
priority: priority ?? 5
|
|
3917
|
+
});
|
|
3918
|
+
const dictionary = await playerVoicevoxApi.getUserDictionary();
|
|
3919
|
+
return {
|
|
3920
|
+
content: [{ type: "text", text: JSON.stringify({ words: normalizeUserDictionaryWords(dictionary) }) }]
|
|
3921
|
+
};
|
|
3922
|
+
} catch (error) {
|
|
3923
|
+
return createErrorResponse(error);
|
|
3924
|
+
}
|
|
3925
|
+
}
|
|
3926
|
+
);
|
|
3927
|
+
registerAppToolIfEnabled(
|
|
3928
|
+
server2,
|
|
3929
|
+
disabledTools,
|
|
3930
|
+
"_update_user_dictionary_word_for_player",
|
|
3931
|
+
{
|
|
3932
|
+
title: "Update User Dictionary Word (Player)",
|
|
3933
|
+
description: "Update a VOICEVOX user dictionary word.",
|
|
3934
|
+
inputSchema: {
|
|
3935
|
+
wordUuid: z.string().describe("Dictionary word UUID"),
|
|
3936
|
+
surface: z.string().describe("Word surface form"),
|
|
3937
|
+
pronunciation: z.string().describe("Katakana reading"),
|
|
3938
|
+
priority: z.number().int().min(0).max(10).optional().describe("Priority 0-10")
|
|
3939
|
+
},
|
|
3940
|
+
_meta: {
|
|
3941
|
+
ui: {
|
|
3942
|
+
resourceUri: playerResourceUri2,
|
|
3943
|
+
visibility: ["app"]
|
|
3944
|
+
}
|
|
3945
|
+
}
|
|
3946
|
+
},
|
|
3947
|
+
async ({
|
|
3948
|
+
wordUuid,
|
|
3949
|
+
surface,
|
|
3950
|
+
pronunciation,
|
|
3951
|
+
priority
|
|
3952
|
+
}) => {
|
|
3953
|
+
try {
|
|
3954
|
+
const normalizedSurface = surface.trim();
|
|
3955
|
+
const normalizedPronunciation = pronunciation.trim();
|
|
3956
|
+
if (!wordUuid.trim()) throw new Error("wordUuid is required");
|
|
3957
|
+
if (!normalizedSurface) throw new Error("surface is required");
|
|
3958
|
+
if (!normalizedPronunciation) throw new Error("pronunciation is required");
|
|
3959
|
+
if (!isKatakana(normalizedPronunciation)) throw new Error("pronunciation must be Katakana");
|
|
3960
|
+
await playerVoicevoxApi.updateUserDictionaryWord({
|
|
3961
|
+
wordUuid: wordUuid.trim(),
|
|
3962
|
+
surface: normalizedSurface,
|
|
3963
|
+
pronunciation: normalizedPronunciation,
|
|
3964
|
+
accentType: estimateAccentType(normalizedPronunciation),
|
|
3965
|
+
priority: priority ?? 5
|
|
3966
|
+
});
|
|
3967
|
+
const dictionary = await playerVoicevoxApi.getUserDictionary();
|
|
3968
|
+
return {
|
|
3969
|
+
content: [{ type: "text", text: JSON.stringify({ words: normalizeUserDictionaryWords(dictionary) }) }]
|
|
3970
|
+
};
|
|
3971
|
+
} catch (error) {
|
|
3972
|
+
return createErrorResponse(error);
|
|
3973
|
+
}
|
|
3974
|
+
}
|
|
3975
|
+
);
|
|
3976
|
+
registerAppToolIfEnabled(
|
|
3977
|
+
server2,
|
|
3978
|
+
disabledTools,
|
|
3979
|
+
"_delete_user_dictionary_word_for_player",
|
|
3980
|
+
{
|
|
3981
|
+
title: "Delete User Dictionary Word (Player)",
|
|
3982
|
+
description: "Delete a VOICEVOX user dictionary word.",
|
|
3983
|
+
inputSchema: {
|
|
3984
|
+
wordUuid: z.string().describe("Dictionary word UUID")
|
|
3985
|
+
},
|
|
3986
|
+
_meta: {
|
|
3987
|
+
ui: {
|
|
3988
|
+
resourceUri: playerResourceUri2,
|
|
3989
|
+
visibility: ["app"]
|
|
3990
|
+
}
|
|
3991
|
+
}
|
|
3992
|
+
},
|
|
3993
|
+
async ({ wordUuid }) => {
|
|
3994
|
+
try {
|
|
3995
|
+
const normalizedWordUuid = wordUuid.trim();
|
|
3996
|
+
if (!normalizedWordUuid) throw new Error("wordUuid is required");
|
|
3997
|
+
await playerVoicevoxApi.deleteUserDictionaryWord(normalizedWordUuid);
|
|
3998
|
+
const dictionary = await playerVoicevoxApi.getUserDictionary();
|
|
3999
|
+
return {
|
|
4000
|
+
content: [{ type: "text", text: JSON.stringify({ words: normalizeUserDictionaryWords(dictionary) }) }]
|
|
4001
|
+
};
|
|
4002
|
+
} catch (error) {
|
|
4003
|
+
return createErrorResponse(error);
|
|
4004
|
+
}
|
|
4005
|
+
}
|
|
4006
|
+
);
|
|
4007
|
+
registerAppToolIfEnabled(
|
|
4008
|
+
server2,
|
|
4009
|
+
disabledTools,
|
|
4010
|
+
"_preview_dictionary_word_for_player",
|
|
4011
|
+
{
|
|
4012
|
+
title: "Preview Dictionary Word (Player)",
|
|
4013
|
+
description: "Preview pronunciation with a random speaker.",
|
|
4014
|
+
inputSchema: {
|
|
4015
|
+
text: z.string().describe("Text to preview")
|
|
4016
|
+
},
|
|
4017
|
+
_meta: {
|
|
4018
|
+
ui: {
|
|
4019
|
+
resourceUri: playerResourceUri2,
|
|
4020
|
+
visibility: ["app"]
|
|
4021
|
+
}
|
|
4022
|
+
}
|
|
4023
|
+
},
|
|
4024
|
+
async ({ text }) => {
|
|
4025
|
+
try {
|
|
4026
|
+
const normalizedText = text.trim();
|
|
4027
|
+
if (!normalizedText) throw new Error("text is required");
|
|
4028
|
+
const speakers = await getSpeakerList();
|
|
4029
|
+
if (speakers.length === 0) throw new Error("No speakers available");
|
|
4030
|
+
const randomSpeaker = speakers[Math.floor(Math.random() * speakers.length)];
|
|
4031
|
+
const result = await synthesizeWithCache({
|
|
4032
|
+
text: normalizedText,
|
|
4033
|
+
speaker: randomSpeaker.id,
|
|
4034
|
+
speedScale: config2.defaultSpeedScale
|
|
4035
|
+
});
|
|
4036
|
+
return {
|
|
4037
|
+
content: [
|
|
4038
|
+
{
|
|
4039
|
+
type: "text",
|
|
4040
|
+
text: JSON.stringify({
|
|
4041
|
+
audioBase64: result.audioBase64,
|
|
4042
|
+
speaker: result.speaker,
|
|
4043
|
+
speakerName: result.speakerName,
|
|
4044
|
+
kana: result.kana
|
|
4045
|
+
})
|
|
4046
|
+
}
|
|
4047
|
+
]
|
|
4048
|
+
};
|
|
4049
|
+
} catch (error) {
|
|
4050
|
+
return createErrorResponse(error);
|
|
4051
|
+
}
|
|
4052
|
+
}
|
|
4053
|
+
);
|
|
4054
|
+
registerAppToolIfEnabled(
|
|
4055
|
+
server2,
|
|
4056
|
+
disabledTools,
|
|
4057
|
+
"_get_export_capability_for_player",
|
|
4058
|
+
{
|
|
4059
|
+
title: "Get Export Capability (Player)",
|
|
4060
|
+
description: "Return whether track export + folder open is available for player UI.",
|
|
4061
|
+
_meta: {
|
|
4062
|
+
ui: {
|
|
4063
|
+
resourceUri: playerResourceUri2,
|
|
4064
|
+
visibility: ["app"]
|
|
4065
|
+
}
|
|
4066
|
+
}
|
|
4067
|
+
},
|
|
4068
|
+
async () => {
|
|
4069
|
+
const canExport = config2.playerExportEnabled;
|
|
4070
|
+
const canChooseDirectory = canExport && canChooseDirectoryDialog();
|
|
4071
|
+
const canOpenDirectory = canExport && canOpenExplorer();
|
|
4072
|
+
return {
|
|
4073
|
+
content: [
|
|
4074
|
+
{
|
|
4075
|
+
type: "text",
|
|
4076
|
+
text: JSON.stringify({
|
|
4077
|
+
available: canExport,
|
|
4078
|
+
canChooseDirectory,
|
|
4079
|
+
canOpenDirectory,
|
|
4080
|
+
defaultOutputDir: config2.playerExportDir
|
|
4081
|
+
})
|
|
4082
|
+
}
|
|
4083
|
+
]
|
|
4084
|
+
};
|
|
4085
|
+
}
|
|
4086
|
+
);
|
|
4087
|
+
registerAppToolIfEnabled(
|
|
4088
|
+
server2,
|
|
4089
|
+
disabledTools,
|
|
4090
|
+
"_select_directory_for_player",
|
|
4091
|
+
{
|
|
4092
|
+
title: "Select Export Directory (Player)",
|
|
4093
|
+
description: "Open a native OS directory picker dialog, to be called from the player UI.",
|
|
4094
|
+
inputSchema: {
|
|
4095
|
+
defaultPath: z.string().optional().describe("Default directory path to show")
|
|
4096
|
+
},
|
|
4097
|
+
_meta: {
|
|
4098
|
+
ui: {
|
|
4099
|
+
resourceUri: playerResourceUri2,
|
|
4100
|
+
visibility: ["app"]
|
|
4101
|
+
}
|
|
4102
|
+
}
|
|
4103
|
+
},
|
|
4104
|
+
async ({ defaultPath }) => {
|
|
4105
|
+
try {
|
|
4106
|
+
const selected = await showDirectoryPicker(defaultPath || config2.playerExportDir);
|
|
4107
|
+
return {
|
|
4108
|
+
content: [
|
|
4109
|
+
{
|
|
4110
|
+
type: "text",
|
|
4111
|
+
text: JSON.stringify({ path: selected })
|
|
4112
|
+
}
|
|
4113
|
+
]
|
|
4114
|
+
};
|
|
4115
|
+
} catch (error) {
|
|
4116
|
+
return createErrorResponse(error);
|
|
4117
|
+
}
|
|
4118
|
+
}
|
|
4119
|
+
);
|
|
4120
|
+
// App-only tool: writes each player track to a WAV file inside a fresh
// timestamped folder and then tries to reveal that folder in the OS file
// explorer. The JSON result lists the written files and whether the folder
// was actually opened; `warning` is set when it was not.
registerAppToolIfEnabled(
  server2,
  disabledTools,
  "_export_tracks_for_player",
  {
    title: "Export Tracks (Player)",
    description: "Save player tracks as wav files and open the target folder in file explorer.",
    inputSchema: {
      outputDir: z.string().optional().describe("Output directory path (optional)"),
      segments: z.array(
        z.object({
          audioBase64: z.string().describe("WAV data in base64"),
          text: z.string().describe("Segment text"),
          speaker: z.number().describe("Speaker ID"),
          speakerName: z.string().describe("Speaker display name")
        })
      ).describe("Tracks to export")
    },
    _meta: {
      ui: {
        resourceUri: playerResourceUri2,
        visibility: ["app"]
      }
    }
  },
  async ({
    outputDir,
    segments
  }) => {
    try {
      if (!config2.playerExportEnabled) {
        throw new Error("Track export is disabled by VOICEVOX_PLAYER_EXPORT_ENABLED=false");
      }
      if (!segments || segments.length === 0) {
        throw new Error("No tracks to export");
      }
      // A blank/whitespace-only outputDir falls back to the configured default.
      const rawTarget = outputDir?.trim() || config2.playerExportDir;
      const targetDir = resolve(rawTarget);
      // ':' and '.' are replaced so the ISO timestamp is usable as a folder name.
      const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
      const sessionDir = join2(targetDir, `voicevox-${timestamp}`);
      await mkdir(sessionDir, { recursive: true });
      const files = [];
      for (let i = 0; i < segments.length; i++) {
        const seg = segments[i];
        const indexPart = String(i + 1).padStart(2, "0");
        const speakerPart = sanitizeFilePart(seg.speakerName || `speaker-${seg.speaker}`, `speaker-${seg.speaker}`);
        const textPart = sanitizeFilePart(seg.text, `segment-${i + 1}`);
        const fileName = `${indexPart}-${speakerPart}-${textPart}.wav`;
        const filePath = join2(sessionDir, fileName);
        await writeFile(filePath, Buffer.from(seg.audioBase64, "base64"));
        files.push(filePath);
      }
      const openFailedWarning = `WAV\u30D5\u30A1\u30A4\u30EB\u306F\u4FDD\u5B58\u3055\u308C\u307E\u3057\u305F\u304C\u3001\u30D5\u30A9\u30EB\u30C0\u3092\u958B\u3051\u307E\u305B\u3093\u3067\u3057\u305F: ${sessionDir}`;
      let warning;
      let openedDirectory = false;
      if (canOpenExplorer()) {
        if (process.platform === "win32") {
          // spawn() reports launch failures asynchronously through the child's
          // 'error' event, not by throwing. Without a listener that event is
          // unhandled and would take the server down, so wait for either
          // 'spawn' (success) or 'error' (failure) before reporting the result.
          const launched = await new Promise((settle) => {
            try {
              const child = spawn("explorer.exe", [sessionDir], { detached: true, stdio: "ignore" });
              child.on("error", (e) => {
                console.error("Failed to open explorer:", e);
                settle(false);
              });
              child.once("spawn", () => settle(true));
              child.unref();
            } catch (e) {
              // Synchronous failures (e.g. invalid arguments) still land here.
              console.error("Failed to open explorer:", e);
              settle(false);
            }
          });
          if (launched) {
            openedDirectory = true;
          } else {
            warning = openFailedWarning;
          }
        } else if (openDirectoryInExplorer(sessionDir)) {
          openedDirectory = true;
        } else {
          warning = openFailedWarning;
        }
      } else {
        warning = `WAV\u30D5\u30A1\u30A4\u30EB\u306F\u4FDD\u5B58\u3055\u308C\u307E\u3057\u305F\u3002\u73FE\u5728\u306E\u74B0\u5883\u3067\u306F\u30D5\u30A9\u30EB\u30C0\u81EA\u52D5\u30AA\u30FC\u30D7\u30F3\u306B\u5BFE\u5FDC\u3057\u3066\u3044\u307E\u305B\u3093: ${sessionDir}`;
      }
      return {
        content: [
          {
            type: "text",
            text: JSON.stringify({
              ok: true,
              outputDir: sessionDir,
              count: files.length,
              files,
              openedDirectory,
              warning
            })
          }
        ]
      };
    } catch (error) {
      return createErrorResponse(error);
    }
  }
);
|
|
4212
|
+
}
|
|
4213
|
+
|
|
4214
|
+
// src/tools/player.ts
|
|
4215
|
+
// Directory of this module: prefer import.meta.dirname when the runtime
// provides it as a string, otherwise derive it from import.meta.url.
var __dirname = typeof import.meta.dirname === "string" ? import.meta.dirname : dirname2(fileURLToPath(import.meta.url));
// HTML for the player UI. Candidate locations are tried in order: next to this
// module first, then the @kajidog/player-ui package build output.
var playerHtml;
for (const relativeSegments of [
  ["mcp-app.html"],
  ["..", "..", "node_modules", "@kajidog", "player-ui", "dist", "mcp-app.html"]
]) {
  try {
    playerHtml = readFileSync(join3(__dirname, ...relativeSegments), "utf-8");
    break;
  } catch {
    // Unreadable at this location; fall through to the next candidate.
  }
}
if (playerHtml === void 0) {
  // Neither location was readable: warn once and serve a placeholder page.
  console.error("Warning: player-ui HTML not found. Please build @kajidog/player-ui first.");
  playerHtml = "<html><body><p>Player UI not available. Please build @kajidog/player-ui.</p></body></html>";
}
|
|
4229
|
+
// ui:// resource identifier for the player page.
var playerResourceUri = "ui://speak-player/player.html";
// Flattened speaker/style list; stays null until the first successful fetch.
var speakerCache = null;
// Guards initializePlayerStorage so it only runs once per process.
var playerStorageInitialized = false;

// --- Audio cache: location and in-memory layer -----------------------------
// Default cache directory; initializePlayerStorage may replace it from config.
var audioCacheDir = join3(process.cwd(), ".voicevox-player-cache");
// cache key -> base64 audio
var audioCacheMem = /* @__PURE__ */ new Map();
// On-disk cache entries are named "<64 lowercase hex chars>.txt".
var AUDIO_CACHE_FILE_PATTERN = /^[a-f0-9]{64}\.txt$/;

// --- Audio cache: policy ---------------------------------------------------
var DEFAULT_AUDIO_CACHE_TTL_DAYS = 30;
var DEFAULT_AUDIO_CACHE_MAX_MB = 512;
// Non-forced cleanup requests only run once this many writes have accumulated.
var AUDIO_CACHE_CLEANUP_EVERY_WRITES = 20;
var audioCacheEnabledFlag = true;
var audioCacheTtlDays = DEFAULT_AUDIO_CACHE_TTL_DAYS;
var audioCacheMaxMb = DEFAULT_AUDIO_CACHE_MAX_MB;
// Disk caching is off when the flag is false or either limit is exactly 0.
var isAudioDiskCacheEnabled = audioCacheEnabledFlag && audioCacheTtlDays !== 0 && audioCacheMaxMb !== 0;
// Negative limits map to null (presumably "no limit" — confirm against planAudioCacheCleanup).
var audioCacheTtlMs = audioCacheTtlDays < 0 ? null : audioCacheTtlDays * 24 * 60 * 60 * 1e3;
var audioCacheMaxBytes = audioCacheMaxMb < 0 ? null : audioCacheMaxMb * 1024 * 1024;

// --- Audio cache: cleanup scheduling state ---------------------------------
var isAudioCacheCleanupRunning = false;
var pendingAudioCacheCleanup = false;
var writesSinceLastAudioCleanup = 0;
|
|
4247
|
+
/**
 * Removes on-disk audio cache files selected by planAudioCacheCleanup and
 * drops the matching in-memory entries.
 *
 * Only regular files in audioCacheDir whose names match
 * AUDIO_CACHE_FILE_PATTERN are considered. Failures to stat or unlink an
 * individual file are ignored; any other failure is logged as a warning.
 * Does nothing when the disk cache is disabled.
 *
 * @returns {Promise<void>}
 */
async function cleanupAudioCacheFiles() {
  if (!isAudioDiskCacheEnabled) {
    return;
  }
  try {
    const dirents = await readdir(audioCacheDir, { withFileTypes: true });
    const scanTime = Date.now();
    const candidates = [];
    for (const dirent of dirents) {
      const looksLikeCacheFile = dirent.isFile() && AUDIO_CACHE_FILE_PATTERN.test(dirent.name);
      if (!looksLikeCacheFile) {
        continue;
      }
      const fullPath = join3(audioCacheDir, dirent.name);
      let info;
      try {
        info = await stat(fullPath);
      } catch {
        // The file could not be inspected; leave it out of the plan.
        continue;
      }
      candidates.push({
        name: dirent.name,
        path: fullPath,
        size: info.size,
        mtimeMs: info.mtimeMs
      });
    }
    const removals = planAudioCacheCleanup({
      entries: candidates,
      now: scanTime,
      ttlMs: audioCacheTtlMs,
      maxBytes: audioCacheMaxBytes
    });
    if (removals.size === 0) {
      return;
    }
    const cacheSuffix = ".txt";
    for (const doomedPath of removals) {
      try {
        await unlink(doomedPath);
      } catch {
        // Best effort: the in-memory entry is evicted below either way.
      }
      const leafName = basename(doomedPath);
      if (leafName.endsWith(cacheSuffix)) {
        // The file name minus ".txt" is the cache key.
        audioCacheMem.delete(leafName.slice(0, -cacheSuffix.length));
      }
    }
  } catch (error) {
    console.warn("Warning: failed to cleanup VOICEVOX player audio cache:", error);
  }
}
|
|
4285
|
+
/**
 * Requests a background cleanup of the on-disk audio cache.
 *
 * Non-forced requests only count a write and return until
 * AUDIO_CACHE_CLEANUP_EVERY_WRITES writes have accumulated. At most one
 * cleanup runs at a time; a request that arrives while one is running is
 * remembered and replayed (forced) once the running one settles.
 *
 * @param {boolean} [force=false] Skip the write counter and run as soon as possible.
 * @returns {void}
 */
function scheduleAudioCacheCleanup(force = false) {
  if (!isAudioDiskCacheEnabled) {
    return;
  }
  if (!force) {
    writesSinceLastAudioCleanup += 1;
    if (writesSinceLastAudioCleanup < AUDIO_CACHE_CLEANUP_EVERY_WRITES) {
      return;
    }
  }
  writesSinceLastAudioCleanup = 0;

  if (isAudioCacheCleanupRunning) {
    // Coalesce: remember that another pass is wanted after the current one.
    pendingAudioCacheCleanup = true;
    return;
  }

  const reportFailure = (error) => console.warn("Warning: failed to cleanup VOICEVOX player audio cache:", error);
  const onSettled = () => {
    isAudioCacheCleanupRunning = false;
    if (pendingAudioCacheCleanup) {
      pendingAudioCacheCleanup = false;
      scheduleAudioCacheCleanup(true);
    }
  };

  isAudioCacheCleanupRunning = true;
  // Fire-and-forget on purpose: callers must not wait for the cleanup.
  void cleanupAudioCacheFiles().catch(reportFailure).finally(onSettled);
}
|
|
4305
|
+
// Player state keyed by view UUID, session id or "global"
// (see getSessionState); values carry `segments` and `updatedAt`.
var playerSessionState = /* @__PURE__ */ new Map();

// NOTE(review): the three limits below are not used in the visible part of the
// file; presumably they bound tool responses and state paging — confirm at use sites.
var MAX_TOOL_CONTENT_BYTES = 1024 * 1024;
var DEFAULT_STATE_PAGE_LIMIT = 100;
var MAX_STATE_PAGE_LIMIT = 1e3;

// Persistence limits applied by saveSessionStateToDisk / loadSessionStateFromDisk:
// keep at most this many states, and drop states older than 30 days.
var MAX_PERSISTED_STATES = 500;
var MAX_STATE_AGE_MS = 30 * 24 * 60 * 60 * 1e3;

// Default state file location; initializePlayerStorage may replace it from config.
var stateFilePath = join3(audioCacheDir, "player-state.json");
|
|
4312
|
+
/**
 * Builds the SHA-256 hex cache key for a synthesis request.
 *
 * When a complete `audioQuery` is supplied, the key covers speaker, text and
 * that query. Otherwise it covers speaker, text, the individual tuning
 * parameters (rounded to 4 decimal places) and the accent phrases. The field
 * order inside the hashed JSON is part of the key format and must not change,
 * or existing cache files stop matching.
 *
 * @param {object} input
 * @param {number} input.speaker Speaker/style id.
 * @param {string} input.text Text to synthesize.
 * @param {object} [input.audioQuery] Full audio query; takes precedence when set.
 * @param {number} [input.speedScale]
 * @param {number} [input.intonationScale]
 * @param {number} [input.volumeScale]
 * @param {number} [input.prePhonemeLength]
 * @param {number} [input.postPhonemeLength]
 * @param {number} [input.pauseLengthScale]
 * @param {Array} [input.accentPhrases]
 * @returns {string} 64-character lowercase hex digest.
 */
function createAudioCacheKey(input) {
  // Missing values hash as null; present values are rounded so floating-point
  // noise beyond 4 decimals does not create distinct keys. speedScale goes
  // through the same guard as the other fields instead of throwing when absent.
  const roundOrNull = (value) => value === void 0 || value === null ? null : Number(value.toFixed(4));
  const keyInput = input.audioQuery ? JSON.stringify({
    speaker: input.speaker,
    text: input.text,
    audioQuery: input.audioQuery
  }) : JSON.stringify({
    speaker: input.speaker,
    text: input.text,
    speedScale: roundOrNull(input.speedScale),
    intonationScale: roundOrNull(input.intonationScale),
    volumeScale: roundOrNull(input.volumeScale),
    prePhonemeLength: roundOrNull(input.prePhonemeLength),
    postPhonemeLength: roundOrNull(input.postPhonemeLength),
    pauseLengthScale: roundOrNull(input.pauseLengthScale),
    accentPhrases: input.accentPhrases ?? null
  });
  return createHash("sha256").update(keyInput).digest("hex");
}
|
|
4330
|
+
/**
 * Looks up cached audio for a cache key: memory first, then the on-disk
 * "<key>.txt" file (when the disk cache is enabled). A non-empty disk hit is
 * copied into the in-memory cache before being returned.
 *
 * @param {string} cacheKey2 Key as produced by createAudioCacheKey.
 * @returns {string|null} Base64 audio, or null on a miss, an empty file or a read error.
 */
function readCachedAudioBase64(cacheKey2) {
  const memoryHit = audioCacheMem.get(cacheKey2);
  if (memoryHit) {
    return memoryHit;
  }
  if (!isAudioDiskCacheEnabled) {
    return null;
  }
  const diskPath = join3(audioCacheDir, `${cacheKey2}.txt`);
  let payload;
  try {
    payload = readFileSync(diskPath, "utf-8").trim();
  } catch {
    // A missing or unreadable file is an ordinary cache miss.
    return null;
  }
  if (payload.length === 0) {
    return null;
  }
  audioCacheMem.set(cacheKey2, payload);
  return payload;
}
|
|
4345
|
+
/**
 * Stores base64 audio under a cache key: always in memory, and additionally
 * as "<key>.txt" in audioCacheDir when the disk cache is enabled. A
 * successful disk write also counts toward the periodic cache cleanup.
 * Disk write failures are logged and otherwise ignored.
 *
 * @param {string} cacheKey2 Key as produced by createAudioCacheKey.
 * @param {string} base64 Base64-encoded audio.
 * @returns {Promise<void>}
 */
async function writeCachedAudioBase64(cacheKey2, base64) {
  audioCacheMem.set(cacheKey2, base64);
  if (isAudioDiskCacheEnabled) {
    const diskPath = join3(audioCacheDir, `${cacheKey2}.txt`);
    try {
      await writeFile2(diskPath, base64, "utf-8");
      scheduleAudioCacheCleanup();
    } catch (error) {
      console.warn("Warning: failed to write VOICEVOX player cache:", error);
    }
  }
}
|
|
4356
|
+
/**
 * Prunes the in-memory player state and writes it to the state file.
 *
 * States older than MAX_STATE_AGE_MS are dropped and only the
 * MAX_PERSISTED_STATES most recently updated ones are kept; the in-memory map
 * is rebuilt from that pruned list. The JSON payload is written to
 * "<stateFilePath>.tmp" and then renamed over the real file. Any failure is
 * logged as a warning.
 *
 * @returns {Promise<void>}
 */
async function saveSessionStateToDisk() {
  try {
    const savedAt = Date.now();
    const isFresh = (pair) => savedAt - pair[1].updatedAt <= MAX_STATE_AGE_MS;
    const newestFirst = (left, right) => right[1].updatedAt - left[1].updatedAt;

    const survivors = Array.from(playerSessionState.entries())
      .filter(isFresh)
      .sort(newestFirst)
      .slice(0, MAX_PERSISTED_STATES);

    // Rebuild the map so memory holds exactly what is about to be persisted.
    playerSessionState.clear();
    survivors.forEach(([stateKey, stateValue]) => {
      playerSessionState.set(stateKey, stateValue);
    });

    const serialized = JSON.stringify({
      version: 1,
      savedAt,
      entries: survivors
    });
    // Write to a sibling temp file first, then rename it into place.
    const scratchPath = `${stateFilePath}.tmp`;
    await writeFile2(scratchPath, serialized, "utf-8");
    await rename(scratchPath, stateFilePath);
  } catch (error) {
    console.warn("Warning: failed to persist player state:", error);
  }
}
|
|
4376
|
+
// Handle of the pending debounced save, or null when none is scheduled.
var saveDebounceTimer = null;
/**
 * Schedules a save of the player state 300 ms from now. Calling it again
 * before the timer fires cancels the pending timer and starts a new one.
 *
 * @returns {void}
 */
function scheduleStateSave() {
  const flushPendingSave = () => {
    saveDebounceTimer = null;
    saveSessionStateToDisk().catch((e) => console.warn("Warning: failed to persist player state:", e));
  };
  if (saveDebounceTimer !== null) {
    clearTimeout(saveDebounceTimer);
  }
  saveDebounceTimer = setTimeout(flushPendingSave, 300);
}
|
|
4384
|
+
/**
 * Restores player state from the state file into playerSessionState.
 *
 * The file is expected to hold `{ entries: [[key, state], ...] }`. Entries
 * that are malformed, or whose `updatedAt` is older than MAX_STATE_AGE_MS,
 * are skipped. A missing, unreadable or unparsable file is silently ignored.
 *
 * @returns {void}
 */
function loadSessionStateFromDisk() {
  // True when `candidate` is a well-formed, non-expired [key, state] pair.
  const isRestorable = (candidate, loadedAt) => {
    if (!Array.isArray(candidate) || candidate.length !== 2) {
      return false;
    }
    const [stateKey, stateValue] = candidate;
    if (!stateKey || typeof stateKey !== "string") {
      return false;
    }
    if (!stateValue || typeof stateValue.updatedAt !== "number" || !Array.isArray(stateValue.segments)) {
      return false;
    }
    return !(loadedAt - stateValue.updatedAt > MAX_STATE_AGE_MS);
  };

  try {
    const document = JSON.parse(readFileSync(stateFilePath, "utf-8"));
    if (!Array.isArray(document.entries)) {
      return;
    }
    const loadedAt = Date.now();
    for (const candidate of document.entries) {
      if (isRestorable(candidate, loadedAt)) {
        playerSessionState.set(candidate[0], candidate[1]);
      }
    }
  } catch {
    // No usable state file; leave the in-memory state as it is.
  }
}
|
|
4401
|
+
/**
 * Stores the player state for a key and schedules a debounced save to disk.
 *
 * @param {string} key State key (view UUID, session id or "global").
 * @param {object} state State object to store.
 * @returns {void}
 */
function setSessionState(key, state) {
  // Update memory first so the debounced save sees the new value.
  playerSessionState.set(key, state);
  scheduleStateSave();
}
|
|
4405
|
+
/**
 * Looks up stored player state. The view UUID is tried first (when given);
 * if nothing is stored for it, the session id is tried, falling back to the
 * "global" key when the session id is null or undefined.
 *
 * @param {string} [viewUUID] View identifier.
 * @param {string} [sessionId] Session identifier.
 * @returns {object|undefined} The stored state, or undefined when none is found.
 */
function getSessionState(viewUUID, sessionId) {
  const byView = viewUUID ? playerSessionState.get(viewUUID) : void 0;
  if (byView) {
    return byView;
  }
  const bySession = playerSessionState.get(sessionId ?? "global");
  return bySession ? bySession : void 0;
}
|
|
4415
|
+
/**
 * One-time setup of the player's storage: applies cache/state settings from
 * config, creates the needed directories, triggers an initial cache cleanup
 * and loads persisted player state. Later calls are no-ops.
 *
 * Directory creation failures are logged as warnings and do not abort setup.
 *
 * @param {object} config2 Server configuration; reads playerCacheDir,
 *   playerStateFile, playerAudioCacheEnabled, playerAudioCacheTtlDays and
 *   playerAudioCacheMaxMb.
 * @returns {void}
 */
function initializePlayerStorage(config2) {
  if (playerStorageInitialized) {
    return;
  }
  playerStorageInitialized = true;

  // Paths: configured values win; otherwise keep/derive the defaults.
  const configuredCacheDir = config2.playerCacheDir;
  if (configuredCacheDir) {
    audioCacheDir = configuredCacheDir;
  }
  const configuredStateFile = config2.playerStateFile;
  stateFilePath = configuredStateFile ? configuredStateFile : join3(audioCacheDir, "player-state.json");

  // Cache policy inputs: non-finite limits fall back to the defaults.
  audioCacheEnabledFlag = config2.playerAudioCacheEnabled !== false;
  const configuredTtlDays = config2.playerAudioCacheTtlDays;
  audioCacheTtlDays = Number.isFinite(configuredTtlDays) ? configuredTtlDays : DEFAULT_AUDIO_CACHE_TTL_DAYS;
  const configuredMaxMb = config2.playerAudioCacheMaxMb;
  audioCacheMaxMb = Number.isFinite(configuredMaxMb) ? configuredMaxMb : DEFAULT_AUDIO_CACHE_MAX_MB;

  const resolvedPolicy = resolveAudioCachePolicy({
    enabledFlag: audioCacheEnabledFlag,
    ttlDays: audioCacheTtlDays,
    maxMb: audioCacheMaxMb
  });
  isAudioDiskCacheEnabled = resolvedPolicy.isDiskCacheEnabled;
  audioCacheTtlMs = resolvedPolicy.ttlMs;
  audioCacheMaxBytes = resolvedPolicy.maxBytes;

  try {
    mkdirSync2(audioCacheDir, { recursive: true });
    if (isAudioDiskCacheEnabled) {
      // Forced: run a cleanup pass at startup regardless of the write counter.
      scheduleAudioCacheCleanup(true);
    }
  } catch (error) {
    console.warn("Warning: failed to create VOICEVOX player cache directory:", error);
  }

  // The state file may live outside the cache directory, so create its parent too.
  try {
    mkdirSync2(dirname2(stateFilePath), { recursive: true });
  } catch (error) {
    console.warn("Warning: failed to prepare player state directory:", error);
  }

  loadSessionStateFromDisk();
}
|
|
4446
|
+
/**
 * Registers the VOICEVOX player UI resource and the player-related MCP tools.
 *
 * Registered here:
 *   - resource `playerResourceUri` (serves `playerHtml`)
 *   - tool `open_dictionary_ui`   - opens the user dictionary manager
 *   - tool `speak_player`         - creates a new player session from text
 *   - tool `resynthesize_player`  - creates a new player session from edited segments
 *   - tool `get_player_state`     - returns stored segments, paged and size-capped
 * UI-facing tools are registered by `registerPlayerUITools`, which receives the
 * helpers built in this function.
 *
 * @param {object} deps - Uses `deps.server`, `deps.config` and `deps.disabledTools`;
 *   the whole object is forwarded to `registerPlayerUITools`.
 */
function registerPlayerTools(deps) {
  const { server: server2, config: config2, disabledTools } = deps;
  initializePlayerStorage(config2);
  const playerVoicevoxApi = new VoicevoxApi(config2.voicevoxUrl);

  // Returns the flattened style list, memoized in the module-level `speakerCache`.
  // A failed fetch yields an empty list and is NOT cached, so a later call retries.
  const getSpeakerList = async () => {
    if (speakerCache) return speakerCache;
    try {
      const speakers = await playerVoicevoxApi.getSpeakers();
      speakerCache = speakers.flatMap(
        (speaker) => speaker.styles.map((style) => ({
          id: style.id,
          name: style.name,
          characterName: speaker.name,
          uuid: speaker.speaker_uuid
        }))
      );
      return speakerCache;
    } catch {
      return [];
    }
  };

  // Human-readable label for a style ID, with a generic fallback for unknown IDs.
  const getSpeakerName = async (speakerId) => {
    const list = await getSpeakerList();
    const found = list?.find((s) => s.id === speakerId);
    return found ? `${found.characterName}\uFF08${found.name}\uFF09` : `Speaker ${speakerId}`;
  };

  // Resolves labels for the distinct speaker IDs and returns them as Map<id, label>.
  const resolveSpeakerNames = async (speakerIds) => {
    const uniqueSpeakerIds = [...new Set(speakerIds)];
    const entries = await Promise.all(uniqueSpeakerIds.map(async (id) => [id, await getSpeakerName(id)]));
    return new Map(entries);
  };

  // Fetches the user dictionary and converts it to the camelCase list the UI receives.
  const getUserDictionaryWords = async () => {
    const dictionary = await playerVoicevoxApi.getUserDictionary();
    return Object.entries(dictionary).map(([wordUuid, word]) => ({
      wordUuid,
      surface: word.surface,
      pronunciation: word.pronunciation,
      accentType: word.accent_type,
      priority: word.priority
    }));
  };

  /**
   * Synthesizes one segment, reusing cached audio when available.
   *
   * When `audioQuery` is supplied it is used as-is (optionally with mora data
   * recomputed); otherwise a query is generated from `text` and the explicitly
   * provided scalar parameters are applied on top of the engine defaults.
   *
   * @returns {Promise<object>} Audio (base64) plus the effective query and parameters.
   */
  const synthesizeWithCache = async ({
    text,
    speaker,
    audioQuery,
    speedScale,
    intonationScale,
    volumeScale,
    prePhonemeLength,
    postPhonemeLength,
    pauseLengthScale,
    accentPhrases
  }) => {
    const speakerName = await getSpeakerName(speaker);
    let effectiveAudioQuery = audioQuery;
    if (audioQuery && accentPhrases && accentPhrases.length > 0 && audioQuery.accent_phrases?.length > 0) {
      try {
        const updated = await playerVoicevoxApi.updateMoraData(audioQuery.accent_phrases, speaker);
        effectiveAudioQuery = { ...audioQuery, accent_phrases: updated };
      } catch (e) {
        // Best-effort: keep the caller's pitch values when recomputation fails.
        console.warn("[synthesizeWithCache] /mora_data \u518D\u8A08\u7B97\u5931\u6557\u3001\u5143\u306E\u30D4\u30C3\u30C1\u5024\u3092\u4F7F\u7528:", e);
      }
    }

    // Generates a query from text and applies only the parameters the caller
    // actually provided. `speedScale` is guarded like the others: assigning
    // `undefined` would erase the engine-provided default from the query.
    const generateQueryWithOverrides = async () => {
      const generated = await playerVoicevoxApi.generateQuery(text, speaker);
      if (accentPhrases) generated.accent_phrases = accentPhrases;
      if (speedScale !== void 0) generated.speedScale = speedScale;
      if (intonationScale !== void 0) generated.intonationScale = intonationScale;
      if (volumeScale !== void 0) generated.volumeScale = volumeScale;
      if (prePhonemeLength !== void 0) generated.prePhonemeLength = prePhonemeLength;
      if (postPhonemeLength !== void 0) generated.postPhonemeLength = postPhonemeLength;
      if (pauseLengthScale !== void 0) generated.pauseLengthScale = pauseLengthScale;
      return generated;
    };

    const cacheKey2 = createAudioCacheKey({
      text,
      speaker,
      audioQuery: effectiveAudioQuery,
      speedScale,
      intonationScale,
      volumeScale,
      prePhonemeLength,
      postPhonemeLength,
      pauseLengthScale,
      accentPhrases
    });
    const cachedBase64 = readCachedAudioBase64(cacheKey2);
    if (cachedBase64) {
      // Audio comes from the cache, but the result still carries a query, so
      // one is generated when the caller did not supply it.
      const cachedQuery = effectiveAudioQuery ? effectiveAudioQuery : await generateQueryWithOverrides();
      return {
        audioBase64: cachedBase64,
        text,
        speaker,
        speakerName,
        kana: cachedQuery?.kana,
        audioQuery: cachedQuery,
        speedScale: cachedQuery?.speedScale ?? speedScale,
        intonationScale: cachedQuery?.intonationScale ?? intonationScale,
        volumeScale: cachedQuery?.volumeScale ?? volumeScale,
        prePhonemeLength: cachedQuery?.prePhonemeLength ?? prePhonemeLength,
        postPhonemeLength: cachedQuery?.postPhonemeLength ?? postPhonemeLength,
        pauseLengthScale: cachedQuery?.pauseLengthScale ?? pauseLengthScale,
        accentPhrases: cachedQuery?.accent_phrases ?? accentPhrases
      };
    }
    // Copy a caller-supplied query so the caller's object is never shared with the result.
    const resolvedQuery = effectiveAudioQuery ? { ...effectiveAudioQuery } : await generateQueryWithOverrides();
    const audioData = await playerVoicevoxApi.synthesize(resolvedQuery, speaker);
    const base64Audio = Buffer.from(audioData).toString("base64");
    await writeCachedAudioBase64(cacheKey2, base64Audio);
    return {
      audioBase64: base64Audio,
      text,
      speaker,
      speakerName,
      kana: resolvedQuery.kana,
      audioQuery: resolvedQuery,
      accentPhrases: resolvedQuery.accent_phrases,
      speedScale: resolvedQuery.speedScale,
      intonationScale: resolvedQuery.intonationScale,
      volumeScale: resolvedQuery.volumeScale,
      prePhonemeLength: resolvedQuery.prePhonemeLength,
      postPhonemeLength: resolvedQuery.postPhonemeLength,
      pauseLengthScale: resolvedQuery.pauseLengthScale
    };
  };

  // Player UI resource: serves the bundled HTML; `domain` is attached only when configured.
  registerAppResource(
    server2,
    "VOICEVOX Player",
    playerResourceUri,
    {
      description: "Audio player UI for VOICEVOX TTS",
      mimeType: RESOURCE_MIME_TYPE
    },
    async () => ({
      contents: [
        {
          uri: playerResourceUri,
          mimeType: RESOURCE_MIME_TYPE,
          text: playerHtml,
          _meta: {
            ui: {
              csp: {},
              ...config2.playerDomain ? { domain: config2.playerDomain } : {}
            }
          }
        }
      ]
    })
  );

  // Tool: open the dictionary manager with the current user dictionary contents.
  registerAppToolIfEnabled(
    server2,
    disabledTools,
    "open_dictionary_ui",
    {
      title: "Open Dictionary UI",
      description: "Open the user dictionary manager UI for VOICEVOX.",
      annotations: {
        readOnlyHint: false,
        destructiveHint: false,
        idempotentHint: true,
        openWorldHint: true
      },
      _meta: { ui: { resourceUri: playerResourceUri } }
    },
    async () => {
      try {
        const words = await getUserDictionaryWords();
        const notice = "\u8F9E\u66F8\u5909\u66F4\u306F\u65E2\u5B58\u30C8\u30E9\u30C3\u30AF\u306B\u81EA\u52D5\u53CD\u6620\u3055\u308C\u307E\u305B\u3093\u3002Player\u3067\u518D\u751F\u6210\u3059\u308B\u3068\u53CD\u6620\u3055\u308C\u307E\u3059\u3002";
        return {
          content: [{ type: "text", text: `Dictionary manager opened. ${words.length} word(s).` }],
          structuredContent: {
            mode: "dictionary",
            dictionaryWords: words,
            dictionaryNotice: notice
          },
          _meta: {
            mode: "dictionary",
            dictionaryWords: words,
            dictionaryNotice: notice
          }
        };
      } catch (error) {
        return createErrorResponse(error);
      }
    }
  );

  // Tool: create a player session from raw text. Each call gets a fresh viewUUID;
  // no audio is synthesized here.
  registerAppToolIfEnabled(
    server2,
    disabledTools,
    "speak_player",
    {
      title: "Speak Player",
      description: 'Create a VOICEVOX player session and display the UI. Returns viewUUID \u2014 save it and pass to resynthesize_player / get_player_state for subsequent operations. Multi-speaker format: "1:Hello\\n2:World". Audio synthesis is performed by the player UI when needed.',
      inputSchema: {
        text: z2.string().describe('Text to synthesize. Multi-speaker format: "1:Hello\\n2:World" (speaker ID prefix per line).'),
        speaker: z2.number().optional().describe("Default speaker ID (optional)"),
        speedScale: z2.number().optional().describe("Playback speed (optional, default from environment)")
      },
      annotations: {
        readOnlyHint: false,
        destructiveHint: false,
        idempotentHint: false,
        openWorldHint: true
      },
      _meta: { ui: { resourceUri: playerResourceUri } }
    },
    async ({
      text,
      speaker,
      speedScale
    }, extra) => {
      try {
        if (!text?.trim()) {
          throw new Error("text is required");
        }
        const parsedSegments = parseStringInput(text);
        if (parsedSegments.length === 0) {
          throw new Error("Text is empty");
        }
        const effectiveSpeaker = getEffectiveSpeaker(speaker, extra.sessionId) ?? config2.defaultSpeaker;
        const effectiveSpeed = speedScale ?? config2.defaultSpeedScale;
        // A per-line speaker prefix wins over the default speaker.
        const baseSegments = parsedSegments.map((s) => ({
          text: s.text,
          speaker: s.speaker ?? effectiveSpeaker,
          speedScale: effectiveSpeed
        }));
        const speakerNameMap = await resolveSpeakerNames(baseSegments.map((s) => s.speaker));
        const viewUUID = randomUUID2();
        setSessionState(viewUUID, {
          segments: baseSegments.map((s) => ({
            text: s.text,
            speaker: s.speaker,
            speakerName: speakerNameMap.get(s.speaker),
            speedScale: s.speedScale
          })),
          updatedAt: Date.now()
        });
        const fullText = parsedSegments.map((s) => s.text).join(" ");
        const textPreview = fullText.slice(0, 60) + (fullText.length > 60 ? "..." : "");
        // Built separately from the stored state so the response does not share object references with it.
        const uiSegments = baseSegments.map((s) => ({
          text: s.text,
          speaker: s.speaker,
          speakerName: speakerNameMap.get(s.speaker),
          speedScale: s.speedScale
        }));
        return {
          content: [
            {
              type: "text",
              text: `Voicevox Player started. viewUUID: ${viewUUID} \u300C${textPreview}\u300D`
            }
          ],
          structuredContent: {
            viewUUID,
            autoPlay: config2.autoPlay,
            segments: uiSegments
          },
          _meta: {
            viewUUID,
            autoPlay: config2.autoPlay,
            segments: uiSegments
          }
        };
      } catch (error) {
        return createErrorResponse(error);
      }
    }
  );

  // Tool: create a new player session from an edited segment list.
  registerAppToolIfEnabled(
    server2,
    disabledTools,
    "resynthesize_player",
    {
      title: "Resynthesize Player",
      description: "Update player segments for a new player instance (new viewUUID every call). Typical loop: get_player_state (fetch additional pages if hasMore) -> edit segment parameters -> resynthesize_player -> use returned viewUUID for the next loop. Audio synthesis is performed by the player UI when needed.",
      inputSchema: {
        segments: z2.array(
          z2.object({
            text: z2.string().describe("Segment text"),
            speaker: z2.number().optional().describe("Speaker ID"),
            speedScale: z2.number().optional().describe("Playback speed"),
            intonationScale: z2.number().optional().describe("Intonation scale (\u6291\u63DA)"),
            volumeScale: z2.number().optional().describe("Volume scale (\u97F3\u91CF)"),
            prePhonemeLength: z2.number().optional().describe("Pre-phoneme silence in seconds"),
            postPhonemeLength: z2.number().optional().describe("Post-phoneme silence in seconds"),
            pauseLengthScale: z2.number().optional().describe("Pause length scale between phrases (\u9593\u306E\u9577\u3055)"),
            accentPhrases: z2.array(
              z2.object({
                moras: z2.array(
                  z2.object({
                    text: z2.string(),
                    consonant: z2.string().nullable().optional(),
                    consonant_length: z2.number().nullable().optional(),
                    vowel: z2.string(),
                    vowel_length: z2.number(),
                    pitch: z2.number()
                  })
                ),
                accent: z2.number().int(),
                pause_mora: z2.object({
                  text: z2.string(),
                  consonant: z2.string().nullable().optional(),
                  consonant_length: z2.number().nullable().optional(),
                  vowel: z2.string(),
                  vowel_length: z2.number(),
                  pitch: z2.number()
                }).nullable().optional(),
                is_interrogative: z2.boolean().nullable().optional()
              })
            ).optional().describe("Accent phrases")
          })
        ).describe(
          "Full segment list to update. Start from get_player_state.segments, edit needed fields, and send the complete array."
        ),
        // NOTE(review): the text says "default: true" but the handler falls back to config2.autoPlay.
        autoPlay: z2.boolean().optional().describe("Auto-play when loaded (default: true)")
      },
      annotations: {
        readOnlyHint: false,
        destructiveHint: false,
        idempotentHint: false,
        openWorldHint: true
      },
      _meta: { ui: { resourceUri: playerResourceUri } }
    },
    async ({
      segments,
      autoPlay
    }, extra) => {
      try {
        if (!segments || segments.length === 0) {
          throw new Error("segments is required");
        }
        const effectiveDefaultSpeaker = getEffectiveSpeaker(void 0, extra.sessionId) ?? config2.defaultSpeaker;
        const effectiveSpeed = config2.defaultSpeedScale;
        const effectiveAutoPlay = autoPlay ?? config2.autoPlay;
        const viewUUID = randomUUID2();
        // Fill in speaker/speed defaults; all other tuning fields pass through untouched.
        const normalizedSegments = segments.map((seg) => ({
          text: seg.text,
          speaker: seg.speaker ?? effectiveDefaultSpeaker,
          speedScale: seg.speedScale ?? effectiveSpeed,
          intonationScale: seg.intonationScale,
          volumeScale: seg.volumeScale,
          prePhonemeLength: seg.prePhonemeLength,
          postPhonemeLength: seg.postPhonemeLength,
          pauseLengthScale: seg.pauseLengthScale,
          accentPhrases: seg.accentPhrases
        }));
        const speakerNameMap = await resolveSpeakerNames(normalizedSegments.map((seg) => seg.speaker));
        setSessionState(viewUUID, {
          segments: normalizedSegments.map((seg) => ({
            text: seg.text,
            speaker: seg.speaker,
            speakerName: speakerNameMap.get(seg.speaker),
            speedScale: seg.speedScale,
            intonationScale: seg.intonationScale,
            volumeScale: seg.volumeScale,
            prePhonemeLength: seg.prePhonemeLength,
            postPhonemeLength: seg.postPhonemeLength,
            pauseLengthScale: seg.pauseLengthScale,
            accentPhrases: seg.accentPhrases
          })),
          updatedAt: Date.now()
        });
        const uiSegments = normalizedSegments.map((seg) => ({
          text: seg.text,
          speaker: seg.speaker,
          speakerName: speakerNameMap.get(seg.speaker),
          speedScale: seg.speedScale,
          intonationScale: seg.intonationScale,
          volumeScale: seg.volumeScale,
          prePhonemeLength: seg.prePhonemeLength,
          postPhonemeLength: seg.postPhonemeLength,
          pauseLengthScale: seg.pauseLengthScale,
          accentPhrases: seg.accentPhrases
        }));
        return {
          content: [
            {
              type: "text",
              text: `Voicevox Player updated. viewUUID: ${viewUUID} (${segments.length} segment(s))`
            }
          ],
          structuredContent: {
            viewUUID,
            autoPlay: effectiveAutoPlay,
            segments: uiSegments
          },
          _meta: {
            viewUUID,
            autoPlay: effectiveAutoPlay,
            segments: uiSegments
          }
        };
      } catch (error) {
        return createErrorResponse(error);
      }
    }
  );

  // UI-facing tools get a strict by-key state getter (no session/"global" fallback).
  registerPlayerUITools(deps, {
    playerVoicevoxApi,
    playerResourceUri,
    synthesizeWithCache,
    setSessionState,
    getSessionState: (key) => playerSessionState.get(key),
    getSpeakerList
  });

  // Tool: paged, size-capped read of the stored player state.
  registerToolIfEnabled(
    server2,
    disabledTools,
    "get_player_state",
    {
      title: "Get VOICEVOX Player State",
      description: "Returns paged editable player state for AI tuning. Use the latest viewUUID from speak_player/resynthesize_player. If hasMore is true, call again with nextCursor to continue.",
      inputSchema: {
        viewUUID: z2.string().optional().describe("Player instance ID from speak_player/resynthesize_player. Always pass the latest viewUUID."),
        cursor: z2.number().int().min(0).optional().describe("Start index in segments array (default: 0)"),
        limit: z2.number().int().min(1).max(MAX_STATE_PAGE_LIMIT).optional().describe(
          `Max segments per page (default: ${DEFAULT_STATE_PAGE_LIMIT}, max: ${MAX_STATE_PAGE_LIMIT}). Server may return fewer segments when needed.`
        )
      },
      annotations: {
        readOnlyHint: true,
        destructiveHint: false,
        idempotentHint: true,
        openWorldHint: false
      }
    },
    async ({ viewUUID, cursor, limit }, extra) => {
      try {
        const state = getSessionState(viewUUID, extra?.sessionId);
        if (!state) {
          return {
            content: [
              {
                type: "text",
                text: JSON.stringify({
                  segments: [],
                  updatedAt: 0,
                  total: 0,
                  cursor: 0,
                  limit: limit ?? DEFAULT_STATE_PAGE_LIMIT,
                  hasMore: false,
                  nextCursor: null,
                  message: "No player state available. Play something first."
                })
              }
            ]
          };
        }
        const total = state.segments.length;
        const effectiveCursor = Math.min(cursor ?? 0, total);
        const requestedLimit = limit ?? DEFAULT_STATE_PAGE_LIMIT;
        const effectiveLimit = Math.min(requestedLimit, MAX_STATE_PAGE_LIMIT);
        let pageEnd = Math.min(total, effectiveCursor + effectiveLimit);
        let pageSegments = state.segments.slice(effectiveCursor, pageEnd);
        // Reads the current `pageEnd` / `pageSegments`, so it must be re-invoked after shrinking.
        const buildPayload = () => {
          const hasMore = pageEnd < total;
          return {
            segments: pageSegments,
            updatedAt: state.updatedAt,
            total,
            cursor: effectiveCursor,
            limit: effectiveLimit,
            hasMore,
            nextCursor: hasMore ? pageEnd : null
          };
        };
        let payload = buildPayload();
        let payloadText = JSON.stringify(payload);
        // Drop trailing segments one at a time until the serialized page fits the byte budget.
        while (Buffer.byteLength(payloadText, "utf8") > MAX_TOOL_CONTENT_BYTES && pageSegments.length > 0) {
          pageEnd -= 1;
          pageSegments = state.segments.slice(effectiveCursor, pageEnd);
          payload = buildPayload();
          payloadText = JSON.stringify(payload);
        }
        // Even an empty page is over budget (the envelope alone is too large).
        if (Buffer.byteLength(payloadText, "utf8") > MAX_TOOL_CONTENT_BYTES) {
          return {
            content: [
              {
                type: "text",
                text: JSON.stringify({
                  segments: [],
                  updatedAt: state.updatedAt,
                  total,
                  cursor: effectiveCursor,
                  limit: effectiveLimit,
                  hasMore: effectiveCursor < total,
                  nextCursor: effectiveCursor < total ? effectiveCursor : null,
                  message: "Player state is too large for this request. Request a later cursor or reduce source text size."
                })
              }
            ]
          };
        }
        // The single segment at the cursor does not fit on its own.
        // NOTE(review): nextCursor equals the current cursor here, so a client that
        // simply follows nextCursor will receive this same response again - confirm intended.
        if (pageSegments.length === 0 && effectiveCursor < total) {
          return {
            content: [
              {
                type: "text",
                text: JSON.stringify({
                  segments: [],
                  updatedAt: state.updatedAt,
                  total,
                  cursor: effectiveCursor,
                  limit: effectiveLimit,
                  hasMore: true,
                  nextCursor: effectiveCursor,
                  message: "Current segment is too large to include. Advance cursor or reduce segment text size."
                })
              }
            ]
          };
        }
        return {
          content: [{ type: "text", text: payloadText }]
        };
      } catch (error) {
        return createErrorResponse(error);
      }
    }
  );
}
|
|
4977
|
+
|
|
4978
|
+
// src/tools/speak.ts
|
|
4979
|
+
import * as z3 from "zod/v4";
|
|
4980
|
+
/**
 * Builds the input schema for the `speak` tool.
 *
 * `text`, `query`, `speaker` and `speedScale` are always present. The boolean
 * playback flags (`immediate`, `waitForStart`, `waitForEnd`) are each added
 * only when the same-named entry in `restrictions` is falsy.
 *
 * @param {object} restrictions - Flags keyed by option name; a truthy value removes that option from the schema.
 * @returns {object} Schema object keyed by parameter name.
 */
function buildSpeakInputSchema(restrictions) {
  const schema = {
    text: z3.string().describe(
      'Text split by line breaks (\\n). IMPORTANT: Each line = one speech unit (processed and played separately). Keep the FIRST LINE SHORT for quick playback start - audio begins as soon as the first line is synthesized. Example: "Hi!\\nThis is a longer explanation that follows." Optional speaker prefix per line: "1:Hello\\n2:World"'
    ),
    query: z3.string().optional().describe("Voice synthesis query"),
    speaker: z3.number().optional().describe("Default speaker ID (optional)"),
    speedScale: z3.number().optional().describe("Playback speed (optional, default from environment)")
  };

  // Optional playback flags, in the order they are appended to the schema.
  const restrictableFlags = [
    [
      "immediate",
      "If true, stops current playback and plays new audio immediately. If false, waits for current playback to finish. Default depends on environment variable."
    ],
    ["waitForStart", "Wait for playback to start (optional, default: false)"],
    ["waitForEnd", "Wait for playback to end (optional, default: false)"]
  ];
  for (const [flagName, flagDescription] of restrictableFlags) {
    if (restrictions[flagName]) continue;
    schema[flagName] = z3.boolean().optional().describe(flagDescription);
  }
  return schema;
}
|
|
5002
|
+
function registerSpeakTool(deps) {
|
|
5003
|
+
const { server: server2, voicevoxClient, config: config2, disabledTools, restrictions } = deps;
|
|
5004
|
+
registerToolIfEnabled(
|
|
5005
|
+
server2,
|
|
5006
|
+
disabledTools,
|
|
5007
|
+
"speak",
|
|
5008
|
+
{
|
|
5009
|
+
title: "Speak",
|
|
5010
|
+
description: "Convert text to speech and play it. Text is split by line breaks (\\n) into separate speech units. Each line is processed as an independent audio segment.",
|
|
5011
|
+
inputSchema: buildSpeakInputSchema(restrictions),
|
|
5012
|
+
annotations: {
|
|
5013
|
+
readOnlyHint: false,
|
|
5014
|
+
destructiveHint: false,
|
|
5015
|
+
idempotentHint: false,
|
|
5016
|
+
openWorldHint: true
|
|
5017
|
+
}
|
|
5018
|
+
},
|
|
5019
|
+
async ({
|
|
5020
|
+
text,
|
|
5021
|
+
speaker,
|
|
5022
|
+
query,
|
|
3546
5023
|
speedScale,
|
|
3547
5024
|
immediate,
|
|
3548
5025
|
waitForStart,
|
|
@@ -3580,7 +5057,7 @@ function registerSpeakerTools(deps) {
|
|
|
3580
5057
|
registerToolIfEnabled(
|
|
3581
5058
|
server2,
|
|
3582
5059
|
disabledTools,
|
|
3583
|
-
"
|
|
5060
|
+
"ping",
|
|
3584
5061
|
{
|
|
3585
5062
|
title: "Ping VOICEVOX",
|
|
3586
5063
|
description: "Check if VOICEVOX Engine is running and reachable",
|
|
@@ -3622,7 +5099,7 @@ function registerSpeakerTools(deps) {
|
|
|
3622
5099
|
async () => {
|
|
3623
5100
|
try {
|
|
3624
5101
|
await voicevoxClient.clearQueue();
|
|
3625
|
-
return createSuccessResponse("
|
|
5102
|
+
return createSuccessResponse("Speaker stopped successfully");
|
|
3626
5103
|
} catch (error) {
|
|
3627
5104
|
return createErrorResponse(error);
|
|
3628
5105
|
}
|
|
@@ -3661,7 +5138,7 @@ function registerSpeakerTools(deps) {
|
|
|
3661
5138
|
}
|
|
3662
5139
|
|
|
3663
5140
|
// src/tools/synthesize.ts
|
|
3664
|
-
import * as
|
|
5141
|
+
import * as z4 from "zod/v4";
|
|
3665
5142
|
function registerSynthesizeTool(deps) {
|
|
3666
5143
|
const { server: server2, voicevoxClient, disabledTools } = deps;
|
|
3667
5144
|
registerToolIfEnabled(
|
|
@@ -3678,11 +5155,11 @@ function registerSynthesizeTool(deps) {
|
|
|
3678
5155
|
openWorldHint: true
|
|
3679
5156
|
},
|
|
3680
5157
|
inputSchema: {
|
|
3681
|
-
text:
|
|
3682
|
-
query:
|
|
3683
|
-
output:
|
|
3684
|
-
speaker:
|
|
3685
|
-
speedScale:
|
|
5158
|
+
text: z4.string().optional().describe("Text for voice synthesis (if both query and text provided, query takes precedence)"),
|
|
5159
|
+
query: z4.string().optional().describe("Voice synthesis query"),
|
|
5160
|
+
output: z4.string().describe("Output path for the audio file"),
|
|
5161
|
+
speaker: z4.number().optional().describe("Default speaker ID (optional)"),
|
|
5162
|
+
speedScale: z4.number().optional().describe("Playback speed (optional, default from environment)")
|
|
3686
5163
|
}
|
|
3687
5164
|
},
|
|
3688
5165
|
async ({
|
|
@@ -3703,7 +5180,7 @@ function registerSynthesizeTool(deps) {
|
|
|
3703
5180
|
const filePath = await voicevoxClient.generateAudioFile(text, output, effectiveSpeaker, speedScale);
|
|
3704
5181
|
return createSuccessResponse(filePath);
|
|
3705
5182
|
}
|
|
3706
|
-
throw new Error("query
|
|
5183
|
+
throw new Error('Either "query" or "text" parameter must be specified');
|
|
3707
5184
|
} catch (error) {
|
|
3708
5185
|
return createErrorResponse(error);
|
|
3709
5186
|
}
|
|
@@ -3715,8 +5192,8 @@ function registerSynthesizeTool(deps) {
|
|
|
3715
5192
|
var config = getConfig();
|
|
3716
5193
|
function createServer() {
|
|
3717
5194
|
const server2 = new McpServer({
|
|
3718
|
-
name: "
|
|
3719
|
-
version: "0.
|
|
5195
|
+
name: "mcp-tts-voicevox",
|
|
5196
|
+
version: "0.7.1",
|
|
3720
5197
|
description: "A Voicevox server that converts text to speech for playback and saving."
|
|
3721
5198
|
});
|
|
3722
5199
|
const voicevoxClient = new VoicevoxClient({
|
|
@@ -3745,7 +5222,7 @@ function createServer() {
|
|
|
3745
5222
|
var server = createServer();
|
|
3746
5223
|
|
|
3747
5224
|
// src/index.ts
|
|
3748
|
-
var __dirname2 =
|
|
5225
|
+
var __dirname2 = dirname3(fileURLToPath2(import.meta.url));
|
|
3749
5226
|
function isCLI() {
|
|
3750
5227
|
if (!isNodejs() || !process.argv) return false;
|
|
3751
5228
|
const isNpmStart = process.env?.npm_lifecycle_event === "start";
|
|
@@ -3799,12 +5276,22 @@ Options:
|
|
|
3799
5276
|
|
|
3800
5277
|
Tool Options:
|
|
3801
5278
|
--disable-tools <tools> Comma-separated list of tools to disable
|
|
3802
|
-
(
|
|
3803
|
-
|
|
5279
|
+
(e.g.: speak, speak_player, ping, synthesize_file,
|
|
5280
|
+
stop_speaker, get_speakers)
|
|
5281
|
+
The "voicevox_" prefix is added automatically.
|
|
3804
5282
|
|
|
3805
5283
|
UI Player Options:
|
|
3806
5284
|
--auto-play Auto-play audio in UI player (default)
|
|
3807
5285
|
--no-auto-play Require manual play in UI player
|
|
5286
|
+
--player-export Enable track export(download) in UI player (default)
|
|
5287
|
+
--no-player-export Disable track export(download) in UI player
|
|
5288
|
+
--player-export-dir <dir> Default output directory for exported tracks
|
|
5289
|
+
--player-cache-dir <dir> Player cache directory
|
|
5290
|
+
--player-state-file <path> Persisted player state file path
|
|
5291
|
+
--player-audio-cache Enable disk audio cache for player (default)
|
|
5292
|
+
--no-player-audio-cache Disable disk audio cache for player
|
|
5293
|
+
--player-audio-cache-ttl-days <days> Audio cache retention days (0 disables, -1 unlimited)
|
|
5294
|
+
--player-audio-cache-max-mb <mb> Audio cache size cap in MB (0 disables, -1 unlimited)
|
|
3808
5295
|
|
|
3809
5296
|
Server Options:
|
|
3810
5297
|
--http Enable HTTP server mode (remote MCP)
|
|
@@ -3812,6 +5299,7 @@ Options:
|
|
|
3812
5299
|
--host <host> HTTP server host (default: 0.0.0.0)
|
|
3813
5300
|
--allowed-hosts <hosts> Comma-separated list of allowed hosts (default: localhost,127.0.0.1,[::1])
|
|
3814
5301
|
--allowed-origins <origins> Comma-separated list of allowed origins
|
|
5302
|
+
--api-key <key> Require matching API key via X-API-Key or Authorization: Bearer
|
|
3815
5303
|
|
|
3816
5304
|
Examples:
|
|
3817
5305
|
npx @kajidog/mcp-tts-voicevox --url http://192.168.1.50:50021 --speaker 3
|
|
@@ -3828,7 +5316,7 @@ async function startMCPServer() {
|
|
|
3828
5316
|
process.exit(0);
|
|
3829
5317
|
}
|
|
3830
5318
|
if (process.argv.includes("--version") || process.argv.includes("-v")) {
|
|
3831
|
-
const pkg = JSON.parse(readFileSync2(
|
|
5319
|
+
const pkg = JSON.parse(readFileSync2(join4(__dirname2, "../package.json"), "utf-8"));
|
|
3832
5320
|
console.log(`@kajidog/mcp-tts-voicevox v${pkg.version}`);
|
|
3833
5321
|
process.exit(0);
|
|
3834
5322
|
}
|