voxflow 1.17.0 → 1.17.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +1 -1
- package/lib/commands/asr/index.js +1 -1
- package/lib/commands/card-render.js +122 -10
- package/lib/commands/card.js +1 -1
- package/lib/commands/dub.js +1 -1
- package/lib/commands/explain.js +3 -3
- package/lib/commands/narrate.js +1 -1
- package/lib/commands/picstory.js +3 -3
- package/lib/commands/podcast/index.js +1 -1
- package/lib/commands/present.js +1 -1
- package/lib/commands/publish.js +1 -1
- package/lib/commands/slides/index.js +1 -1
- package/lib/commands/story.js +1 -1
- package/lib/commands/summarize.js +3 -3
- package/lib/commands/translate.js +1 -1
- package/lib/commands/video-translate.js +1 -1
- package/lib/commands/voices.js +2 -2
- package/package.json +1 -1
- package/skills/.claude-plugin/plugin.json +1 -1
- package/skills/card/SKILL.md +28 -12
- package/skills/hub/SKILL.md +3 -2
- package/skills/video/SKILL.md +2 -2
|
@@ -211,7 +211,7 @@ const meta = {
|
|
|
211
211
|
`--mode <type> auto (default) | sentence | flash | file (cloud only)`,
|
|
212
212
|
`--lang <model> Language. Tencent: 16k_zh (default), 16k_en, ... | Azure: ja-JP, en-US, zh-CN, ...`,
|
|
213
213
|
`--format <fmt> Output format: srt (default), txt, json`,
|
|
214
|
-
|
|
214
|
+
`-o, --output <path> Output file path (default: <input>.<format>)`,
|
|
215
215
|
`--speakers Enable speaker diarization (alias of --diarize)`,
|
|
216
216
|
`--diarize Enable speaker diarization (azure)`,
|
|
217
217
|
`--speaker-number <n> Expected number of speakers (with --speakers / --diarize)`,
|
|
@@ -118,12 +118,99 @@ function escapeDrawtext(text) {
|
|
|
118
118
|
.replace(/\n/g, ' ');
|
|
119
119
|
}
|
|
120
120
|
|
|
121
|
+
/**
 * Detect whether a string contains CJK characters that need a CJK fontfile.
 * Covers Han (CJK Unified + Extension A), Hiragana, Katakana, Hangul,
 * CJK punctuation, and halfwidth/fullwidth forms.
 *
 * @param {string|null|undefined} text - Text to inspect; falsy input yields false.
 * @returns {boolean} True when at least one character falls in a covered range.
 */
function containsCjk(text) {
  if (!text) return false;
  // The ranges are spelled with \u escapes instead of literal characters so the
  // range endpoints cannot be lost or re-encoded by editors and text pipelines
  // (a dropped endpoint turns the class into an invalid, out-of-order range).
  //   U+3001–303F  CJK symbols & punctuation (U+3000 IDEOGRAPHIC SPACE is not included)
  //   U+3040–30FF  Hiragana + Katakana
  //   U+3400–9FFF  CJK Ext A + CJK Unified
  //   U+AC00–D7AF  Hangul
  //   U+FF00–FFEF  Halfwidth/Fullwidth forms
  return /[\u3001-\u30FF\u3400-\u9FFF\uAC00-\uD7AF\uFF00-\uFFEF]/.test(text);
}
|
|
133
|
+
|
|
134
|
+
// Process-lifetime memo for findCjkFontFile():
// undefined = not probed yet, null = no font found, string = path of the font.
let _cjkFontPathCache;

/**
 * Locate a fontfile that supports CJK glyphs on the host platform.
 *
 * ffmpeg's `drawtext` filter, when no `fontfile=` is given, falls back to a
 * built-in default that ships only Latin-1. CJK content rendered without an
 * explicit CJK fontfile shows as `□` tofu boxes (issue #3592).
 *
 * Search order: the `VOXFLOW_CJK_FONT` environment override first (returned
 * exactly as given), then a fixed list of well-known font locations for the
 * current `process.platform`. A path only counts when it resolves to a regular
 * file, so an override that points at a directory or a missing path is skipped
 * and autodetection continues. The result (including "not found") is cached
 * for the process lifetime.
 *
 * @returns {string|null} Path of the first usable font, or null if none exists.
 */
function findCjkFontFile() {
  if (_cjkFontPathCache !== undefined) return _cjkFontPathCache;

  // existsSync() alone would also accept directories, which ffmpeg cannot load
  // as a font; stat the path and require a regular file instead.
  const isRegularFile = (candidate) => {
    try {
      return fs.statSync(candidate).isFile();
    } catch {
      return false; // missing path, permission error, etc. — treat as unusable
    }
  };

  const fontsByPlatform = {
    darwin: [
      '/System/Library/Fonts/PingFang.ttc',
      '/System/Library/Fonts/Hiragino Sans GB.ttc',
      '/System/Library/Fonts/STHeiti Medium.ttc',
      '/System/Library/Fonts/STHeiti Light.ttc',
      '/Library/Fonts/Songti.ttc',
    ],
    linux: [
      '/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc',
      '/usr/share/fonts/opentype/noto/NotoSansCJK.ttc',
      '/usr/share/fonts/truetype/noto/NotoSansCJK-Regular.ttc',
      '/usr/share/fonts/wqy-microhei/wqy-microhei.ttc',
      '/usr/share/fonts/wqy-zenhei/wqy-zenhei.ttc',
      '/usr/share/fonts/google-noto-cjk/NotoSansCJK-Regular.ttc',
    ],
    win32: [
      'C:/Windows/Fonts/msyh.ttc',
      'C:/Windows/Fonts/msyhbd.ttc',
      'C:/Windows/Fonts/simsun.ttc',
      'C:/Windows/Fonts/yugothic.ttf',
    ],
  };
  const platformFonts = Object.hasOwn(fontsByPlatform, process.platform)
    ? fontsByPlatform[process.platform]
    : [];

  // User override wins over platform autodetect.
  const override = process.env.VOXFLOW_CJK_FONT;
  const searchOrder = override ? [override, ...platformFonts] : platformFonts;

  _cjkFontPathCache = searchOrder.find(isRegularFile) ?? null;
  return _cjkFontPathCache;
}
|
|
193
|
+
|
|
194
|
+
/**
 * Produce the optional `fontfile=` fragment for an ffmpeg drawtext filter.
 *
 * The fragment is only emitted when there is text, a font path was supplied,
 * and the text actually contains CJK characters; in every other case the
 * empty string is returned so the caller can splice the result in
 * unconditionally.
 *
 * @param {string|null|undefined} text - Overlay text that will be drawn.
 * @param {string|null|undefined} cjkFontPath - Path of a CJK-capable font, if any.
 * @returns {string} Either `:fontfile='<path>'` (leading colon included) or `''`.
 */
function drawtextFontfileClause(text, cjkFontPath) {
  const needsCjkFont = Boolean(text) && Boolean(cjkFontPath) && containsCjk(text);
  if (!needsCjkFont) {
    return '';
  }

  // Backslashes are normalised to forward slashes, then every colon is
  // backslash-escaped because drawtext options are colon-separated.
  // NOTE(review): a single quote inside the path is not escaped here — confirm
  // whether font paths containing `'` need to be supported.
  const forwardSlashed = cjkFontPath.replace(/\\/g, '/');
  const filterSafePath = forwardSlashed.replace(/:/g, '\\:');

  return `:fontfile='${filterSafePath}'`;
}
|
|
207
|
+
|
|
121
208
|
// ── Render functions ──────────────────────────────────────────────────────────
|
|
122
209
|
|
|
123
210
|
/**
|
|
124
211
|
* Render a single card: PNG + optional WAV → MP4 clip with subtitle overlay.
|
|
125
212
|
*/
|
|
126
|
-
async function renderCardClip({ pngPath, wavPath, outPath, durationMs, ratio, subtitle, hasDrawtext = false }) {
|
|
213
|
+
async function renderCardClip({ pngPath, wavPath, outPath, durationMs, ratio, subtitle, hasDrawtext = false, cjkFontPath = null }) {
|
|
127
214
|
const { w, h } = RATIO_DIMS[ratio] || RATIO_DIMS['9:16'];
|
|
128
215
|
const durationSec = Math.max(3, durationMs / 1000);
|
|
129
216
|
|
|
@@ -139,8 +226,9 @@ async function renderCardClip({ pngPath, wavPath, outPath, durationMs, ratio, su
|
|
|
139
226
|
const escaped = escapeDrawtext(subtitle);
|
|
140
227
|
const fontSize = Math.round(SUB_FONT_SIZE * (w / 1080));
|
|
141
228
|
const boxY = h - SUB_MARGIN_BOTTOM - fontSize - SUB_PADDING * 2;
|
|
229
|
+
const fontfile = drawtextFontfileClause(subtitle, cjkFontPath);
|
|
142
230
|
vfParts.push(
|
|
143
|
-
`drawtext=text='${escaped}':fontsize=${fontSize}:fontcolor=white:` +
|
|
231
|
+
`drawtext=text='${escaped}'${fontfile}:fontsize=${fontSize}:fontcolor=white:` +
|
|
144
232
|
`x=(w-text_w)/2:y=${boxY + SUB_PADDING}:` +
|
|
145
233
|
`box=1:boxcolor=black@${SUB_BOX_OPACITY}:boxborderw=${SUB_PADDING}`,
|
|
146
234
|
);
|
|
@@ -177,7 +265,7 @@ async function renderCardClip({ pngPath, wavPath, outPath, durationMs, ratio, su
|
|
|
177
265
|
* Generate an intro or outro card from an FFmpeg solid-color source, with optional title/subtitle text.
|
|
178
266
|
* Text overlay requires drawtext (libfreetype); if unavailable, renders a plain color card.
|
|
179
267
|
*/
|
|
180
|
-
async function renderTitleCard({ outPath, ratio, title, subtitle, durationSec = 3, bgColor = '1a1520', textColor = 'f4efe6', fadeSeconds = 0.4, isFirst = false, isLast = false, hasDrawtext = false }) {
|
|
268
|
+
async function renderTitleCard({ outPath, ratio, title, subtitle, durationSec = 3, bgColor = '1a1520', textColor = 'f4efe6', fadeSeconds = 0.4, isFirst = false, isLast = false, hasDrawtext = false, cjkFontPath = null }) {
|
|
181
269
|
const { w, h } = RATIO_DIMS[ratio] || RATIO_DIMS['9:16'];
|
|
182
270
|
const fd = fadeSeconds;
|
|
183
271
|
|
|
@@ -185,19 +273,21 @@ async function renderTitleCard({ outPath, ratio, title, subtitle, durationSec =
|
|
|
185
273
|
const subSize = Math.round(32 * (w / 1080));
|
|
186
274
|
const escapedTitle = escapeDrawtext(title || '');
|
|
187
275
|
const escapedSub = escapeDrawtext(subtitle || '');
|
|
276
|
+
const titleFontfile = drawtextFontfileClause(title, cjkFontPath);
|
|
277
|
+
const subFontfile = drawtextFontfileClause(subtitle, cjkFontPath);
|
|
188
278
|
|
|
189
279
|
const vfParts = [`color=c=0x${bgColor}:s=${w}x${h}:d=${durationSec}:r=30`];
|
|
190
280
|
|
|
191
281
|
if (hasDrawtext) {
|
|
192
282
|
if (escapedTitle) {
|
|
193
283
|
vfParts.push(
|
|
194
|
-
`drawtext=text='${escapedTitle}':fontsize=${titleSize}:fontcolor=0x${textColor}:` +
|
|
284
|
+
`drawtext=text='${escapedTitle}'${titleFontfile}:fontsize=${titleSize}:fontcolor=0x${textColor}:` +
|
|
195
285
|
`x=(w-text_w)/2:y=(h-text_h)/2-${Math.round(subSize * 1.5)}`,
|
|
196
286
|
);
|
|
197
287
|
}
|
|
198
288
|
if (escapedSub) {
|
|
199
289
|
vfParts.push(
|
|
200
|
-
`drawtext=text='${escapedSub}':fontsize=${subSize}:fontcolor=0x${textColor}@0.6:` +
|
|
290
|
+
`drawtext=text='${escapedSub}'${subFontfile}:fontsize=${subSize}:fontcolor=0x${textColor}@0.6:` +
|
|
201
291
|
`x=(w-text_w)/2:y=(h-text_h)/2+${Math.round(titleSize * 0.8)}`,
|
|
202
292
|
);
|
|
203
293
|
}
|
|
@@ -252,7 +342,7 @@ async function cardRender(opts) {
|
|
|
252
342
|
introDuration = 2.5,
|
|
253
343
|
outroDuration = 2,
|
|
254
344
|
} = opts;
|
|
255
|
-
const voice = opts.voice || 'female-
|
|
345
|
+
const voice = opts.voice || 'v-female-R2s4N9qJ';
|
|
256
346
|
const speed = Number(opts.speed) || 1.0;
|
|
257
347
|
|
|
258
348
|
const deck = readDeckJson(dir);
|
|
@@ -283,6 +373,7 @@ async function cardRender(opts) {
|
|
|
283
373
|
|
|
284
374
|
// Check drawtext filter availability (needs libfreetype)
|
|
285
375
|
let hasDrawtext = false;
|
|
376
|
+
let cjkFontPath = null;
|
|
286
377
|
if (!noSubtitle) {
|
|
287
378
|
try {
|
|
288
379
|
const { stdout } = await runCommand('ffmpeg', ['-hide_banner', '-filters']);
|
|
@@ -290,6 +381,24 @@ async function cardRender(opts) {
|
|
|
290
381
|
} catch { /* unavailable */ }
|
|
291
382
|
if (!hasDrawtext) {
|
|
292
383
|
console.log(` (drawtext unavailable — subtitles disabled)`);
|
|
384
|
+
} else {
|
|
385
|
+
// Detect CJK content in titles/narrations and locate a CJK fontfile if needed.
|
|
386
|
+
// ffmpeg's default drawtext font is Latin-1 only; without an explicit fontfile
|
|
387
|
+
// CJK characters render as `□` tofu boxes (#3592).
|
|
388
|
+
const allText = [
|
|
389
|
+
deck.meta?.title || '',
|
|
390
|
+
...cards.flatMap((c) => [c.title || '', c.narration || '']),
|
|
391
|
+
].join('\n');
|
|
392
|
+
if (containsCjk(allText)) {
|
|
393
|
+
cjkFontPath = findCjkFontFile();
|
|
394
|
+
if (cjkFontPath) {
|
|
395
|
+
console.log(` (CJK detected — using ${path.basename(cjkFontPath)} for overlay text)`);
|
|
396
|
+
} else {
|
|
397
|
+
console.log(` (CJK detected but no CJK font found — overlay text will show as □.`);
|
|
398
|
+
console.log(` Install Noto Sans CJK or set VOXFLOW_CJK_FONT=/path/to/font.ttc,`);
|
|
399
|
+
console.log(` or rerun with --no-subtitle --no-intro --no-outro.)`);
|
|
400
|
+
}
|
|
401
|
+
}
|
|
293
402
|
}
|
|
294
403
|
}
|
|
295
404
|
|
|
@@ -313,7 +422,7 @@ async function cardRender(opts) {
|
|
|
313
422
|
outPath: introPath, ratio, title,
|
|
314
423
|
subtitle: deck.meta?.language === 'zh' ? '知识卡片' : 'Card Series',
|
|
315
424
|
durationSec: introDuration, fadeSeconds: 0,
|
|
316
|
-
isFirst: true, isLast: false, hasDrawtext,
|
|
425
|
+
isFirst: true, isLast: false, hasDrawtext, cjkFontPath,
|
|
317
426
|
});
|
|
318
427
|
clipPaths.push(introPath);
|
|
319
428
|
}
|
|
@@ -354,7 +463,7 @@ async function cardRender(opts) {
|
|
|
354
463
|
console.log(` Rendering card ${i + 1}/${cards.length}...`);
|
|
355
464
|
await renderCardClip({
|
|
356
465
|
pngPath, wavPath, outPath: clipOut,
|
|
357
|
-
durationMs, ratio, hasDrawtext,
|
|
466
|
+
durationMs, ratio, hasDrawtext, cjkFontPath,
|
|
358
467
|
subtitle: noSubtitle ? null : (card.narration || card.title || null),
|
|
359
468
|
});
|
|
360
469
|
clipPaths.push(clipOut);
|
|
@@ -370,7 +479,7 @@ async function cardRender(opts) {
|
|
|
370
479
|
subtitle: 'voxflow.studio',
|
|
371
480
|
durationSec: outroDuration, fadeSeconds: 0,
|
|
372
481
|
bgColor: '0d0b14',
|
|
373
|
-
isFirst: false, isLast: true, hasDrawtext,
|
|
482
|
+
isFirst: false, isLast: true, hasDrawtext, cjkFontPath,
|
|
374
483
|
});
|
|
375
484
|
clipPaths.push(outroPath);
|
|
376
485
|
}
|
|
@@ -425,7 +534,7 @@ Options:
|
|
|
425
534
|
<dir> Card output directory (must contain deck.json + exports/*.png)
|
|
426
535
|
|
|
427
536
|
Audio:
|
|
428
|
-
--voice <id> TTS voice ID (default: female-
|
|
537
|
+
--voice <id> TTS voice ID (default: v-female-R2s4N9qJ)
|
|
429
538
|
--speed <n> TTS speed, 0.5-2.0 (default: 1.0)
|
|
430
539
|
--no-audio Silent video — skip TTS synthesis
|
|
431
540
|
|
|
@@ -516,5 +625,8 @@ module.exports = {
|
|
|
516
625
|
renderTitleCard,
|
|
517
626
|
escapeDrawtext,
|
|
518
627
|
writePcmAsWav,
|
|
628
|
+
containsCjk,
|
|
629
|
+
findCjkFontFile,
|
|
630
|
+
drawtextFontfileClause,
|
|
519
631
|
handle,
|
|
520
632
|
};
|
package/lib/commands/card.js
CHANGED
|
@@ -31,7 +31,7 @@ const meta = {
|
|
|
31
31
|
description: 'Card video export: narrated MP4 with subtitles, intro/outro, and BGM from a card-skill output directory',
|
|
32
32
|
options: [
|
|
33
33
|
'render <dir> Render deck.json + PNGs → narrated MP4 video',
|
|
34
|
-
'--voice <id> TTS voice ID (default: female-
|
|
34
|
+
'--voice <id> TTS voice ID (default: v-female-R2s4N9qJ)',
|
|
35
35
|
'--speed <n> TTS speed, 0.5-2.0 (default: 1.0)',
|
|
36
36
|
'--no-audio Silent video — skip TTS synthesis',
|
|
37
37
|
'--pause <sec> Silence after narration (reading time, default: 2.5)',
|
package/lib/commands/dub.js
CHANGED
|
@@ -527,7 +527,7 @@ const meta = {
|
|
|
527
527
|
`--bgm <file> Background music file to mix in`,
|
|
528
528
|
`--ducking <n> BGM volume ducking 0-1.0 (default: ${DUB_DEFAULTS.ducking})`,
|
|
529
529
|
`--patch <id> Re-synthesize a single caption by ID (patch mode)`,
|
|
530
|
-
|
|
530
|
+
`-o, --output <path> Output file path (default: ./dub-<timestamp>.wav)`,
|
|
531
531
|
],
|
|
532
532
|
examples: [
|
|
533
533
|
'voxflow dub --srt subtitles.srt',
|
package/lib/commands/explain.js
CHANGED
|
@@ -476,7 +476,7 @@ async function handle(args) {
|
|
|
476
476
|
topic: parseFlag(args, '--topic') || undefined,
|
|
477
477
|
voice: parseFlag(args, '--voice') || undefined,
|
|
478
478
|
style: style || undefined,
|
|
479
|
-
language: parseFlag(args, '--language') || undefined,
|
|
479
|
+
language: parseFlag(args, '--language', '--lang') || undefined,
|
|
480
480
|
output, speed, scenes,
|
|
481
481
|
audioOnly: parseBoolFlag(args, '--audio-only'),
|
|
482
482
|
cloud: parseBoolFlag(args, '--cloud'),
|
|
@@ -492,13 +492,13 @@ const meta = {
|
|
|
492
492
|
options: [
|
|
493
493
|
`--topic <text> Topic to explain (use "demo" for built-in demo)`,
|
|
494
494
|
`--style <style> Visual style: modern (default), playful, corporate, chalkboard`,
|
|
495
|
-
`--language <code> Script language: en (default), zh, ja, ko, etc
|
|
495
|
+
`--language <code> Script language: en (default), zh, ja, ko, etc. (alias: --lang)`,
|
|
496
496
|
`--voice <id> TTS voice ID (default: ${EXPLAIN_DEFAULTS.voice})`,
|
|
497
497
|
`--speed <n> TTS speed 0.5-2.0 (default: ${EXPLAIN_DEFAULTS.speed})`,
|
|
498
498
|
`--scenes <n> Number of scenes, 3-12 (default: ${EXPLAIN_DEFAULTS.sceneCount})`,
|
|
499
499
|
`--audio-only Skip video render, output WAV narration only`,
|
|
500
500
|
`--cloud Render on cloud instead of local Remotion`,
|
|
501
|
-
|
|
501
|
+
`-o, --output <path> Output file path (default: ./explain-<timestamp>.mp4)`,
|
|
502
502
|
],
|
|
503
503
|
examples: [
|
|
504
504
|
'voxflow explain --topic "What is React?"',
|
package/lib/commands/narrate.js
CHANGED
|
@@ -612,7 +612,7 @@ const meta = {
|
|
|
612
612
|
`--format <fmt> Output format: pcm, wav, mp3 (default: pcm → WAV)`,
|
|
613
613
|
`--speed <n> TTS speed 0.5-2.0 (default: ${NARRATE_DEFAULTS.speed})`,
|
|
614
614
|
`--silence <sec> Silence between segments, 0-5.0 (default: ${NARRATE_DEFAULTS.silence})`,
|
|
615
|
-
|
|
615
|
+
`-o, --output <path> Output file path (default: matches input basename, e.g. design.md → design.wav)`,
|
|
616
616
|
],
|
|
617
617
|
examples: [
|
|
618
618
|
'voxflow narrate --input article.txt --voice v-female-R2s4N9qJ',
|
package/lib/commands/picstory.js
CHANGED
|
@@ -487,7 +487,7 @@ async function handle(args) {
|
|
|
487
487
|
outputDir: outputDir || undefined,
|
|
488
488
|
style: style || undefined,
|
|
489
489
|
ratio: ratio || undefined,
|
|
490
|
-
language: parseFlag(args, '--language') || undefined,
|
|
490
|
+
language: parseFlag(args, '--language', '--lang') || undefined,
|
|
491
491
|
sceneCount: scenes,
|
|
492
492
|
quality: quality || undefined,
|
|
493
493
|
fadeSeconds: fadeSeconds !== undefined ? fadeSeconds : undefined,
|
|
@@ -511,7 +511,7 @@ const meta = {
|
|
|
511
511
|
`--text <content> Input text content to visualize`,
|
|
512
512
|
`--style <name> Visual style: sketchnote (default), neon_noir, minimal_3d, chalkboard, photo, manga_panel, vintage_newspaper`,
|
|
513
513
|
`--ratio <name> Aspect ratio: portrait (default, 9:16), landscape (16:9), square (1:1)`,
|
|
514
|
-
`--language <code> Script language: zh (default), en, ja, etc
|
|
514
|
+
`--language <code> Script language: zh (default), en, ja, etc. (alias: --lang)`,
|
|
515
515
|
`--scenes <n> Number of scenes, 2-10 (default: ${PICSTORY_DEFAULTS.sceneCount})`,
|
|
516
516
|
`--quality <tier> Image quality: fast (default), hd, ultra (gpt-5.4-image-2, best quality, ~16× cost), hd-aiberm / fast-aiberm (Aiberm Gemini — strongest Chinese text rendering)`,
|
|
517
517
|
`--voice <id> TTS voice ID`,
|
|
@@ -522,7 +522,7 @@ const meta = {
|
|
|
522
522
|
`--fade <n> Scene fade-in/out duration in seconds (default: ${PICSTORY_DEFAULTS.fadeSeconds}, set 0 to disable)`,
|
|
523
523
|
`--image-only Save images+audio without rendering video`,
|
|
524
524
|
`--output-dir <dir> Directory for all output files (auto-created if needed)`,
|
|
525
|
-
|
|
525
|
+
`-o, --output <path> Output file path (overrides --output-dir)`,
|
|
526
526
|
],
|
|
527
527
|
examples: [
|
|
528
528
|
'voxflow picstory --topic "AI Agent 入门" --style sketchnote',
|
|
@@ -534,7 +534,7 @@ const meta = {
|
|
|
534
534
|
`--voice <id> Override TTS voice for all speakers`,
|
|
535
535
|
`--bgm <file> Background music file to mix in`,
|
|
536
536
|
`--ducking <n> BGM volume ducking 0-1.0 (default: ${PODCAST_DEFAULTS.ducking})`,
|
|
537
|
-
|
|
537
|
+
`-o, --output <path> Output WAV path (default: ./podcast-<timestamp>.wav)`,
|
|
538
538
|
`--speed <n> TTS speed 0.5-2.0 (default: ${PODCAST_DEFAULTS.speed})`,
|
|
539
539
|
`--silence <sec> Uniform silence override between segments, 0-5.0 (legacy)`,
|
|
540
540
|
`--pace <preset> Pacing preset: tight | natural | relaxed (default: natural).`,
|
package/lib/commands/present.js
CHANGED
|
@@ -495,7 +495,7 @@ const meta = {
|
|
|
495
495
|
`--speed <n> TTS speed 0.5-2.0 (default: ${PRESENT_DEFAULTS.speed})`,
|
|
496
496
|
`--no-audio Skip TTS, render silent video only`,
|
|
497
497
|
`--web-search Search the web for up-to-date info on the topic`,
|
|
498
|
-
|
|
498
|
+
`-o, --output <path> Output file path (default: ./present-<timestamp>.mp4)`,
|
|
499
499
|
],
|
|
500
500
|
examples: [
|
|
501
501
|
'voxflow present --text "Claude Code 是一个 AI 编程工具" --style aurora',
|
package/lib/commands/publish.js
CHANGED
|
@@ -395,7 +395,7 @@ const meta = {
|
|
|
395
395
|
'--audio <file> Mode C: merge existing audio into video',
|
|
396
396
|
'--voice <id> TTS voice for Mode A/B',
|
|
397
397
|
'--voices <file> Multi-speaker voice map for Mode A/B',
|
|
398
|
-
'--output <path>
|
|
398
|
+
'-o, --output <path> Final MP4 output path',
|
|
399
399
|
'--publish <target> local (default) | webhook | none',
|
|
400
400
|
'--publish-dir <dir> Local publish directory (for --publish local)',
|
|
401
401
|
'--publish-webhook <url> Webhook URL (for --publish webhook)',
|
|
@@ -331,7 +331,7 @@ const meta = {
|
|
|
331
331
|
`--template <name> Template: product, report, tutorial, pitch, free (default: ${SLIDES_DEFAULTS.template})`,
|
|
332
332
|
`--model <id> Model: swift, balanced, pro, creative (default: ${SLIDES_DEFAULTS.model})`,
|
|
333
333
|
`--no-audio Skip TTS synthesis, generate slides only`,
|
|
334
|
-
|
|
334
|
+
`-o, --output <path> Output HTML file (default: ./slides-<timestamp>.html)`,
|
|
335
335
|
],
|
|
336
336
|
examples: [
|
|
337
337
|
'voxflow slides "AI in Healthcare"',
|
package/lib/commands/story.js
CHANGED
|
@@ -285,7 +285,7 @@ const meta = {
|
|
|
285
285
|
options: [
|
|
286
286
|
`--topic <text> Story topic (default: children's story)`,
|
|
287
287
|
`--voice <id> TTS voice ID (default: ${STORY_DEFAULTS.voice})`,
|
|
288
|
-
|
|
288
|
+
`-o, --output <path> Output WAV path (default: ./story-<timestamp>.wav)`,
|
|
289
289
|
`--paragraphs <n> Paragraph count, 1-20 (default: ${STORY_DEFAULTS.paragraphs})`,
|
|
290
290
|
`--speed <n> TTS speed 0.5-2.0 (default: ${STORY_DEFAULTS.speed})`,
|
|
291
291
|
`--silence <sec> Silence between paragraphs, 0-5.0 (default: ${STORY_DEFAULTS.silence})`,
|
|
@@ -462,7 +462,7 @@ async function handle(args) {
|
|
|
462
462
|
const voice = parseFlag(args, '--voice') || SUM_DEFS.voice;
|
|
463
463
|
const speed = parseFloatFlag(args, '--speed') ?? SUM_DEFS.speed;
|
|
464
464
|
const slideCount = parseIntFlag(args, '--slides') ?? SUM_DEFS.slides;
|
|
465
|
-
const language = parseFlag(args, '--lang') || SUM_DEFS.language;
|
|
465
|
+
const language = parseFlag(args, '--lang', '--language') || SUM_DEFS.language;
|
|
466
466
|
const engine = parseFlag(args, '--engine') || SUM_DEFS.engine;
|
|
467
467
|
const model = parseFlag(args, '--model');
|
|
468
468
|
const tts = parseBoolFlag(args, '--tts');
|
|
@@ -509,7 +509,7 @@ const meta = {
|
|
|
509
509
|
`--input <file> Input video/audio file → ASR + summarize`,
|
|
510
510
|
`--text <text> Direct text input (skip ASR)`,
|
|
511
511
|
`--slides <n> Number of slides, 4-12 (default: ${SUMMARIZE_DEFAULTS.slides})`,
|
|
512
|
-
`--lang <code> Output language: en, zh, ja, etc. (default: ${SUMMARIZE_DEFAULTS.language})`,
|
|
512
|
+
`--lang <code> Output language: en, zh, ja, etc. (default: ${SUMMARIZE_DEFAULTS.language}) (alias: --language)`,
|
|
513
513
|
`--engine <engine> ASR engine: auto, local, cloud (default: ${SUMMARIZE_DEFAULTS.engine})`,
|
|
514
514
|
`--model <model> Whisper model for local ASR: tiny, base, small, medium, large`,
|
|
515
515
|
`--tts Generate TTS narration audio for each slide`,
|
|
@@ -517,7 +517,7 @@ const meta = {
|
|
|
517
517
|
`--scheme <name> Video visual scheme: noir, neon, editorial, aurora (default), brutalist`,
|
|
518
518
|
`--voice <id> TTS voice ID (default: ${SUMMARIZE_DEFAULTS.voice})`,
|
|
519
519
|
`--speed <n> TTS speed 0.5-2.0 (default: ${SUMMARIZE_DEFAULTS.speed})`,
|
|
520
|
-
|
|
520
|
+
`-o, --output <path> Output PPTX path (default: <input>-summary.pptx)`,
|
|
521
521
|
],
|
|
522
522
|
examples: [
|
|
523
523
|
'voxflow summarize --input lecture.mp4',
|
|
@@ -566,7 +566,7 @@ const meta = {
|
|
|
566
566
|
`--input <file> Text file (.txt, .md) to translate`,
|
|
567
567
|
`--from <lang> Source language code (default: auto-detect)`,
|
|
568
568
|
`--to <lang> Target language code (required)`,
|
|
569
|
-
|
|
569
|
+
`-o, --output <path> Output file path (default: <input>-<lang>.<ext>)`,
|
|
570
570
|
`--realign Adjust subtitle timing for target language length`,
|
|
571
571
|
`--batch-size <n> Captions per LLM call, 1-20 (default: ${TRANSLATE_DEFAULTS.batchSize})`,
|
|
572
572
|
],
|
|
@@ -559,7 +559,7 @@ const meta = {
|
|
|
559
559
|
`--speed <n> TTS speed 0.5-2.0 (default: ${require('../core/config').VIDEO_TRANSLATE_DEFAULTS.speed})`,
|
|
560
560
|
`--batch-size <n> Translation batch size, 1-20 (default: ${require('../core/config').VIDEO_TRANSLATE_DEFAULTS.batchSize})`,
|
|
561
561
|
`--keep-intermediates Keep intermediate files (SRT, audio) for debugging`,
|
|
562
|
-
|
|
562
|
+
`-o, --output <path> Output MP4 path (default: <input>-<lang>.mp4)`,
|
|
563
563
|
`--asr-mode <mode> Override ASR mode: auto, sentence, flash, file`,
|
|
564
564
|
`--asr-lang <engine> Override ASR engine: 16k_zh, 16k_en, 16k_ja, 16k_ko, etc.`,
|
|
565
565
|
`--engine <engine> ASR engine: auto, local, cloud (default: auto)`,
|
package/lib/commands/voices.js
CHANGED
|
@@ -250,7 +250,7 @@ async function handle(args) {
|
|
|
250
250
|
api,
|
|
251
251
|
search: parseFlag(args, '--search'),
|
|
252
252
|
gender: parseFlag(args, '--gender'),
|
|
253
|
-
language: parseFlag(args, '--language'),
|
|
253
|
+
language: parseFlag(args, '--language', '--lang'),
|
|
254
254
|
useCase: parseFlag(args, '--use-case'),
|
|
255
255
|
json: parseBoolFlag(args, '--json'),
|
|
256
256
|
extended: parseBoolFlag(args, '--extended'),
|
|
@@ -275,7 +275,7 @@ const meta = {
|
|
|
275
275
|
`--mine List your cloned voices (requires login)`,
|
|
276
276
|
`--search <query> Search by name, tone, style, description`,
|
|
277
277
|
`--gender <m|f> Filter by gender: male/m or female/f`,
|
|
278
|
-
`--language <code> Filter by language: zh, en, etc
|
|
278
|
+
`--language <code> Filter by language: zh, en, etc. (alias: --lang)`,
|
|
279
279
|
`--use-case <tag> Filter by editorial-curated use case (e.g. podcast)`,
|
|
280
280
|
`--extended Include extended voice library (380+ voices)`,
|
|
281
281
|
`--json Output raw JSON instead of table`,
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "voxflow",
|
|
3
|
-
"version": "1.17.
|
|
3
|
+
"version": "1.17.2",
|
|
4
4
|
"description": "AI voice CLI bundled as 6 skills (hub, podcast, transcribe, video, slice, card). Synthesize speech in 200+ voices across 40+ languages, generate multi-speaker AI podcasts, transcribe audio/video with word-level timestamps, dub videos from SRT subtitles, run end-to-end video translation, turn long articles into vertical card video reels via Remotion, and turn text into polished shareable card images or narrated card videos. Backed by a hosted TTS/ASR/LLM/render service with per-user quota (free tier 10K/mo).",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "VoxFlow",
|
package/skills/card/SKILL.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: card
|
|
3
|
-
description: "Use when the user wants to turn text content into a set of polished, shareable visual CARD IMAGES or narrated card VIDEOS — knowledge cards, quote cards, 小红书图文, carousel cards, poster cards — rendered as HTML/CSS and exported via Playwright at ratios like 1:1 / 3:4 / 9:16; optionally produces narrated MP4 video from those cards via `voxflow card render` (
|
|
3
|
+
description: "Use when the user wants to turn text content into a set of polished, shareable visual CARD IMAGES or narrated card VIDEOS — knowledge cards, quote cards, 小红书图文, carousel cards, poster cards — rendered as HTML/CSS and exported via Playwright at ratios like 1:1 / 3:4 / 9:16; optionally produces a narrated MP4 video from those cards via `voxflow card render` (per-card TTS + FFmpeg static-image clips with optional subtitle bar / intro+outro cards / BGM mix). Triggers: card / 卡片 / 知识卡 / 文字卡片 / 金句卡 / 图文卡片 / 卡片生成 / make cards / card video / 卡片视频. For article → Slice-themed card VIDEO use voxflow:slice; for short videos / AI clips use voxflow:video; for podcasts use voxflow:podcast."
|
|
4
4
|
---
|
|
5
5
|
|
|
6
6
|
# VoxFlow Skill — Card
|
|
@@ -374,7 +374,7 @@ Use `references/design-languages.md` to define the card set's visual grammar ind
|
|
|
374
374
|
"meta": {
|
|
375
375
|
"title": "<Series title>",
|
|
376
376
|
"ratio": "<ratio used: 9:16 | 1:1 | 3:4>",
|
|
377
|
-
"language": "<zh | en
|
|
377
|
+
"language": "<zh | en | ja | ...>"
|
|
378
378
|
},
|
|
379
379
|
"cards": [
|
|
380
380
|
{ "file": "card-01.html", "title": "...", "narration": "1-3 sentence spoken caption." },
|
|
@@ -383,6 +383,11 @@ Use `references/design-languages.md` to define the card set's visual grammar ind
|
|
|
383
383
|
}
|
|
384
384
|
```
|
|
385
385
|
|
|
386
|
+
- Field semantics:
|
|
387
|
+
- `meta.title` — drives the intro card text and the default output filename (slugified: `[^a-z0-9一-鿿]` → `-`, lowercased; CJK is preserved).
|
|
388
|
+
- `meta.language` — only `"zh"` switches the intro subtitle to "知识卡片"; any other value (including `"ja"`, `"en"`, `"mixed"`) falls back to "Card Series".
|
|
389
|
+
- `card.title` — used as the on-screen subtitle bar fallback when `card.narration` is empty.
|
|
390
|
+
- `card.narration` — the spoken caption fed to TTS and (by default) also rendered as the subtitle bar text.
|
|
386
391
|
- Narration rules:
|
|
387
392
|
- Write narration in the same language as the card copy.
|
|
388
393
|
- 1-3 sentences per card. Natural spoken rhythm — avoid lists, avoid bullet-speak.
|
|
@@ -405,22 +410,33 @@ Use `references/design-languages.md` to define the card set's visual grammar ind
|
|
|
405
410
|
├── deck.json (narration + metadata)
|
|
406
411
|
├── exports/card-01.png … (PNG exports)
|
|
407
412
|
├── sources.md (attribution)
|
|
408
|
-
└── my-topic.mp4 (final video —
|
|
413
|
+
└── my-topic.mp4 (final video — slug derived from deck.meta.title)
|
|
409
414
|
```
|
|
410
415
|
|
|
411
|
-
- **
|
|
416
|
+
- **Audio / TTS**:
|
|
412
417
|
- `--voice <id>` — TTS voice. Suggest `voxflow voices` to browse.
|
|
413
418
|
- `--speed <n>` — narration speed 0.5-2.0 (default: 1.0)
|
|
414
|
-
- `--
|
|
419
|
+
- `--no-audio` — skip TTS, produce a silent video (zero quota)
|
|
420
|
+
- **Timing**:
|
|
421
|
+
- `--pause <sec>` — silence after each card's narration for reading time (default: 2.5). Baked into the WAV so it always shows in the final clip.
|
|
415
422
|
- `--hold <sec>` — card duration in `--no-audio` mode (default: 5)
|
|
416
|
-
|
|
417
|
-
- `--no-
|
|
418
|
-
- `--
|
|
419
|
-
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
+
- **Structure**:
|
|
424
|
+
- `--no-intro` / `--no-outro` — skip title / branding cards
|
|
425
|
+
- `--intro-dur <sec>` — intro card duration (default: 2.5)
|
|
426
|
+
- `--outro-dur <sec>` — outro card duration (default: 2)
|
|
427
|
+
- **Overlay & mix**:
|
|
428
|
+
- `--no-subtitle` — disable the bottom subtitle bar (subtitles need FFmpeg with `libfreetype`; auto-detected and skipped if missing)
|
|
429
|
+
- `--bgm <path>` — background music, looped at low volume
|
|
430
|
+
- `--bgm-volume <n>` — BGM volume 0-1 (default: 0.08)
|
|
431
|
+
- **Output**:
|
|
432
|
+
- `-o <path>` / `--output <path>` — custom output path (parents auto-created)
|
|
433
|
+
|
|
434
|
+
- **CJK content** (since CLI 1.17.1): subtitles, intro, and outro overlays auto-detect CJK text in `meta.title` / `card.title` / `card.narration` and inject a CJK-capable system fontfile (PingFang / Hiragino / Heiti on macOS; Noto CJK / WQY on Linux; msyh / SimSun on Windows). If your platform has no CJK font installed, set `VOXFLOW_CJK_FONT=/path/to/font.ttc` to point at one explicitly. When neither autodetect nor override finds a font, the command logs a warning and you should fall back to `--no-subtitle --no-intro --no-outro` to avoid `□` tofu boxes.
|
|
435
|
+
|
|
436
|
+
- Default output: `<dir>/<slugified deck.meta.title>.mp4` (next to the cards). If `meta.title` is empty, falls back to `cards.mp4`.
|
|
437
|
+
- No external dependencies beyond FFmpeg (auto-detected; falls back to `ffmpeg-static` npm package when system ffmpeg is missing).
|
|
423
438
|
- Intermediate files (WAVs, clips) stored in `<dir>/.card-render-work/` — auto-cleaned on success, preserved on failure for debugging.
|
|
439
|
+
- **Quota**: ~50 per card narrated (`tts-synthesize`); zero with `--no-audio`. A 5-card deck costs ~250 quota total.
|
|
424
440
|
- For article-to-card VIDEO with Slice themes (paper-slide, editorial-mag, etc.), prefer `voxflow:slice` instead.
|
|
425
441
|
|
|
426
442
|
## Asset and Source Discipline
|
package/skills/hub/SKILL.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: hub
|
|
3
|
-
description: Use when the user wants to read text aloud (TTS), search VoxFlow voices, sample AI stories, or set up VoxFlow install/auth/quota — the entry-point voice toolkit. For podcasts use voxflow:podcast; for short videos / AI clips use voxflow:video; for article-to-card reels (Slice) use voxflow:slice; for transcription / dubbing / subtitle translation use voxflow:transcribe.
|
|
3
|
+
description: Use when the user wants to read text aloud (TTS), search VoxFlow voices, sample AI stories, or set up VoxFlow install/auth/quota — the entry-point voice toolkit. For podcasts use voxflow:podcast; for short videos / AI clips use voxflow:video; for article-to-card reels (Slice) use voxflow:slice; for shareable card images or narrated card videos use voxflow:card; for transcription / dubbing / subtitle translation use voxflow:transcribe.
|
|
4
4
|
---
|
|
5
5
|
|
|
6
6
|
# VoxFlow Skill — Hub
|
|
@@ -19,7 +19,8 @@ For specialized tasks, switch to:
|
|
|
19
19
|
|
|
20
20
|
- **Podcasts** (multi-speaker dialogue) → `voxflow:podcast`
|
|
21
21
|
- **Short videos / AI clips / knowledge cards** (`picstory`, `present`, `slides`, `explain`) → `voxflow:video`
|
|
22
|
-
- **Article → vertical card video (Slice)** —
|
|
22
|
+
- **Article → vertical card video (Slice)** — 13 themes (paper-slide / editorial-mag / bold-poster / notion-card / brutalist / glass-dark / editorial-stencil / broadsheet / blueprint / daisy-pastel / showa-catalog / photo-feature / atmospheric), web app + Remotion → `voxflow:slice`
|
|
23
|
+
- **Shareable card images & narrated card videos** (HTML/CSS + Playwright export, optional `voxflow card render` for narrated MP4) → `voxflow:card`
|
|
23
24
|
- **Transcription, subtitle translation, dubbing, summarize, publish** (`asr`, `asr-jobs`, `translate`, `dub`, `video-translate`, `summarize`, `publish`) → `voxflow:transcribe`
|
|
24
25
|
|
|
25
26
|
## Install & login
|
package/skills/video/SKILL.md
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: video
|
|
3
|
-
description: Use when the user wants AI-generated short-form video — knowledge cards (picstory / 小红书 / TikTok / Reels), narrated explainers, presentations, AI clips, or slides — covering picstory, present, slides, explain, and image generation. For article-to-card reels (Slice —
|
|
3
|
+
description: Use when the user wants AI-generated short-form video — knowledge cards (picstory / 小红书 / TikTok / Reels), narrated explainers, presentations, AI clips, or slides — covering picstory, present, slides, explain, and image generation. For article-to-card reels (Slice — 13 themes including paper-slide), use voxflow:slice. For shareable HTML/CSS card images or narrated card MP4 videos (`voxflow card render`), use voxflow:card.
|
|
4
4
|
---
|
|
5
5
|
|
|
6
6
|
# VoxFlow Video Skill
|
|
7
7
|
|
|
8
8
|
Generate short-form videos with AI: LLM writes the script, AI draws cards or scenes, TTS narrates, FFmpeg / Remotion renders the final MP4.
|
|
9
9
|
|
|
10
|
-
For article-to-card reels (Slice —
|
|
10
|
+
For article-to-card reels (Slice — 13 themes: paper-slide / editorial-mag / bold-poster / notion-card / brutalist / glass-dark / editorial-stencil / broadsheet / blueprint / daisy-pastel / showa-catalog / photo-feature / atmospheric), switch to `voxflow:slice`. For shareable HTML/CSS card image sets or narrated card-to-MP4 export, switch to `voxflow:card`.
|
|
11
11
|
|
|
12
12
|
Five entry points — pick by what the user wants:
|
|
13
13
|
|