@argo-video/cli 0.7.0 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md
CHANGED
|
@@ -20,7 +20,7 @@ Write a demo script with Playwright. Add a voiceover manifest. Run one command.
|
|
|
20
20
|
## Showcase
|
|
21
21
|
|
|
22
22
|
[Watch the demo video](
|
|
23
|
-
https://gist.github.com/user-attachments/assets/
|
|
23
|
+
https://gist.github.com/user-attachments/assets/ba009e90-0310-454b-833e-e0d71d4dd72f)
|
|
24
24
|
|
|
25
25
|
> *This demo was recorded by Argo, using Argo. Yes, really.*
|
|
26
26
|
|
|
@@ -251,6 +251,29 @@ choco install ffmpeg # Windows
|
|
|
251
251
|
|--------|------|---------|---------|
|
|
252
252
|
| `engines.kokoro()` | local | built-in | none |
|
|
253
253
|
| `engines.mlxAudio()` | local | `pip install mlx-audio` | none |
|
|
254
|
+
|
|
255
|
+
**Voice cloning** — Clone your own voice locally with mlx-audio. Record a 15-second clip, and every demo sounds like you — privately, no data leaves your machine:
|
|
256
|
+
|
|
257
|
+
```bash
|
|
258
|
+
# Record a reference clip (macOS)
|
|
259
|
+
./scripts/record-voice-ref.sh assets/ref-voice.wav
|
|
260
|
+
|
|
261
|
+
# Preview cloned voice against your manifest
|
|
262
|
+
./scripts/voice-clone-preview.sh \
|
|
263
|
+
--ref-audio assets/ref-voice.wav \
|
|
264
|
+
--ref-text "Transcript of what I said." \
|
|
265
|
+
--voiceover demos/showcase.voiceover.json --play
|
|
266
|
+
```
|
|
267
|
+
|
|
268
|
+
```js
|
|
269
|
+
tts: {
|
|
270
|
+
engine: engines.mlxAudio({
|
|
271
|
+
model: 'mlx-community/Qwen3-TTS-12Hz-0.6B-Base-bf16',
|
|
272
|
+
refAudio: './assets/ref-voice.wav',
|
|
273
|
+
refText: 'Transcript of what I said in the clip.',
|
|
274
|
+
}),
|
|
275
|
+
}
|
|
276
|
+
```
|
|
254
277
|
| `engines.openai()` | cloud | `npm i openai` | `OPENAI_API_KEY` |
|
|
255
278
|
| `engines.elevenlabs()` | cloud | `npm i elevenlabs` | `ELEVENLABS_API_KEY` |
|
|
256
279
|
| `engines.gemini()` | cloud | `npm i @google/genai` | `GEMINI_API_KEY` |
|
|
@@ -4,10 +4,41 @@ export interface MlxAudioEngineOptions {
|
|
|
4
4
|
baseUrl?: string;
|
|
5
5
|
/** Model ID passed to the server. Default: mlx-community/Spark-TTS-0.5B-bf16 */
|
|
6
6
|
model?: string;
|
|
7
|
+
/** Path to a reference audio WAV file for voice cloning (requires a cloning-capable model). */
|
|
8
|
+
refAudio?: string;
|
|
9
|
+
/** Transcript of the reference audio (required when refAudio is set). */
|
|
10
|
+
refText?: string;
|
|
11
|
+
/** Instruction text for models that support instruct mode (e.g. emotion/style control). */
|
|
12
|
+
instruct?: string;
|
|
13
|
+
/** Gender hint. Default: "male". */
|
|
14
|
+
gender?: string;
|
|
15
|
+
/** Pitch multiplier. Default: 1.0. */
|
|
16
|
+
pitch?: number;
|
|
17
|
+
/** Language code. Default: "a". */
|
|
18
|
+
langCode?: string;
|
|
19
|
+
/** Sampling temperature. Default: 0.7. */
|
|
20
|
+
temperature?: number;
|
|
21
|
+
/** Top-p sampling. Default: 0.95. */
|
|
22
|
+
topP?: number;
|
|
23
|
+
/** Top-k sampling. Default: 40. */
|
|
24
|
+
topK?: number;
|
|
25
|
+
/** Repetition penalty. Default: 1.0. */
|
|
26
|
+
repetitionPenalty?: number;
|
|
27
|
+
/** Response audio format. Default: "mp3". */
|
|
28
|
+
responseFormat?: string;
|
|
29
|
+
/** Enable streaming response. Default: false. */
|
|
30
|
+
stream?: boolean;
|
|
31
|
+
/** Streaming chunk interval in seconds. Default: 2.0. */
|
|
32
|
+
streamingInterval?: number;
|
|
33
|
+
/** Max generation tokens. Default: 1200. */
|
|
34
|
+
maxTokens?: number;
|
|
35
|
+
/** Enable verbose server logging. Default: false. */
|
|
36
|
+
verbose?: boolean;
|
|
7
37
|
}
|
|
8
38
|
export declare class MlxAudioEngine implements TTSEngine {
|
|
9
39
|
private baseUrl;
|
|
10
40
|
private model;
|
|
41
|
+
private serverOptions;
|
|
11
42
|
constructor(options?: MlxAudioEngineOptions);
|
|
12
43
|
generate(text: string, options: TTSEngineOptions): Promise<Buffer>;
|
|
13
44
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"mlx-audio.d.ts","sourceRoot":"","sources":["../../../src/tts/engines/mlx-audio.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,SAAS,EAAE,gBAAgB,EAAE,MAAM,cAAc,CAAC;AAEhE,MAAM,WAAW,qBAAqB;IACpC,2DAA2D;IAC3D,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,gFAAgF;IAChF,KAAK,CAAC,EAAE,MAAM,CAAC;
|
|
1
|
+
{"version":3,"file":"mlx-audio.d.ts","sourceRoot":"","sources":["../../../src/tts/engines/mlx-audio.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,SAAS,EAAE,gBAAgB,EAAE,MAAM,cAAc,CAAC;AAEhE,MAAM,WAAW,qBAAqB;IACpC,2DAA2D;IAC3D,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,gFAAgF;IAChF,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,+FAA+F;IAC/F,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,yEAAyE;IACzE,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,2FAA2F;IAC3F,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,oCAAoC;IACpC,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,sCAAsC;IACtC,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,mCAAmC;IACnC,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,0CAA0C;IAC1C,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,qCAAqC;IACrC,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,mCAAmC;IACnC,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,wCAAwC;IACxC,iBAAiB,CAAC,EAAE,MAAM,CAAC;IAC3B,6CAA6C;IAC7C,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,iDAAiD;IACjD,MAAM,CAAC,EAAE,OAAO,CAAC;IACjB,yDAAyD;IACzD,iBAAiB,CAAC,EAAE,MAAM,CAAC;IAC3B,4CAA4C;IAC5C,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,qDAAqD;IACrD,OAAO,CAAC,EAAE,OAAO,CAAC;CACnB;AAED,qBAAa,cAAe,YAAW,SAAS;IAC9C,OAAO,CAAC,OAAO,CAAS;IACxB,OAAO,CAAC,KAAK,CAAS;IACtB,OAAO,CAAC,aAAa,CAA0B;gBAEnC,OAAO,CAAC,EAAE,qBAAqB;IA2BrC,QAAQ,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,gBAAgB,GAAG,OAAO,CAAC,MAAM,CAAC;CA4CzE"}
|
|
@@ -1,25 +1,67 @@
|
|
|
1
1
|
export class MlxAudioEngine {
|
|
2
2
|
baseUrl;
|
|
3
3
|
model;
|
|
4
|
+
serverOptions;
|
|
4
5
|
constructor(options) {
|
|
5
6
|
this.baseUrl = options?.baseUrl ?? 'http://localhost:8000';
|
|
6
7
|
this.model = options?.model ?? 'mlx-community/Spark-TTS-0.5B-bf16';
|
|
8
|
+
if (options?.refAudio && !options.refText) {
|
|
9
|
+
throw new Error('refText is required when refAudio is set for voice cloning');
|
|
10
|
+
}
|
|
11
|
+
// Build the optional server params, converting camelCase to snake_case keys
|
|
12
|
+
this.serverOptions = {};
|
|
13
|
+
if (options?.refAudio != null)
|
|
14
|
+
this.serverOptions.ref_audio = options.refAudio;
|
|
15
|
+
if (options?.refText != null)
|
|
16
|
+
this.serverOptions.ref_text = options.refText;
|
|
17
|
+
if (options?.instruct != null)
|
|
18
|
+
this.serverOptions.instruct = options.instruct;
|
|
19
|
+
if (options?.gender != null)
|
|
20
|
+
this.serverOptions.gender = options.gender;
|
|
21
|
+
if (options?.pitch != null)
|
|
22
|
+
this.serverOptions.pitch = options.pitch;
|
|
23
|
+
if (options?.langCode != null)
|
|
24
|
+
this.serverOptions.lang_code = options.langCode;
|
|
25
|
+
if (options?.temperature != null)
|
|
26
|
+
this.serverOptions.temperature = options.temperature;
|
|
27
|
+
if (options?.topP != null)
|
|
28
|
+
this.serverOptions.top_p = options.topP;
|
|
29
|
+
if (options?.topK != null)
|
|
30
|
+
this.serverOptions.top_k = options.topK;
|
|
31
|
+
if (options?.repetitionPenalty != null)
|
|
32
|
+
this.serverOptions.repetition_penalty = options.repetitionPenalty;
|
|
33
|
+
if (options?.responseFormat != null)
|
|
34
|
+
this.serverOptions.response_format = options.responseFormat;
|
|
35
|
+
if (options?.stream != null)
|
|
36
|
+
this.serverOptions.stream = options.stream;
|
|
37
|
+
if (options?.streamingInterval != null)
|
|
38
|
+
this.serverOptions.streaming_interval = options.streamingInterval;
|
|
39
|
+
if (options?.maxTokens != null)
|
|
40
|
+
this.serverOptions.max_tokens = options.maxTokens;
|
|
41
|
+
if (options?.verbose != null)
|
|
42
|
+
this.serverOptions.verbose = options.verbose;
|
|
7
43
|
}
|
|
8
44
|
async generate(text, options) {
|
|
9
45
|
if (!text?.trim())
|
|
10
46
|
throw new Error('TTS text must not be empty');
|
|
11
47
|
const controller = new AbortController();
|
|
12
48
|
const timeout = setTimeout(() => controller.abort(), 180_000);
|
|
49
|
+
const payload = {
|
|
50
|
+
model: this.model,
|
|
51
|
+
input: text,
|
|
52
|
+
voice: options.voice ?? 'af_heart',
|
|
53
|
+
...this.serverOptions,
|
|
54
|
+
};
|
|
55
|
+
// TTSEngineOptions.speed maps to the server's speed field
|
|
56
|
+
if (options.speed != null) {
|
|
57
|
+
payload.speed = options.speed;
|
|
58
|
+
}
|
|
13
59
|
let response;
|
|
14
60
|
try {
|
|
15
61
|
response = await fetch(`${this.baseUrl}/v1/audio/speech`, {
|
|
16
62
|
method: 'POST',
|
|
17
63
|
headers: { 'Content-Type': 'application/json' },
|
|
18
|
-
body: JSON.stringify({
|
|
19
|
-
model: this.model,
|
|
20
|
-
input: text,
|
|
21
|
-
voice: options.voice ?? 'af_heart',
|
|
22
|
-
}),
|
|
64
|
+
body: JSON.stringify(payload),
|
|
23
65
|
signal: controller.signal,
|
|
24
66
|
});
|
|
25
67
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"mlx-audio.js","sourceRoot":"","sources":["../../../src/tts/engines/mlx-audio.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"mlx-audio.js","sourceRoot":"","sources":["../../../src/tts/engines/mlx-audio.ts"],"names":[],"mappings":"AAuCA,MAAM,OAAO,cAAc;IACjB,OAAO,CAAS;IAChB,KAAK,CAAS;IACd,aAAa,CAA0B;IAE/C,YAAY,OAA+B;QACzC,IAAI,CAAC,OAAO,GAAG,OAAO,EAAE,OAAO,IAAI,uBAAuB,CAAC;QAC3D,IAAI,CAAC,KAAK,GAAG,OAAO,EAAE,KAAK,IAAI,mCAAmC,CAAC;QAEnE,IAAI,OAAO,EAAE,QAAQ,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC;YAC1C,MAAM,IAAI,KAAK,CAAC,4DAA4D,CAAC,CAAC;QAChF,CAAC;QAED,4EAA4E;QAC5E,IAAI,CAAC,aAAa,GAAG,EAAE,CAAC;QACxB,IAAI,OAAO,EAAE,QAAQ,IAAI,IAAI;YAAE,IAAI,CAAC,aAAa,CAAC,SAAS,GAAG,OAAO,CAAC,QAAQ,CAAC;QAC/E,IAAI,OAAO,EAAE,OAAO,IAAI,IAAI;YAAE,IAAI,CAAC,aAAa,CAAC,QAAQ,GAAG,OAAO,CAAC,OAAO,CAAC;QAC5E,IAAI,OAAO,EAAE,QAAQ,IAAI,IAAI;YAAE,IAAI,CAAC,aAAa,CAAC,QAAQ,GAAG,OAAO,CAAC,QAAQ,CAAC;QAC9E,IAAI,OAAO,EAAE,MAAM,IAAI,IAAI;YAAE,IAAI,CAAC,aAAa,CAAC,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;QACxE,IAAI,OAAO,EAAE,KAAK,IAAI,IAAI;YAAE,IAAI,CAAC,aAAa,CAAC,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC;QACrE,IAAI,OAAO,EAAE,QAAQ,IAAI,IAAI;YAAE,IAAI,CAAC,aAAa,CAAC,SAAS,GAAG,OAAO,CAAC,QAAQ,CAAC;QAC/E,IAAI,OAAO,EAAE,WAAW,IAAI,IAAI;YAAE,IAAI,CAAC,aAAa,CAAC,WAAW,GAAG,OAAO,CAAC,WAAW,CAAC;QACvF,IAAI,OAAO,EAAE,IAAI,IAAI,IAAI;YAAE,IAAI,CAAC,aAAa,CAAC,KAAK,GAAG,OAAO,CAAC,IAAI,CAAC;QACnE,IAAI,OAAO,EAAE,IAAI,IAAI,IAAI;YAAE,IAAI,CAAC,aAAa,CAAC,KAAK,GAAG,OAAO,CAAC,IAAI,CAAC;QACnE,IAAI,OAAO,EAAE,iBAAiB,IAAI,IAAI;YAAE,IAAI,CAAC,aAAa,CAAC,kBAAkB,GAAG,OAAO,CAAC,iBAAiB,CAAC;QAC1G,IAAI,OAAO,EAAE,cAAc,IAAI,IAAI;YAAE,IAAI,CAAC,aAAa,CAAC,eAAe,GAAG,OAAO,CAAC,cAAc,CAAC;QACjG,IAAI,OAAO,EAAE,MAAM,IAAI,IAAI;YAAE,IAAI,CAAC,aAAa,CAAC,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;QACxE,IAAI,OAAO,EAAE,iBAAiB,IAAI,IAAI;YAAE,IAAI,CAAC,aAAa,CAAC,kBAAkB,GAAG,OAAO,CAAC,iBAAiB,CAAC;QAC1G,IAAI,OAAO,EAAE,SAAS,IAAI,IAAI;YAAE,IAAI,CAAC,aAAa,CAAC,UAAU,GAAG,OAAO,CAAC,SAAS,CAAC;QAClF,IAAI,OAAO,EAAE,OAAO,IAAI,IAAI;YAAE,IAAI,CAAC,aAAa,CAAC,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC;IAC7E,CAAC;IAED,KAAK,CAAC,QAAQ,CAAC,IAAY,EAAE,OAAyB;QACpD,IAAI,CAAC,IAAI,EAAE,IAAI,EAAE;YAAE,MAAM,IAAI,KAAK,CAAC,4BAA4B,CAAC,CAAC;QA
EjE,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAC;QACzC,MAAM,OAAO,GAAG,UAAU,CAAC,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,OAAO,CAAC,CAAC;QAE9D,MAAM,OAAO,GAA4B;YACvC,KAAK,EAAE,IAAI,CAAC,KAAK;YACjB,KAAK,EAAE,IAAI;YACX,KAAK,EAAE,OAAO,CAAC,KAAK,IAAI,UAAU;YAClC,GAAG,IAAI,CAAC,aAAa;SACtB,CAAC;QAEF,0DAA0D;QAC1D,IAAI,OAAO,CAAC,KAAK,IAAI,IAAI,EAAE,CAAC;YAC1B,OAAO,CAAC,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC;QAChC,CAAC;QAED,IAAI,QAAQ,CAAC;QACb,IAAI,CAAC;YACH,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,IAAI,CAAC,OAAO,kBAAkB,EAAE;gBACxD,MAAM,EAAE,MAAM;gBACd,OAAO,EAAE,EAAE,cAAc,EAAE,kBAAkB,EAAE;gBAC/C,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC,OAAO,CAAC;gBAC7B,MAAM,EAAE,UAAU,CAAC,MAAM;aAC1B,CAAC,CAAC;QACL,CAAC;gBAAS,CAAC;YACT,YAAY,CAAC,OAAO,CAAC,CAAC;QACxB,CAAC;QAED,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;YACjB,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;YACnC,MAAM,IAAI,KAAK,CACb,0BAA0B,QAAQ,CAAC,MAAM,KAAK,IAAI,IAAI;gBACtD,qEAAqE,IAAI,CAAC,KAAK,EAAE,CAClF,CAAC;QACJ,CAAC;QAED,MAAM,WAAW,GAAG,MAAM,CAAC,IAAI,CAAC,MAAM,QAAQ,CAAC,WAAW,EAAE,CAAC,CAAC;QAE9D,kDAAkD;QAClD,MAAM,EAAE,YAAY,EAAE,GAAG,MAAM,MAAM,CAAC,cAAc,CAAC,CAAC;QACtD,OAAO,YAAY,CAAC,WAAW,CAAC,CAAC;IACnC,CAAC;CACF"}
|