@argo-video/cli 0.7.1 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +28 -3
- package/dist/cli.d.ts.map +1 -1
- package/dist/cli.js +12 -3
- package/dist/cli.js.map +1 -1
- package/dist/init.d.ts +15 -0
- package/dist/init.d.ts.map +1 -1
- package/dist/init.js +55 -2
- package/dist/init.js.map +1 -1
- package/dist/parse-playwright.d.ts +49 -0
- package/dist/parse-playwright.d.ts.map +1 -0
- package/dist/parse-playwright.js +265 -0
- package/dist/parse-playwright.js.map +1 -0
- package/dist/tts/engines/mlx-audio.d.ts +31 -0
- package/dist/tts/engines/mlx-audio.d.ts.map +1 -1
- package/dist/tts/engines/mlx-audio.js +47 -5
- package/dist/tts/engines/mlx-audio.js.map +1 -1
- package/dist/tts/engines/sarvam.d.ts.map +1 -1
- package/dist/tts/engines/sarvam.js +20 -23
- package/dist/tts/engines/sarvam.js.map +1 -1
- package/package.json +9 -2
- package/scripts/generate_logo_thumbnail.py +174 -0
- package/scripts/record-voice-ref.sh +87 -0
- package/scripts/setup-mlx-audio.sh +66 -0
- package/scripts/voice-clone-preview.sh +193 -0
|
@@ -1,25 +1,67 @@
|
|
|
1
1
|
export class MlxAudioEngine {
|
|
2
2
|
baseUrl;
|
|
3
3
|
model;
|
|
4
|
+
serverOptions;
|
|
4
5
|
constructor(options) {
|
|
5
6
|
this.baseUrl = options?.baseUrl ?? 'http://localhost:8000';
|
|
6
7
|
this.model = options?.model ?? 'mlx-community/Spark-TTS-0.5B-bf16';
|
|
8
|
+
if (options?.refAudio && !options.refText) {
|
|
9
|
+
throw new Error('refText is required when refAudio is set for voice cloning');
|
|
10
|
+
}
|
|
11
|
+
// Build the optional server params, converting camelCase to snake_case keys
|
|
12
|
+
this.serverOptions = {};
|
|
13
|
+
if (options?.refAudio != null)
|
|
14
|
+
this.serverOptions.ref_audio = options.refAudio;
|
|
15
|
+
if (options?.refText != null)
|
|
16
|
+
this.serverOptions.ref_text = options.refText;
|
|
17
|
+
if (options?.instruct != null)
|
|
18
|
+
this.serverOptions.instruct = options.instruct;
|
|
19
|
+
if (options?.gender != null)
|
|
20
|
+
this.serverOptions.gender = options.gender;
|
|
21
|
+
if (options?.pitch != null)
|
|
22
|
+
this.serverOptions.pitch = options.pitch;
|
|
23
|
+
if (options?.langCode != null)
|
|
24
|
+
this.serverOptions.lang_code = options.langCode;
|
|
25
|
+
if (options?.temperature != null)
|
|
26
|
+
this.serverOptions.temperature = options.temperature;
|
|
27
|
+
if (options?.topP != null)
|
|
28
|
+
this.serverOptions.top_p = options.topP;
|
|
29
|
+
if (options?.topK != null)
|
|
30
|
+
this.serverOptions.top_k = options.topK;
|
|
31
|
+
if (options?.repetitionPenalty != null)
|
|
32
|
+
this.serverOptions.repetition_penalty = options.repetitionPenalty;
|
|
33
|
+
if (options?.responseFormat != null)
|
|
34
|
+
this.serverOptions.response_format = options.responseFormat;
|
|
35
|
+
if (options?.stream != null)
|
|
36
|
+
this.serverOptions.stream = options.stream;
|
|
37
|
+
if (options?.streamingInterval != null)
|
|
38
|
+
this.serverOptions.streaming_interval = options.streamingInterval;
|
|
39
|
+
if (options?.maxTokens != null)
|
|
40
|
+
this.serverOptions.max_tokens = options.maxTokens;
|
|
41
|
+
if (options?.verbose != null)
|
|
42
|
+
this.serverOptions.verbose = options.verbose;
|
|
7
43
|
}
|
|
8
44
|
async generate(text, options) {
|
|
9
45
|
if (!text?.trim())
|
|
10
46
|
throw new Error('TTS text must not be empty');
|
|
11
47
|
const controller = new AbortController();
|
|
12
48
|
const timeout = setTimeout(() => controller.abort(), 180_000);
|
|
49
|
+
const payload = {
|
|
50
|
+
model: this.model,
|
|
51
|
+
input: text,
|
|
52
|
+
voice: options.voice ?? 'af_heart',
|
|
53
|
+
...this.serverOptions,
|
|
54
|
+
};
|
|
55
|
+
// TTSEngineOptions.speed maps to the server's speed field
|
|
56
|
+
if (options.speed != null) {
|
|
57
|
+
payload.speed = options.speed;
|
|
58
|
+
}
|
|
13
59
|
let response;
|
|
14
60
|
try {
|
|
15
61
|
response = await fetch(`${this.baseUrl}/v1/audio/speech`, {
|
|
16
62
|
method: 'POST',
|
|
17
63
|
headers: { 'Content-Type': 'application/json' },
|
|
18
|
-
body: JSON.stringify({
|
|
19
|
-
model: this.model,
|
|
20
|
-
input: text,
|
|
21
|
-
voice: options.voice ?? 'af_heart',
|
|
22
|
-
}),
|
|
64
|
+
body: JSON.stringify(payload),
|
|
23
65
|
signal: controller.signal,
|
|
24
66
|
});
|
|
25
67
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"mlx-audio.js","sourceRoot":"","sources":["../../../src/tts/engines/mlx-audio.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"mlx-audio.js","sourceRoot":"","sources":["../../../src/tts/engines/mlx-audio.ts"],"names":[],"mappings":"AAuCA,MAAM,OAAO,cAAc;IACjB,OAAO,CAAS;IAChB,KAAK,CAAS;IACd,aAAa,CAA0B;IAE/C,YAAY,OAA+B;QACzC,IAAI,CAAC,OAAO,GAAG,OAAO,EAAE,OAAO,IAAI,uBAAuB,CAAC;QAC3D,IAAI,CAAC,KAAK,GAAG,OAAO,EAAE,KAAK,IAAI,mCAAmC,CAAC;QAEnE,IAAI,OAAO,EAAE,QAAQ,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC;YAC1C,MAAM,IAAI,KAAK,CAAC,4DAA4D,CAAC,CAAC;QAChF,CAAC;QAED,4EAA4E;QAC5E,IAAI,CAAC,aAAa,GAAG,EAAE,CAAC;QACxB,IAAI,OAAO,EAAE,QAAQ,IAAI,IAAI;YAAE,IAAI,CAAC,aAAa,CAAC,SAAS,GAAG,OAAO,CAAC,QAAQ,CAAC;QAC/E,IAAI,OAAO,EAAE,OAAO,IAAI,IAAI;YAAE,IAAI,CAAC,aAAa,CAAC,QAAQ,GAAG,OAAO,CAAC,OAAO,CAAC;QAC5E,IAAI,OAAO,EAAE,QAAQ,IAAI,IAAI;YAAE,IAAI,CAAC,aAAa,CAAC,QAAQ,GAAG,OAAO,CAAC,QAAQ,CAAC;QAC9E,IAAI,OAAO,EAAE,MAAM,IAAI,IAAI;YAAE,IAAI,CAAC,aAAa,CAAC,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;QACxE,IAAI,OAAO,EAAE,KAAK,IAAI,IAAI;YAAE,IAAI,CAAC,aAAa,CAAC,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC;QACrE,IAAI,OAAO,EAAE,QAAQ,IAAI,IAAI;YAAE,IAAI,CAAC,aAAa,CAAC,SAAS,GAAG,OAAO,CAAC,QAAQ,CAAC;QAC/E,IAAI,OAAO,EAAE,WAAW,IAAI,IAAI;YAAE,IAAI,CAAC,aAAa,CAAC,WAAW,GAAG,OAAO,CAAC,WAAW,CAAC;QACvF,IAAI,OAAO,EAAE,IAAI,IAAI,IAAI;YAAE,IAAI,CAAC,aAAa,CAAC,KAAK,GAAG,OAAO,CAAC,IAAI,CAAC;QACnE,IAAI,OAAO,EAAE,IAAI,IAAI,IAAI;YAAE,IAAI,CAAC,aAAa,CAAC,KAAK,GAAG,OAAO,CAAC,IAAI,CAAC;QACnE,IAAI,OAAO,EAAE,iBAAiB,IAAI,IAAI;YAAE,IAAI,CAAC,aAAa,CAAC,kBAAkB,GAAG,OAAO,CAAC,iBAAiB,CAAC;QAC1G,IAAI,OAAO,EAAE,cAAc,IAAI,IAAI;YAAE,IAAI,CAAC,aAAa,CAAC,eAAe,GAAG,OAAO,CAAC,cAAc,CAAC;QACjG,IAAI,OAAO,EAAE,MAAM,IAAI,IAAI;YAAE,IAAI,CAAC,aAAa,CAAC,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;QACxE,IAAI,OAAO,EAAE,iBAAiB,IAAI,IAAI;YAAE,IAAI,CAAC,aAAa,CAAC,kBAAkB,GAAG,OAAO,CAAC,iBAAiB,CAAC;QAC1G,IAAI,OAAO,EAAE,SAAS,IAAI,IAAI;YAAE,IAAI,CAAC,aAAa,CAAC,UAAU,GAAG,OAAO,CAAC,SAAS,CAAC;QAClF,IAAI,OAAO,EAAE,OAAO,IAAI,IAAI;YAAE,IAAI,CAAC,aAAa,CAAC,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC;IAC7E,CAAC;IAED,KAAK,CAAC,QAAQ,CAAC,IAAY,EAAE,OAAyB;QACpD,IAAI,CAAC,IAAI,EAAE,IAAI,EAAE;YAAE,MAAM,IAAI,KAAK,CAAC,4BAA4B,CAAC,CAAC;QA
EjE,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAC;QACzC,MAAM,OAAO,GAAG,UAAU,CAAC,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,OAAO,CAAC,CAAC;QAE9D,MAAM,OAAO,GAA4B;YACvC,KAAK,EAAE,IAAI,CAAC,KAAK;YACjB,KAAK,EAAE,IAAI;YACX,KAAK,EAAE,OAAO,CAAC,KAAK,IAAI,UAAU;YAClC,GAAG,IAAI,CAAC,aAAa;SACtB,CAAC;QAEF,0DAA0D;QAC1D,IAAI,OAAO,CAAC,KAAK,IAAI,IAAI,EAAE,CAAC;YAC1B,OAAO,CAAC,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC;QAChC,CAAC;QAED,IAAI,QAAQ,CAAC;QACb,IAAI,CAAC;YACH,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,IAAI,CAAC,OAAO,kBAAkB,EAAE;gBACxD,MAAM,EAAE,MAAM;gBACd,OAAO,EAAE,EAAE,cAAc,EAAE,kBAAkB,EAAE;gBAC/C,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC,OAAO,CAAC;gBAC7B,MAAM,EAAE,UAAU,CAAC,MAAM;aAC1B,CAAC,CAAC;QACL,CAAC;gBAAS,CAAC;YACT,YAAY,CAAC,OAAO,CAAC,CAAC;QACxB,CAAC;QAED,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;YACjB,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;YACnC,MAAM,IAAI,KAAK,CACb,0BAA0B,QAAQ,CAAC,MAAM,KAAK,IAAI,IAAI;gBACtD,qEAAqE,IAAI,CAAC,KAAK,EAAE,CAClF,CAAC;QACJ,CAAC;QAED,MAAM,WAAW,GAAG,MAAM,CAAC,IAAI,CAAC,MAAM,QAAQ,CAAC,WAAW,EAAE,CAAC,CAAC;QAE9D,kDAAkD;QAClD,MAAM,EAAE,YAAY,EAAE,GAAG,MAAM,MAAM,CAAC,cAAc,CAAC,CAAC;QACtD,OAAO,YAAY,CAAC,WAAW,CAAC,CAAC;IACnC,CAAC;CACF"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"sarvam.d.ts","sourceRoot":"","sources":["../../../src/tts/engines/sarvam.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,SAAS,EAAE,gBAAgB,EAAE,MAAM,cAAc,CAAC;AAEhE,MAAM,WAAW,mBAAmB;IAClC,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED,qBAAa,YAAa,YAAW,SAAS;IAC5C,OAAO,CAAC,MAAM,CAAS;IACvB,OAAO,CAAC,KAAK,CAAS;gBAEV,OAAO,CAAC,EAAE,mBAAmB;IAKzC,OAAO,CAAC,aAAa;IAWf,QAAQ,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,gBAAgB,GAAG,OAAO,CAAC,MAAM,CAAC;
|
|
1
|
+
{"version":3,"file":"sarvam.d.ts","sourceRoot":"","sources":["../../../src/tts/engines/sarvam.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,SAAS,EAAE,gBAAgB,EAAE,MAAM,cAAc,CAAC;AAEhE,MAAM,WAAW,mBAAmB;IAClC,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED,qBAAa,YAAa,YAAW,SAAS;IAC5C,OAAO,CAAC,MAAM,CAAS;IACvB,OAAO,CAAC,KAAK,CAAS;gBAEV,OAAO,CAAC,EAAE,mBAAmB;IAKzC,OAAO,CAAC,aAAa;IAWf,QAAQ,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,gBAAgB,GAAG,OAAO,CAAC,MAAM,CAAC;CAoCzE"}
|
|
@@ -16,33 +16,30 @@ export class SarvamEngine {
|
|
|
16
16
|
async generate(text, options) {
|
|
17
17
|
if (!text?.trim())
|
|
18
18
|
throw new Error('TTS text must not be empty');
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
inputs: [text],
|
|
27
|
-
target_language_code: options.lang ?? 'hi-IN',
|
|
28
|
-
speaker: options.voice ?? 'meera',
|
|
29
|
-
model: this.model,
|
|
30
|
-
pitch: 0,
|
|
31
|
-
pace: options.speed ?? 1.0,
|
|
32
|
-
loudness: 1.5,
|
|
33
|
-
enable_preprocessing: true,
|
|
34
|
-
}),
|
|
35
|
-
});
|
|
36
|
-
if (!response.ok) {
|
|
37
|
-
const body = await response.text();
|
|
38
|
-
throw new Error(`Sarvam TTS API error ${response.status}: ${body}`);
|
|
19
|
+
let SarvamAI;
|
|
20
|
+
try {
|
|
21
|
+
// @ts-ignore — sarvamai is an optional dependency
|
|
22
|
+
({ default: SarvamAI } = await import('sarvamai'));
|
|
23
|
+
}
|
|
24
|
+
catch {
|
|
25
|
+
throw new Error("Sarvam TTS engine requires the 'sarvamai' package. Install it with: npm i sarvamai");
|
|
39
26
|
}
|
|
40
|
-
const
|
|
41
|
-
|
|
27
|
+
const client = new SarvamAI({ apiSubscriptionKey: this.resolveApiKey() });
|
|
28
|
+
const response = await client.textToSpeech.convert({
|
|
29
|
+
inputs: [text],
|
|
30
|
+
target_language_code: options.lang ?? 'hi-IN',
|
|
31
|
+
speaker: options.voice ?? 'meera',
|
|
32
|
+
model: this.model,
|
|
33
|
+
pitch: 0,
|
|
34
|
+
pace: options.speed ?? 1.0,
|
|
35
|
+
loudness: 1.5,
|
|
36
|
+
enable_preprocessing: true,
|
|
37
|
+
});
|
|
38
|
+
if (!response.audios?.[0]) {
|
|
42
39
|
throw new Error('Sarvam TTS returned no audio data');
|
|
43
40
|
}
|
|
44
41
|
// Sarvam returns base64-encoded WAV
|
|
45
|
-
const audioBuffer = Buffer.from(
|
|
42
|
+
const audioBuffer = Buffer.from(response.audios[0], 'base64');
|
|
46
43
|
// Convert to Argo WAV format (mono Float32 24kHz)
|
|
47
44
|
const { convertToWav } = await import('../engine.js');
|
|
48
45
|
return convertToWav(audioBuffer);
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"sarvam.js","sourceRoot":"","sources":["../../../src/tts/engines/sarvam.ts"],"names":[],"mappings":"AAOA,MAAM,OAAO,YAAY;IACf,MAAM,CAAS;IACf,KAAK,CAAS;IAEtB,YAAY,OAA6B;QACvC,IAAI,CAAC,MAAM,GAAG,OAAO,EAAE,MAAM,IAAI,EAAE,CAAC;QACpC,IAAI,CAAC,KAAK,GAAG,OAAO,EAAE,KAAK,IAAI,WAAW,CAAC;IAC7C,CAAC;IAEO,aAAa;QACnB,MAAM,GAAG,GAAG,IAAI,CAAC,MAAM,IAAI,OAAO,CAAC,GAAG,CAAC,cAAc,IAAI,EAAE,CAAC;QAC5D,IAAI,CAAC,GAAG,EAAE,CAAC;YACT,MAAM,IAAI,KAAK,CACb,yCAAyC;gBACzC,gEAAgE,CACjE,CAAC;QACJ,CAAC;QACD,OAAO,GAAG,CAAC;IACb,CAAC;IAED,KAAK,CAAC,QAAQ,CAAC,IAAY,EAAE,OAAyB;QACpD,IAAI,CAAC,IAAI,EAAE,IAAI,EAAE;YAAE,MAAM,IAAI,KAAK,CAAC,4BAA4B,CAAC,CAAC;QAEjE,
|
|
1
|
+
{"version":3,"file":"sarvam.js","sourceRoot":"","sources":["../../../src/tts/engines/sarvam.ts"],"names":[],"mappings":"AAOA,MAAM,OAAO,YAAY;IACf,MAAM,CAAS;IACf,KAAK,CAAS;IAEtB,YAAY,OAA6B;QACvC,IAAI,CAAC,MAAM,GAAG,OAAO,EAAE,MAAM,IAAI,EAAE,CAAC;QACpC,IAAI,CAAC,KAAK,GAAG,OAAO,EAAE,KAAK,IAAI,WAAW,CAAC;IAC7C,CAAC;IAEO,aAAa;QACnB,MAAM,GAAG,GAAG,IAAI,CAAC,MAAM,IAAI,OAAO,CAAC,GAAG,CAAC,cAAc,IAAI,EAAE,CAAC;QAC5D,IAAI,CAAC,GAAG,EAAE,CAAC;YACT,MAAM,IAAI,KAAK,CACb,yCAAyC;gBACzC,gEAAgE,CACjE,CAAC;QACJ,CAAC;QACD,OAAO,GAAG,CAAC;IACb,CAAC;IAED,KAAK,CAAC,QAAQ,CAAC,IAAY,EAAE,OAAyB;QACpD,IAAI,CAAC,IAAI,EAAE,IAAI,EAAE;YAAE,MAAM,IAAI,KAAK,CAAC,4BAA4B,CAAC,CAAC;QAEjE,IAAI,QAAa,CAAC;QAClB,IAAI,CAAC;YACH,kDAAkD;YAClD,CAAC,EAAE,OAAO,EAAE,QAAQ,EAAE,GAAG,MAAM,MAAM,CAAC,UAAU,CAAC,CAAC,CAAC;QACrD,CAAC;QAAC,MAAM,CAAC;YACP,MAAM,IAAI,KAAK,CACb,oFAAoF,CACrF,CAAC;QACJ,CAAC;QAED,MAAM,MAAM,GAAG,IAAI,QAAQ,CAAC,EAAE,kBAAkB,EAAE,IAAI,CAAC,aAAa,EAAE,EAAE,CAAC,CAAC;QAC1E,MAAM,QAAQ,GAAG,MAAM,MAAM,CAAC,YAAY,CAAC,OAAO,CAAC;YACjD,MAAM,EAAE,CAAC,IAAI,CAAC;YACd,oBAAoB,EAAE,OAAO,CAAC,IAAI,IAAI,OAAO;YAC7C,OAAO,EAAE,OAAO,CAAC,KAAK,IAAI,OAAO;YACjC,KAAK,EAAE,IAAI,CAAC,KAAK;YACjB,KAAK,EAAE,CAAC;YACR,IAAI,EAAE,OAAO,CAAC,KAAK,IAAI,GAAG;YAC1B,QAAQ,EAAE,GAAG;YACb,oBAAoB,EAAE,IAAI;SAC3B,CAAC,CAAC;QAEH,IAAI,CAAC,QAAQ,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;YAC1B,MAAM,IAAI,KAAK,CAAC,mCAAmC,CAAC,CAAC;QACvD,CAAC;QAED,oCAAoC;QACpC,MAAM,WAAW,GAAG,MAAM,CAAC,IAAI,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,QAAQ,CAAC,CAAC;QAE9D,kDAAkD;QAClD,MAAM,EAAE,YAAY,EAAE,GAAG,MAAM,MAAM,CAAC,cAAc,CAAC,CAAC;QACtD,OAAO,YAAY,CAAC,WAAW,CAAC,CAAC;IACnC,CAAC;CACF"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@argo-video/cli",
|
|
3
|
-
"version": "0.7.1",
|
|
3
|
+
"version": "0.9.0",
|
|
4
4
|
"description": "Turn Playwright demo scripts into polished product demo videos with AI voiceover",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.js",
|
|
@@ -29,6 +29,12 @@
|
|
|
29
29
|
"commander": "^12.0.0",
|
|
30
30
|
"kokoro-js": "^1.2.1"
|
|
31
31
|
},
|
|
32
|
+
"optionalDependencies": {
|
|
33
|
+
"openai": "^4.0.0",
|
|
34
|
+
"elevenlabs": "^1.0.0",
|
|
35
|
+
"@google/genai": "^1.0.0",
|
|
36
|
+
"sarvamai": "^1.0.0"
|
|
37
|
+
},
|
|
32
38
|
"publishConfig": {
|
|
33
39
|
"access": "public"
|
|
34
40
|
},
|
|
@@ -39,7 +45,8 @@
|
|
|
39
45
|
"license": "MIT",
|
|
40
46
|
"files": [
|
|
41
47
|
"dist",
|
|
42
|
-
"bin"
|
|
48
|
+
"bin",
|
|
49
|
+
"scripts"
|
|
43
50
|
],
|
|
44
51
|
"devDependencies": {
|
|
45
52
|
"@playwright/test": "^1.50.0",
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
import random
|
|
5
|
+
|
|
6
|
+
from PIL import Image, ImageDraw, ImageFilter, ImageFont
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
ROOT = Path(__file__).resolve().parents[1]
|
|
10
|
+
ASSETS = ROOT / "assets"
|
|
11
|
+
OUTPUT_PATH = ASSETS / "logo-thumb.png"
|
|
12
|
+
SOURCE_MARK_PATH = ASSETS / "logo-mark-source.png"
|
|
13
|
+
|
|
14
|
+
WIDTH = 1920
|
|
15
|
+
HEIGHT = 1080
|
|
16
|
+
|
|
17
|
+
MONO_FONT = "/System/Library/Fonts/SFNSMono.ttf"
|
|
18
|
+
DISPLAY_BOLD = "/Library/Fonts/SF-Compact-Display-Bold.otf"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def load_font(path: str, size: int) -> ImageFont.FreeTypeFont:
|
|
22
|
+
return ImageFont.truetype(path, size=size)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def normalize_mark() -> Image.Image:
|
|
26
|
+
if SOURCE_MARK_PATH.exists():
|
|
27
|
+
return Image.open(SOURCE_MARK_PATH).convert("RGBA")
|
|
28
|
+
|
|
29
|
+
original = Image.open(OUTPUT_PATH).convert("RGBA")
|
|
30
|
+
mark = Image.new("RGBA", original.size, (0, 0, 0, 0))
|
|
31
|
+
in_px = original.load()
|
|
32
|
+
out_px = mark.load()
|
|
33
|
+
|
|
34
|
+
for y in range(original.height):
|
|
35
|
+
for x in range(original.width):
|
|
36
|
+
r, g, b, _ = in_px[x, y]
|
|
37
|
+
if b > 150 and g > 110 and r < 140:
|
|
38
|
+
out_px[x, y] = (107, 180, 255, 255)
|
|
39
|
+
|
|
40
|
+
bbox = mark.getbbox()
|
|
41
|
+
if bbox is None:
|
|
42
|
+
raise RuntimeError("Could not isolate source logo mark")
|
|
43
|
+
|
|
44
|
+
cropped = mark.crop(bbox)
|
|
45
|
+
cropped.save(SOURCE_MARK_PATH)
|
|
46
|
+
return cropped
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def lerp(a: int, b: int, t: float) -> int:
|
|
50
|
+
return int(a + (b - a) * t)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def make_background() -> Image.Image:
|
|
54
|
+
image = Image.new("RGBA", (WIDTH, HEIGHT), (0, 0, 0, 255))
|
|
55
|
+
px = image.load()
|
|
56
|
+
|
|
57
|
+
top_left = (8, 10, 18)
|
|
58
|
+
bottom_right = (18, 22, 37)
|
|
59
|
+
top_right = (12, 14, 26)
|
|
60
|
+
|
|
61
|
+
for y in range(HEIGHT):
|
|
62
|
+
ty = y / (HEIGHT - 1)
|
|
63
|
+
for x in range(WIDTH):
|
|
64
|
+
tx = x / (WIDTH - 1)
|
|
65
|
+
r = lerp(lerp(top_left[0], top_right[0], tx), bottom_right[0], ty)
|
|
66
|
+
g = lerp(lerp(top_left[1], top_right[1], tx), bottom_right[1], ty)
|
|
67
|
+
b = lerp(lerp(top_left[2], top_right[2], tx), bottom_right[2], ty)
|
|
68
|
+
px[x, y] = (r, g, b, 255)
|
|
69
|
+
|
|
70
|
+
return image
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def add_radial_glow(base: Image.Image, center: tuple[int, int], radius: int, color: tuple[int, int, int], alpha: int, blur: int) -> None:
|
|
74
|
+
layer = Image.new("RGBA", base.size, (0, 0, 0, 0))
|
|
75
|
+
draw = ImageDraw.Draw(layer)
|
|
76
|
+
x, y = center
|
|
77
|
+
draw.ellipse((x - radius, y - radius, x + radius, y + radius), fill=color + (alpha,))
|
|
78
|
+
layer = layer.filter(ImageFilter.GaussianBlur(blur))
|
|
79
|
+
base.alpha_composite(layer)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def add_grid(base: Image.Image) -> None:
|
|
83
|
+
grid = Image.new("RGBA", base.size, (0, 0, 0, 0))
|
|
84
|
+
draw = ImageDraw.Draw(grid)
|
|
85
|
+
|
|
86
|
+
for x in range(120, WIDTH, 96):
|
|
87
|
+
draw.line((x, 0, x, HEIGHT), fill=(92, 126, 188, 14), width=1)
|
|
88
|
+
for y in range(96, HEIGHT, 96):
|
|
89
|
+
draw.line((0, y, WIDTH, y), fill=(92, 126, 188, 10), width=1)
|
|
90
|
+
|
|
91
|
+
random.seed(7)
|
|
92
|
+
for _ in range(240):
|
|
93
|
+
x = random.randint(0, WIDTH - 1)
|
|
94
|
+
y = random.randint(0, HEIGHT - 1)
|
|
95
|
+
draw.point((x, y), fill=(160, 185, 255, random.randint(10, 28)))
|
|
96
|
+
|
|
97
|
+
base.alpha_composite(grid)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def draw_mark(base: Image.Image, mark: Image.Image) -> None:
|
|
101
|
+
mark = mark.resize((1440, int(mark.height * (1440 / mark.width))), Image.Resampling.LANCZOS)
|
|
102
|
+
|
|
103
|
+
glow = Image.new("RGBA", base.size, (0, 0, 0, 0))
|
|
104
|
+
glow_mark = mark.copy()
|
|
105
|
+
glow_mark = glow_mark.filter(ImageFilter.GaussianBlur(14))
|
|
106
|
+
gx = (WIDTH - glow_mark.width) // 2
|
|
107
|
+
gy = 200
|
|
108
|
+
glow.alpha_composite(glow_mark, (gx, gy))
|
|
109
|
+
|
|
110
|
+
tint = Image.new("RGBA", base.size, (0, 0, 0, 0))
|
|
111
|
+
tint_draw = ImageDraw.Draw(tint)
|
|
112
|
+
tint_draw.ellipse((420, 220, 1500, 760), fill=(71, 136, 255, 30))
|
|
113
|
+
tint = tint.filter(ImageFilter.GaussianBlur(54))
|
|
114
|
+
|
|
115
|
+
base.alpha_composite(tint)
|
|
116
|
+
base.alpha_composite(glow, (0, 0))
|
|
117
|
+
|
|
118
|
+
mark_shadow = Image.new("RGBA", base.size, (0, 0, 0, 0))
|
|
119
|
+
shadow = mark.copy().filter(ImageFilter.GaussianBlur(18))
|
|
120
|
+
mark_shadow.alpha_composite(shadow, (gx, gy + 8))
|
|
121
|
+
base.alpha_composite(mark_shadow)
|
|
122
|
+
base.alpha_composite(mark, (gx, gy))
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def draw_command_card(base: Image.Image) -> None:
|
|
126
|
+
card = Image.new("RGBA", base.size, (0, 0, 0, 0))
|
|
127
|
+
draw = ImageDraw.Draw(card)
|
|
128
|
+
|
|
129
|
+
x0, y0, x1, y1 = 500, 760, 1420, 892
|
|
130
|
+
shadow = Image.new("RGBA", base.size, (0, 0, 0, 0))
|
|
131
|
+
ImageDraw.Draw(shadow).rounded_rectangle((x0, y0 + 18, x1, y1 + 18), radius=34, fill=(0, 0, 0, 120))
|
|
132
|
+
shadow = shadow.filter(ImageFilter.GaussianBlur(24))
|
|
133
|
+
base.alpha_composite(shadow)
|
|
134
|
+
|
|
135
|
+
draw.rounded_rectangle((x0, y0, x1, y1), radius=34, fill=(14, 18, 30, 228), outline=(97, 127, 183, 88), width=2)
|
|
136
|
+
draw.rounded_rectangle((x0 + 2, y0 + 2, x1 - 2, y0 + 38), radius=32, fill=(18, 23, 38, 245))
|
|
137
|
+
|
|
138
|
+
button_y = y0 + 20
|
|
139
|
+
for idx, color in enumerate(((255, 95, 86), (255, 189, 46), (39, 201, 63))):
|
|
140
|
+
bx = x0 + 30 + idx * 22
|
|
141
|
+
draw.ellipse((bx, button_y, bx + 12, button_y + 12), fill=color)
|
|
142
|
+
|
|
143
|
+
mono = load_font(MONO_FONT, 40)
|
|
144
|
+
small = load_font(MONO_FONT, 24)
|
|
145
|
+
|
|
146
|
+
prompt_y = y0 + 64
|
|
147
|
+
draw.text((x0 + 34, prompt_y), "$", font=mono, fill=(123, 217, 129))
|
|
148
|
+
draw.text((x0 + 72, prompt_y), "npx argo", font=mono, fill=(110, 187, 255))
|
|
149
|
+
draw.text((x0 + 330, prompt_y), "pipeline", font=mono, fill=(233, 239, 252))
|
|
150
|
+
draw.text((x0 + 566, prompt_y), "showcase", font=mono, fill=(169, 146, 255))
|
|
151
|
+
|
|
152
|
+
note = "local-first • webkit-friendly • retina-ready"
|
|
153
|
+
note_box = draw.textbbox((0, 0), note, font=small)
|
|
154
|
+
note_x = (WIDTH - (note_box[2] - note_box[0])) / 2
|
|
155
|
+
draw.text((note_x, y1 + 24), note, font=small, fill=(100, 121, 160))
|
|
156
|
+
|
|
157
|
+
base.alpha_composite(card)
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def main() -> None:
|
|
161
|
+
ASSETS.mkdir(exist_ok=True)
|
|
162
|
+
mark = normalize_mark()
|
|
163
|
+
image = make_background()
|
|
164
|
+
add_radial_glow(image, (WIDTH // 2, 420), 420, (48, 99, 235), 48, 96)
|
|
165
|
+
add_radial_glow(image, (1470, 210), 220, (41, 91, 235), 42, 84)
|
|
166
|
+
add_radial_glow(image, (360, 910), 300, (43, 74, 170), 34, 92)
|
|
167
|
+
add_grid(image)
|
|
168
|
+
draw_mark(image, mark)
|
|
169
|
+
draw_command_card(image)
|
|
170
|
+
image.convert("RGB").save(OUTPUT_PATH, quality=95)
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
if __name__ == "__main__":
|
|
174
|
+
main()
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
#
|
|
3
|
+
# Record and process a voice reference clip for mlx-audio voice cloning.
|
|
4
|
+
# Usage: ./scripts/record-voice-ref.sh [output_path]
|
|
5
|
+
#
|
|
6
|
+
# Requirements: ffmpeg (brew install ffmpeg), macOS with built-in mic
|
|
7
|
+
#
|
|
8
|
+
set -euo pipefail
|
|
9
|
+
|
|
10
|
+
OUTPUT="${1:-assets/ref-voice.wav}"
|
|
11
|
+
RAW_FILE=$(mktemp /tmp/voice-ref-raw.XXXXXX.wav)
|
|
12
|
+
|
|
13
|
+
# Suggested text — covers a wide range of English phonemes
|
|
14
|
+
cat <<'PROMPT'
|
|
15
|
+
╔══════════════════════════════════════════════════════════════════╗
|
|
16
|
+
║ Voice Reference Recording ║
|
|
17
|
+
╠══════════════════════════════════════════════════════════════════╣
|
|
18
|
+
║ ║
|
|
19
|
+
║ Read the following text naturally, at your demo narration pace: ║
|
|
20
|
+
║ ║
|
|
21
|
+
║ "Hi, my name is [YOUR NAME]. I build developer tools and love ║
|
|
22
|
+
║ creating great product demos. The quick brown fox jumps over ║
|
|
23
|
+
║ the lazy dog. Pack my box with five dozen liquor jugs." ║
|
|
24
|
+
║ ║
|
|
25
|
+
║ Tips: ║
|
|
26
|
+
║ • Sit ~6 inches from the mic ║
|
|
27
|
+
║ • Use a quiet room (close windows, turn off fans) ║
|
|
28
|
+
║ • Speak clearly at your natural pace ║
|
|
29
|
+
║ • Aim for 5–15 seconds ║
|
|
30
|
+
║ ║
|
|
31
|
+
╚══════════════════════════════════════════════════════════════════╝
|
|
32
|
+
|
|
33
|
+
PROMPT
|
|
34
|
+
|
|
35
|
+
echo "Press ENTER to start recording (Ctrl+C to cancel)..."
|
|
36
|
+
read -r
|
|
37
|
+
|
|
38
|
+
echo "🎙 Recording... Press Ctrl+C when done."
|
|
39
|
+
|
|
40
|
+
# Record using macOS Core Audio (avfoundation) via ffmpeg
|
|
41
|
+
# Uses the default input device (built-in mic or whatever is selected in
|
|
42
|
+
# System Settings > Sound > Input)
|
|
43
|
+
ffmpeg -y -f avfoundation -i ":default" \
|
|
44
|
+
-acodec pcm_s16le -ar 44100 -ac 1 \
|
|
45
|
+
"$RAW_FILE" 2>/dev/null || true
|
|
46
|
+
|
|
47
|
+
echo ""
|
|
48
|
+
echo "Processing audio..."
|
|
49
|
+
|
|
50
|
+
# Get duration
|
|
51
|
+
DURATION=$(ffprobe -v error -show_entries format=duration \
|
|
52
|
+
-of csv=p=0 "$RAW_FILE" 2>/dev/null || echo "0")
|
|
53
|
+
|
|
54
|
+
if [ "$DURATION" = "0" ] || [ "$(echo "$DURATION < 2" | bc -l 2>/dev/null || echo 0)" = "1" ]; then
|
|
55
|
+
echo "Error: Recording too short or failed. Try again."
|
|
56
|
+
rm -f "$RAW_FILE"
|
|
57
|
+
exit 1
|
|
58
|
+
fi
|
|
59
|
+
|
|
60
|
+
# Create output directory if needed
|
|
61
|
+
mkdir -p "$(dirname "$OUTPUT")"
|
|
62
|
+
|
|
63
|
+
# Process: mono, 24kHz, noise-reduced, normalized
|
|
64
|
+
ffmpeg -y -i "$RAW_FILE" \
|
|
65
|
+
-af "highpass=f=80,lowpass=f=12000,afftdn=nf=-25,loudnorm=I=-16:TP=-1.5:LRA=11" \
|
|
66
|
+
-ar 24000 -ac 1 -acodec pcm_s16le \
|
|
67
|
+
"$OUTPUT" 2>/dev/null
|
|
68
|
+
|
|
69
|
+
rm -f "$RAW_FILE"
|
|
70
|
+
|
|
71
|
+
FINAL_DURATION=$(ffprobe -v error -show_entries format=duration \
|
|
72
|
+
-of csv=p=0 "$OUTPUT" 2>/dev/null)
|
|
73
|
+
|
|
74
|
+
echo ""
|
|
75
|
+
echo "Done! Saved to: $OUTPUT (${FINAL_DURATION}s)"
|
|
76
|
+
echo ""
|
|
77
|
+
echo "Next steps:"
|
|
78
|
+
echo " 1. Listen: ffplay $OUTPUT"
|
|
79
|
+
echo " 2. Add to your argo config:"
|
|
80
|
+
echo ""
|
|
81
|
+
echo " tts: {"
|
|
82
|
+
echo " engine: engines.mlxAudio({"
|
|
83
|
+
echo " model: 'mlx-community/Qwen3-TTS-12Hz-0.6B-Base-bf16',"
|
|
84
|
+
echo " refAudio: './$OUTPUT',"
|
|
85
|
+
echo " refText: 'YOUR EXACT TRANSCRIPT HERE',"
|
|
86
|
+
echo " }),"
|
|
87
|
+
echo " }"
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
#
|
|
3
|
+
# Set up mlx-audio with all dependencies for Argo voice cloning.
|
|
4
|
+
# Usage: ./scripts/setup-mlx-audio.sh
|
|
5
|
+
#
|
|
6
|
+
# Creates a .venv in the project root with mlx-audio and its
|
|
7
|
+
# transitive deps that aren't auto-installed.
|
|
8
|
+
#
|
|
9
|
+
set -euo pipefail
|
|
10
|
+
|
|
11
|
+
VENV_DIR="${1:-.venv}"
|
|
12
|
+
|
|
13
|
+
echo "╔══════════════════════════════════════════════╗"
|
|
14
|
+
echo "║ Argo — mlx-audio setup (Apple Silicon) ║"
|
|
15
|
+
echo "╚══════════════════════════════════════════════╝"
|
|
16
|
+
echo ""
|
|
17
|
+
|
|
18
|
+
# Check for uv
|
|
19
|
+
if ! command -v uv &>/dev/null; then
|
|
20
|
+
echo "✗ uv not found. Install it: curl -LsSf https://astral.sh/uv/install.sh | sh"
|
|
21
|
+
exit 1
|
|
22
|
+
fi
|
|
23
|
+
|
|
24
|
+
# Check for Apple Silicon
|
|
25
|
+
if [[ "$(uname -m)" != "arm64" ]]; then
|
|
26
|
+
echo "! Warning: mlx-audio is optimized for Apple Silicon (arm64)."
|
|
27
|
+
echo " Current arch: $(uname -m). Performance may be poor."
|
|
28
|
+
fi
|
|
29
|
+
|
|
30
|
+
# Create venv
|
|
31
|
+
echo "★ Creating venv at ${VENV_DIR}..."
|
|
32
|
+
uv venv "$VENV_DIR"
|
|
33
|
+
|
|
34
|
+
# Install mlx-audio + missing transitive deps
|
|
35
|
+
echo "★ Installing mlx-audio..."
|
|
36
|
+
uv pip install -p "$VENV_DIR" mlx-audio
|
|
37
|
+
|
|
38
|
+
echo "★ Installing missing transitive dependencies..."
|
|
39
|
+
uv pip install -p "$VENV_DIR" "misaki[en]" num2words pip
|
|
40
|
+
|
|
41
|
+
# setuptools < 70 needed for webrtcvad's pkg_resources import
|
|
42
|
+
echo "★ Installing setuptools (< 70 for pkg_resources compat)..."
|
|
43
|
+
uv pip install -p "$VENV_DIR" "setuptools<70"
|
|
44
|
+
|
|
45
|
+
# Server deps (for OpenAI-compatible HTTP API)
|
|
46
|
+
echo "★ Installing server dependencies..."
|
|
47
|
+
uv pip install -p "$VENV_DIR" uvicorn fastapi python-multipart webrtcvad
|
|
48
|
+
|
|
49
|
+
echo ""
|
|
50
|
+
echo "✓ Done! mlx-audio installed at ${VENV_DIR}"
|
|
51
|
+
echo ""
|
|
52
|
+
echo "Usage:"
|
|
53
|
+
echo " # Start the TTS server"
|
|
54
|
+
echo " ${VENV_DIR}/bin/python3 -m mlx_audio.server --port 8000"
|
|
55
|
+
echo ""
|
|
56
|
+
echo " # Record a voice reference clip"
|
|
57
|
+
echo " ./scripts/record-voice-ref.sh assets/ref-voice.wav"
|
|
58
|
+
echo ""
|
|
59
|
+
echo " # Preview cloned voice"
|
|
60
|
+
echo " ./scripts/voice-clone-preview.sh \\"
|
|
61
|
+
echo " --ref-audio assets/ref-voice.wav \\"
|
|
62
|
+
echo " --ref-text 'Your transcript here.' \\"
|
|
63
|
+
echo " --voiceover demos/showcase.voiceover.json --play"
|
|
64
|
+
echo ""
|
|
65
|
+
echo " # Use in argo config"
|
|
66
|
+
echo " engines.mlxAudio({ model: 'mlx-community/Qwen3-TTS-12Hz-0.6B-Base-bf16' })"
|