@argo-video/cli 0.7.1 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,25 +1,67 @@
1
1
  export class MlxAudioEngine {
2
2
  baseUrl;
3
3
  model;
4
+ serverOptions;
4
5
  constructor(options) {
5
6
  this.baseUrl = options?.baseUrl ?? 'http://localhost:8000';
6
7
  this.model = options?.model ?? 'mlx-community/Spark-TTS-0.5B-bf16';
8
+ if (options?.refAudio && !options.refText) {
9
+ throw new Error('refText is required when refAudio is set for voice cloning');
10
+ }
11
+ // Build the optional server params, converting camelCase to snake_case keys
12
+ this.serverOptions = {};
13
+ if (options?.refAudio != null)
14
+ this.serverOptions.ref_audio = options.refAudio;
15
+ if (options?.refText != null)
16
+ this.serverOptions.ref_text = options.refText;
17
+ if (options?.instruct != null)
18
+ this.serverOptions.instruct = options.instruct;
19
+ if (options?.gender != null)
20
+ this.serverOptions.gender = options.gender;
21
+ if (options?.pitch != null)
22
+ this.serverOptions.pitch = options.pitch;
23
+ if (options?.langCode != null)
24
+ this.serverOptions.lang_code = options.langCode;
25
+ if (options?.temperature != null)
26
+ this.serverOptions.temperature = options.temperature;
27
+ if (options?.topP != null)
28
+ this.serverOptions.top_p = options.topP;
29
+ if (options?.topK != null)
30
+ this.serverOptions.top_k = options.topK;
31
+ if (options?.repetitionPenalty != null)
32
+ this.serverOptions.repetition_penalty = options.repetitionPenalty;
33
+ if (options?.responseFormat != null)
34
+ this.serverOptions.response_format = options.responseFormat;
35
+ if (options?.stream != null)
36
+ this.serverOptions.stream = options.stream;
37
+ if (options?.streamingInterval != null)
38
+ this.serverOptions.streaming_interval = options.streamingInterval;
39
+ if (options?.maxTokens != null)
40
+ this.serverOptions.max_tokens = options.maxTokens;
41
+ if (options?.verbose != null)
42
+ this.serverOptions.verbose = options.verbose;
7
43
  }
8
44
  async generate(text, options) {
9
45
  if (!text?.trim())
10
46
  throw new Error('TTS text must not be empty');
11
47
  const controller = new AbortController();
12
48
  const timeout = setTimeout(() => controller.abort(), 180_000);
49
+ const payload = {
50
+ model: this.model,
51
+ input: text,
52
+ voice: options.voice ?? 'af_heart',
53
+ ...this.serverOptions,
54
+ };
55
+ // TTSEngineOptions.speed maps to the server's speed field
56
+ if (options.speed != null) {
57
+ payload.speed = options.speed;
58
+ }
13
59
  let response;
14
60
  try {
15
61
  response = await fetch(`${this.baseUrl}/v1/audio/speech`, {
16
62
  method: 'POST',
17
63
  headers: { 'Content-Type': 'application/json' },
18
- body: JSON.stringify({
19
- model: this.model,
20
- input: text,
21
- voice: options.voice ?? 'af_heart',
22
- }),
64
+ body: JSON.stringify(payload),
23
65
  signal: controller.signal,
24
66
  });
25
67
  }
@@ -1 +1 @@
1
- {"version":3,"file":"mlx-audio.js","sourceRoot":"","sources":["../../../src/tts/engines/mlx-audio.ts"],"names":[],"mappings":"AASA,MAAM,OAAO,cAAc;IACjB,OAAO,CAAS;IAChB,KAAK,CAAS;IAEtB,YAAY,OAA+B;QACzC,IAAI,CAAC,OAAO,GAAG,OAAO,EAAE,OAAO,IAAI,uBAAuB,CAAC;QAC3D,IAAI,CAAC,KAAK,GAAG,OAAO,EAAE,KAAK,IAAI,mCAAmC,CAAC;IACrE,CAAC;IAED,KAAK,CAAC,QAAQ,CAAC,IAAY,EAAE,OAAyB;QACpD,IAAI,CAAC,IAAI,EAAE,IAAI,EAAE;YAAE,MAAM,IAAI,KAAK,CAAC,4BAA4B,CAAC,CAAC;QAEjE,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAC;QACzC,MAAM,OAAO,GAAG,UAAU,CAAC,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,OAAO,CAAC,CAAC;QAE9D,IAAI,QAAQ,CAAC;QACb,IAAI,CAAC;YACH,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,IAAI,CAAC,OAAO,kBAAkB,EAAE;gBACxD,MAAM,EAAE,MAAM;gBACd,OAAO,EAAE,EAAE,cAAc,EAAE,kBAAkB,EAAE;gBAC/C,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC;oBACnB,KAAK,EAAE,IAAI,CAAC,KAAK;oBACjB,KAAK,EAAE,IAAI;oBACX,KAAK,EAAE,OAAO,CAAC,KAAK,IAAI,UAAU;iBACnC,CAAC;gBACF,MAAM,EAAE,UAAU,CAAC,MAAM;aAC1B,CAAC,CAAC;QACL,CAAC;gBAAS,CAAC;YACT,YAAY,CAAC,OAAO,CAAC,CAAC;QACxB,CAAC;QAED,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;YACjB,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;YACnC,MAAM,IAAI,KAAK,CACb,0BAA0B,QAAQ,CAAC,MAAM,KAAK,IAAI,IAAI;gBACtD,qEAAqE,IAAI,CAAC,KAAK,EAAE,CAClF,CAAC;QACJ,CAAC;QAED,MAAM,WAAW,GAAG,MAAM,CAAC,IAAI,CAAC,MAAM,QAAQ,CAAC,WAAW,EAAE,CAAC,CAAC;QAE9D,kDAAkD;QAClD,MAAM,EAAE,YAAY,EAAE,GAAG,MAAM,MAAM,CAAC,cAAc,CAAC,CAAC;QACtD,OAAO,YAAY,CAAC,WAAW,CAAC,CAAC;IACnC,CAAC;CACF"}
1
+ {"version":3,"file":"mlx-audio.js","sourceRoot":"","sources":["../../../src/tts/engines/mlx-audio.ts"],"names":[],"mappings":"AAuCA,MAAM,OAAO,cAAc;IACjB,OAAO,CAAS;IAChB,KAAK,CAAS;IACd,aAAa,CAA0B;IAE/C,YAAY,OAA+B;QACzC,IAAI,CAAC,OAAO,GAAG,OAAO,EAAE,OAAO,IAAI,uBAAuB,CAAC;QAC3D,IAAI,CAAC,KAAK,GAAG,OAAO,EAAE,KAAK,IAAI,mCAAmC,CAAC;QAEnE,IAAI,OAAO,EAAE,QAAQ,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC;YAC1C,MAAM,IAAI,KAAK,CAAC,4DAA4D,CAAC,CAAC;QAChF,CAAC;QAED,4EAA4E;QAC5E,IAAI,CAAC,aAAa,GAAG,EAAE,CAAC;QACxB,IAAI,OAAO,EAAE,QAAQ,IAAI,IAAI;YAAE,IAAI,CAAC,aAAa,CAAC,SAAS,GAAG,OAAO,CAAC,QAAQ,CAAC;QAC/E,IAAI,OAAO,EAAE,OAAO,IAAI,IAAI;YAAE,IAAI,CAAC,aAAa,CAAC,QAAQ,GAAG,OAAO,CAAC,OAAO,CAAC;QAC5E,IAAI,OAAO,EAAE,QAAQ,IAAI,IAAI;YAAE,IAAI,CAAC,aAAa,CAAC,QAAQ,GAAG,OAAO,CAAC,QAAQ,CAAC;QAC9E,IAAI,OAAO,EAAE,MAAM,IAAI,IAAI;YAAE,IAAI,CAAC,aAAa,CAAC,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;QACxE,IAAI,OAAO,EAAE,KAAK,IAAI,IAAI;YAAE,IAAI,CAAC,aAAa,CAAC,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC;QACrE,IAAI,OAAO,EAAE,QAAQ,IAAI,IAAI;YAAE,IAAI,CAAC,aAAa,CAAC,SAAS,GAAG,OAAO,CAAC,QAAQ,CAAC;QAC/E,IAAI,OAAO,EAAE,WAAW,IAAI,IAAI;YAAE,IAAI,CAAC,aAAa,CAAC,WAAW,GAAG,OAAO,CAAC,WAAW,CAAC;QACvF,IAAI,OAAO,EAAE,IAAI,IAAI,IAAI;YAAE,IAAI,CAAC,aAAa,CAAC,KAAK,GAAG,OAAO,CAAC,IAAI,CAAC;QACnE,IAAI,OAAO,EAAE,IAAI,IAAI,IAAI;YAAE,IAAI,CAAC,aAAa,CAAC,KAAK,GAAG,OAAO,CAAC,IAAI,CAAC;QACnE,IAAI,OAAO,EAAE,iBAAiB,IAAI,IAAI;YAAE,IAAI,CAAC,aAAa,CAAC,kBAAkB,GAAG,OAAO,CAAC,iBAAiB,CAAC;QAC1G,IAAI,OAAO,EAAE,cAAc,IAAI,IAAI;YAAE,IAAI,CAAC,aAAa,CAAC,eAAe,GAAG,OAAO,CAAC,cAAc,CAAC;QACjG,IAAI,OAAO,EAAE,MAAM,IAAI,IAAI;YAAE,IAAI,CAAC,aAAa,CAAC,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;QACxE,IAAI,OAAO,EAAE,iBAAiB,IAAI,IAAI;YAAE,IAAI,CAAC,aAAa,CAAC,kBAAkB,GAAG,OAAO,CAAC,iBAAiB,CAAC;QAC1G,IAAI,OAAO,EAAE,SAAS,IAAI,IAAI;YAAE,IAAI,CAAC,aAAa,CAAC,UAAU,GAAG,OAAO,CAAC,SAAS,CAAC;QAClF,IAAI,OAAO,EAAE,OAAO,IAAI,IAAI;YAAE,IAAI,CAAC,aAAa,CAAC,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC;IAC7E,CAAC;IAED,KAAK,CAAC,QAAQ,CAAC,IAAY,EAAE,OAAyB;QACpD,IAAI,CAAC,IAAI,EAAE,IAAI,EAAE;YAAE,MAAM,IAAI,KAAK,CAAC,4BAA4B,CAAC,CAAC;
QAEjE,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAC;QACzC,MAAM,OAAO,GAAG,UAAU,CAAC,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,OAAO,CAAC,CAAC;QAE9D,MAAM,OAAO,GAA4B;YACvC,KAAK,EAAE,IAAI,CAAC,KAAK;YACjB,KAAK,EAAE,IAAI;YACX,KAAK,EAAE,OAAO,CAAC,KAAK,IAAI,UAAU;YAClC,GAAG,IAAI,CAAC,aAAa;SACtB,CAAC;QAEF,0DAA0D;QAC1D,IAAI,OAAO,CAAC,KAAK,IAAI,IAAI,EAAE,CAAC;YAC1B,OAAO,CAAC,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC;QAChC,CAAC;QAED,IAAI,QAAQ,CAAC;QACb,IAAI,CAAC;YACH,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,IAAI,CAAC,OAAO,kBAAkB,EAAE;gBACxD,MAAM,EAAE,MAAM;gBACd,OAAO,EAAE,EAAE,cAAc,EAAE,kBAAkB,EAAE;gBAC/C,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC,OAAO,CAAC;gBAC7B,MAAM,EAAE,UAAU,CAAC,MAAM;aAC1B,CAAC,CAAC;QACL,CAAC;gBAAS,CAAC;YACT,YAAY,CAAC,OAAO,CAAC,CAAC;QACxB,CAAC;QAED,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;YACjB,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;YACnC,MAAM,IAAI,KAAK,CACb,0BAA0B,QAAQ,CAAC,MAAM,KAAK,IAAI,IAAI;gBACtD,qEAAqE,IAAI,CAAC,KAAK,EAAE,CAClF,CAAC;QACJ,CAAC;QAED,MAAM,WAAW,GAAG,MAAM,CAAC,IAAI,CAAC,MAAM,QAAQ,CAAC,WAAW,EAAE,CAAC,CAAC;QAE9D,kDAAkD;QAClD,MAAM,EAAE,YAAY,EAAE,GAAG,MAAM,MAAM,CAAC,cAAc,CAAC,CAAC;QACtD,OAAO,YAAY,CAAC,WAAW,CAAC,CAAC;IACnC,CAAC;CACF"}
@@ -1 +1 @@
1
- {"version":3,"file":"sarvam.d.ts","sourceRoot":"","sources":["../../../src/tts/engines/sarvam.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,SAAS,EAAE,gBAAgB,EAAE,MAAM,cAAc,CAAC;AAEhE,MAAM,WAAW,mBAAmB;IAClC,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED,qBAAa,YAAa,YAAW,SAAS;IAC5C,OAAO,CAAC,MAAM,CAAS;IACvB,OAAO,CAAC,KAAK,CAAS;gBAEV,OAAO,CAAC,EAAE,mBAAmB;IAKzC,OAAO,CAAC,aAAa;IAWf,QAAQ,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,gBAAgB,GAAG,OAAO,CAAC,MAAM,CAAC;CAsCzE"}
1
+ {"version":3,"file":"sarvam.d.ts","sourceRoot":"","sources":["../../../src/tts/engines/sarvam.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,SAAS,EAAE,gBAAgB,EAAE,MAAM,cAAc,CAAC;AAEhE,MAAM,WAAW,mBAAmB;IAClC,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED,qBAAa,YAAa,YAAW,SAAS;IAC5C,OAAO,CAAC,MAAM,CAAS;IACvB,OAAO,CAAC,KAAK,CAAS;gBAEV,OAAO,CAAC,EAAE,mBAAmB;IAKzC,OAAO,CAAC,aAAa;IAWf,QAAQ,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,gBAAgB,GAAG,OAAO,CAAC,MAAM,CAAC;CAoCzE"}
@@ -16,33 +16,30 @@ export class SarvamEngine {
16
16
  async generate(text, options) {
17
17
  if (!text?.trim())
18
18
  throw new Error('TTS text must not be empty');
19
- const response = await fetch('https://api.sarvam.ai/text-to-speech', {
20
- method: 'POST',
21
- headers: {
22
- 'Content-Type': 'application/json',
23
- 'API-Subscription-Key': this.resolveApiKey(),
24
- },
25
- body: JSON.stringify({
26
- inputs: [text],
27
- target_language_code: options.lang ?? 'hi-IN',
28
- speaker: options.voice ?? 'meera',
29
- model: this.model,
30
- pitch: 0,
31
- pace: options.speed ?? 1.0,
32
- loudness: 1.5,
33
- enable_preprocessing: true,
34
- }),
35
- });
36
- if (!response.ok) {
37
- const body = await response.text();
38
- throw new Error(`Sarvam TTS API error ${response.status}: ${body}`);
19
+ let SarvamAI;
20
+ try {
21
+ // @ts-ignore — sarvamai is an optional dependency
22
+ ({ default: SarvamAI } = await import('sarvamai'));
23
+ }
24
+ catch {
25
+ throw new Error("Sarvam TTS engine requires the 'sarvamai' package. Install it with: npm i sarvamai");
39
26
  }
40
- const json = await response.json();
41
- if (!json.audios?.[0]) {
27
+ const client = new SarvamAI({ apiSubscriptionKey: this.resolveApiKey() });
28
+ const response = await client.textToSpeech.convert({
29
+ inputs: [text],
30
+ target_language_code: options.lang ?? 'hi-IN',
31
+ speaker: options.voice ?? 'meera',
32
+ model: this.model,
33
+ pitch: 0,
34
+ pace: options.speed ?? 1.0,
35
+ loudness: 1.5,
36
+ enable_preprocessing: true,
37
+ });
38
+ if (!response.audios?.[0]) {
42
39
  throw new Error('Sarvam TTS returned no audio data');
43
40
  }
44
41
  // Sarvam returns base64-encoded WAV
45
- const audioBuffer = Buffer.from(json.audios[0], 'base64');
42
+ const audioBuffer = Buffer.from(response.audios[0], 'base64');
46
43
  // Convert to Argo WAV format (mono Float32 24kHz)
47
44
  const { convertToWav } = await import('../engine.js');
48
45
  return convertToWav(audioBuffer);
@@ -1 +1 @@
1
- {"version":3,"file":"sarvam.js","sourceRoot":"","sources":["../../../src/tts/engines/sarvam.ts"],"names":[],"mappings":"AAOA,MAAM,OAAO,YAAY;IACf,MAAM,CAAS;IACf,KAAK,CAAS;IAEtB,YAAY,OAA6B;QACvC,IAAI,CAAC,MAAM,GAAG,OAAO,EAAE,MAAM,IAAI,EAAE,CAAC;QACpC,IAAI,CAAC,KAAK,GAAG,OAAO,EAAE,KAAK,IAAI,WAAW,CAAC;IAC7C,CAAC;IAEO,aAAa;QACnB,MAAM,GAAG,GAAG,IAAI,CAAC,MAAM,IAAI,OAAO,CAAC,GAAG,CAAC,cAAc,IAAI,EAAE,CAAC;QAC5D,IAAI,CAAC,GAAG,EAAE,CAAC;YACT,MAAM,IAAI,KAAK,CACb,yCAAyC;gBACzC,gEAAgE,CACjE,CAAC;QACJ,CAAC;QACD,OAAO,GAAG,CAAC;IACb,CAAC;IAED,KAAK,CAAC,QAAQ,CAAC,IAAY,EAAE,OAAyB;QACpD,IAAI,CAAC,IAAI,EAAE,IAAI,EAAE;YAAE,MAAM,IAAI,KAAK,CAAC,4BAA4B,CAAC,CAAC;QAEjE,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,sCAAsC,EAAE;YACnE,MAAM,EAAE,MAAM;YACd,OAAO,EAAE;gBACP,cAAc,EAAE,kBAAkB;gBAClC,sBAAsB,EAAE,IAAI,CAAC,aAAa,EAAE;aAC7C;YACD,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC;gBACnB,MAAM,EAAE,CAAC,IAAI,CAAC;gBACd,oBAAoB,EAAE,OAAO,CAAC,IAAI,IAAI,OAAO;gBAC7C,OAAO,EAAE,OAAO,CAAC,KAAK,IAAI,OAAO;gBACjC,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,KAAK,EAAE,CAAC;gBACR,IAAI,EAAE,OAAO,CAAC,KAAK,IAAI,GAAG;gBAC1B,QAAQ,EAAE,GAAG;gBACb,oBAAoB,EAAE,IAAI;aAC3B,CAAC;SACH,CAAC,CAAC;QAEH,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;YACjB,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;YACnC,MAAM,IAAI,KAAK,CAAC,wBAAwB,QAAQ,CAAC,MAAM,KAAK,IAAI,EAAE,CAAC,CAAC;QACtE,CAAC;QAED,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAA2B,CAAC;QAC5D,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;YACtB,MAAM,IAAI,KAAK,CAAC,mCAAmC,CAAC,CAAC;QACvD,CAAC;QAED,oCAAoC;QACpC,MAAM,WAAW,GAAG,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,QAAQ,CAAC,CAAC;QAE1D,kDAAkD;QAClD,MAAM,EAAE,YAAY,EAAE,GAAG,MAAM,MAAM,CAAC,cAAc,CAAC,CAAC;QACtD,OAAO,YAAY,CAAC,WAAW,CAAC,CAAC;IACnC,CAAC;CACF"}
1
+ {"version":3,"file":"sarvam.js","sourceRoot":"","sources":["../../../src/tts/engines/sarvam.ts"],"names":[],"mappings":"AAOA,MAAM,OAAO,YAAY;IACf,MAAM,CAAS;IACf,KAAK,CAAS;IAEtB,YAAY,OAA6B;QACvC,IAAI,CAAC,MAAM,GAAG,OAAO,EAAE,MAAM,IAAI,EAAE,CAAC;QACpC,IAAI,CAAC,KAAK,GAAG,OAAO,EAAE,KAAK,IAAI,WAAW,CAAC;IAC7C,CAAC;IAEO,aAAa;QACnB,MAAM,GAAG,GAAG,IAAI,CAAC,MAAM,IAAI,OAAO,CAAC,GAAG,CAAC,cAAc,IAAI,EAAE,CAAC;QAC5D,IAAI,CAAC,GAAG,EAAE,CAAC;YACT,MAAM,IAAI,KAAK,CACb,yCAAyC;gBACzC,gEAAgE,CACjE,CAAC;QACJ,CAAC;QACD,OAAO,GAAG,CAAC;IACb,CAAC;IAED,KAAK,CAAC,QAAQ,CAAC,IAAY,EAAE,OAAyB;QACpD,IAAI,CAAC,IAAI,EAAE,IAAI,EAAE;YAAE,MAAM,IAAI,KAAK,CAAC,4BAA4B,CAAC,CAAC;QAEjE,IAAI,QAAa,CAAC;QAClB,IAAI,CAAC;YACH,kDAAkD;YAClD,CAAC,EAAE,OAAO,EAAE,QAAQ,EAAE,GAAG,MAAM,MAAM,CAAC,UAAU,CAAC,CAAC,CAAC;QACrD,CAAC;QAAC,MAAM,CAAC;YACP,MAAM,IAAI,KAAK,CACb,oFAAoF,CACrF,CAAC;QACJ,CAAC;QAED,MAAM,MAAM,GAAG,IAAI,QAAQ,CAAC,EAAE,kBAAkB,EAAE,IAAI,CAAC,aAAa,EAAE,EAAE,CAAC,CAAC;QAC1E,MAAM,QAAQ,GAAG,MAAM,MAAM,CAAC,YAAY,CAAC,OAAO,CAAC;YACjD,MAAM,EAAE,CAAC,IAAI,CAAC;YACd,oBAAoB,EAAE,OAAO,CAAC,IAAI,IAAI,OAAO;YAC7C,OAAO,EAAE,OAAO,CAAC,KAAK,IAAI,OAAO;YACjC,KAAK,EAAE,IAAI,CAAC,KAAK;YACjB,KAAK,EAAE,CAAC;YACR,IAAI,EAAE,OAAO,CAAC,KAAK,IAAI,GAAG;YAC1B,QAAQ,EAAE,GAAG;YACb,oBAAoB,EAAE,IAAI;SAC3B,CAAC,CAAC;QAEH,IAAI,CAAC,QAAQ,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;YAC1B,MAAM,IAAI,KAAK,CAAC,mCAAmC,CAAC,CAAC;QACvD,CAAC;QAED,oCAAoC;QACpC,MAAM,WAAW,GAAG,MAAM,CAAC,IAAI,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,QAAQ,CAAC,CAAC;QAE9D,kDAAkD;QAClD,MAAM,EAAE,YAAY,EAAE,GAAG,MAAM,MAAM,CAAC,cAAc,CAAC,CAAC;QACtD,OAAO,YAAY,CAAC,WAAW,CAAC,CAAC;IACnC,CAAC;CACF"}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@argo-video/cli",
3
- "version": "0.7.1",
3
+ "version": "0.9.0",
4
4
  "description": "Turn Playwright demo scripts into polished product demo videos with AI voiceover",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",
@@ -29,6 +29,12 @@
29
29
  "commander": "^12.0.0",
30
30
  "kokoro-js": "^1.2.1"
31
31
  },
32
+ "optionalDependencies": {
33
+ "openai": "^4.0.0",
34
+ "elevenlabs": "^1.0.0",
35
+ "@google/genai": "^1.0.0",
36
+ "sarvamai": "^1.0.0"
37
+ },
32
38
  "publishConfig": {
33
39
  "access": "public"
34
40
  },
@@ -39,7 +45,8 @@
39
45
  "license": "MIT",
40
46
  "files": [
41
47
  "dist",
42
- "bin"
48
+ "bin",
49
+ "scripts"
43
50
  ],
44
51
  "devDependencies": {
45
52
  "@playwright/test": "^1.50.0",
@@ -0,0 +1,174 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ import random
5
+
6
+ from PIL import Image, ImageDraw, ImageFilter, ImageFont
7
+
8
+
9
# Repository root: the directory one level above the folder containing this script.
ROOT = Path(__file__).resolve().parent.parent
ASSETS = ROOT / "assets"
# Final thumbnail written by main(); also read by normalize_mark() when no
# cached source mark exists yet.
OUTPUT_PATH = ASSETS / "logo-thumb.png"
# Cached, cropped logo mark.
SOURCE_MARK_PATH = ASSETS / "logo-mark-source.png"

# Canvas size in pixels.
WIDTH = 1920
HEIGHT = 1080

# Absolute font file paths.
# NOTE(review): these look like macOS system font locations - confirm before
# running on another platform.
MONO_FONT = "/System/Library/Fonts/SFNSMono.ttf"
DISPLAY_BOLD = "/Library/Fonts/SF-Compact-Display-Bold.otf"
19
+
20
+
21
def load_font(path: str, size: int) -> ImageFont.FreeTypeFont:
    """Load the TrueType/OpenType font at ``path`` at the given size.

    Any error raised by ``ImageFont.truetype`` (for example when the file
    cannot be opened) propagates to the caller.
    """
    font = ImageFont.truetype(path, size=size)
    return font
23
+
24
+
25
def normalize_mark() -> Image.Image:
    """Return the logo mark as an RGBA image, extracting it on first use.

    If ``SOURCE_MARK_PATH`` exists it is loaded and returned as-is. Otherwise
    the mark is isolated from the existing thumbnail at ``OUTPUT_PATH`` by
    keeping only pixels that pass a blue-ish colour threshold, recolouring them
    to a single flat colour, cropping to their bounding box, and caching the
    result at ``SOURCE_MARK_PATH``.

    Raises:
        FileNotFoundError: neither the cached mark nor the thumbnail exists.
        RuntimeError: no pixel in the thumbnail passed the colour threshold.
    """
    # Image.open is lazy and keeps the file handle open; convert() produces an
    # independent in-memory image, so the handle can be closed right away.
    if SOURCE_MARK_PATH.exists():
        with Image.open(SOURCE_MARK_PATH) as cached:
            return cached.convert("RGBA")

    if not OUTPUT_PATH.exists():
        raise FileNotFoundError(
            f"Neither {SOURCE_MARK_PATH} nor {OUTPUT_PATH} exists; "
            "cannot derive the logo mark"
        )

    with Image.open(OUTPUT_PATH) as opened:
        original = opened.convert("RGBA")

    mark = Image.new("RGBA", original.size, (0, 0, 0, 0))
    in_px = original.load()
    out_px = mark.load()

    for y in range(original.height):
        for x in range(original.width):
            r, g, b, _ = in_px[x, y]
            # Keep only pixels matching the mark's blue tone; everything else
            # stays fully transparent.
            if b > 150 and g > 110 and r < 140:
                out_px[x, y] = (107, 180, 255, 255)

    bbox = mark.getbbox()
    if bbox is None:
        raise RuntimeError("Could not isolate source logo mark")

    cropped = mark.crop(bbox)
    cropped.save(SOURCE_MARK_PATH)
    return cropped
47
+
48
+
49
def lerp(a: int, b: int, t: float) -> int:
    """Linearly interpolate from ``a`` (t=0) to ``b`` (t=1).

    The result is converted with ``int()``, i.e. truncated toward zero.
    """
    span = b - a
    return int(a + span * t)
51
+
52
+
53
def make_background() -> Image.Image:
    """Build the opaque WIDTH x HEIGHT gradient backdrop.

    Each channel is first interpolated horizontally between the top-left and
    top-right colours, then that value is interpolated vertically toward the
    bottom-right colour.
    """
    canvas = Image.new("RGBA", (WIDTH, HEIGHT), (0, 0, 0, 255))
    pixels = canvas.load()

    top_left = (8, 10, 18)
    top_right = (12, 14, 26)
    bottom_right = (18, 22, 37)

    # The horizontal blend depends only on the column, so compute it once.
    top_edge = [
        tuple(
            lerp(start, end, col / (WIDTH - 1))
            for start, end in zip(top_left, top_right)
        )
        for col in range(WIDTH)
    ]

    for row in range(HEIGHT):
        ty = row / (HEIGHT - 1)
        for col, edge in enumerate(top_edge):
            r, g, b = (
                lerp(channel, target, ty)
                for channel, target in zip(edge, bottom_right)
            )
            pixels[col, row] = (r, g, b, 255)

    return canvas
71
+
72
+
73
def add_radial_glow(base: Image.Image, center: tuple[int, int], radius: int, color: tuple[int, int, int], alpha: int, blur: int) -> None:
    """Composite a blurred, filled circle onto ``base`` in place.

    Args:
        base: RGBA image to draw onto.
        center: (x, y) centre of the circle.
        radius: Circle radius before blurring.
        color: RGB fill colour.
        alpha: Fill alpha appended to ``color``.
        blur: Gaussian blur radius applied to the circle layer.
    """
    cx, cy = center
    bounds = (cx - radius, cy - radius, cx + radius, cy + radius)

    overlay = Image.new("RGBA", base.size, (0, 0, 0, 0))
    ImageDraw.Draw(overlay).ellipse(bounds, fill=color + (alpha,))

    base.alpha_composite(overlay.filter(ImageFilter.GaussianBlur(blur)))
80
+
81
+
82
def add_grid(base: Image.Image) -> None:
    """Composite faint grid lines and 240 scattered points onto ``base``.

    Vertical lines are drawn every 96 px starting at x=120 and horizontal
    lines every 96 px starting at y=96. The point positions and alphas come
    from a generator seeded with 7, so the output is reproducible.
    """
    grid = Image.new("RGBA", base.size, (0, 0, 0, 0))
    draw = ImageDraw.Draw(grid)

    for x in range(120, WIDTH, 96):
        draw.line((x, 0, x, HEIGHT), fill=(92, 126, 188, 14), width=1)
    for y in range(96, HEIGHT, 96):
        draw.line((0, y, WIDTH, y), fill=(92, 126, 188, 10), width=1)

    # Use a private generator instead of random.seed(7): same sequence for the
    # same seed, but the process-wide random state is left untouched.
    rng = random.Random(7)
    for _ in range(240):
        x = rng.randint(0, WIDTH - 1)
        y = rng.randint(0, HEIGHT - 1)
        draw.point((x, y), fill=(160, 185, 255, rng.randint(10, 28)))

    base.alpha_composite(grid)
98
+
99
+
100
def draw_mark(base: Image.Image, mark: Image.Image) -> None:
    """Draw the logo mark, horizontally centred at y=200, onto ``base`` in place.

    The mark is scaled to 1440 px wide (aspect ratio kept). Layers are
    composited in this order: a blurred blue tint ellipse, a blurred copy of
    the mark, a second blurred copy offset 8 px down, and finally the sharp
    mark itself.
    """

    def blank_layer() -> Image.Image:
        return Image.new("RGBA", base.size, (0, 0, 0, 0))

    scale = 1440 / mark.width
    scaled = mark.resize((1440, int(mark.height * scale)), Image.Resampling.LANCZOS)
    left = (WIDTH - scaled.width) // 2
    top = 200

    halo = blank_layer()
    halo.alpha_composite(scaled.copy().filter(ImageFilter.GaussianBlur(14)), (left, top))

    wash = blank_layer()
    ImageDraw.Draw(wash).ellipse((420, 220, 1500, 760), fill=(71, 136, 255, 30))

    base.alpha_composite(wash.filter(ImageFilter.GaussianBlur(54)))
    base.alpha_composite(halo, (0, 0))

    soft_copy = blank_layer()
    soft_copy.alpha_composite(
        scaled.copy().filter(ImageFilter.GaussianBlur(18)), (left, top + 8)
    )
    base.alpha_composite(soft_copy)

    base.alpha_composite(scaled, (left, top))
123
+
124
+
125
def draw_command_card(base: Image.Image) -> None:
    """Draw the terminal-style command card and its caption onto ``base`` in place.

    A blurred drop shadow is composited first; the card body, title strip,
    three coloured dots, the command text and the centred caption are drawn on
    a separate layer that is composited last.
    """
    left, top, right, bottom = 500, 760, 1420, 892
    clear = (0, 0, 0, 0)

    # Drop shadow, offset 18 px below the card.
    shadow_layer = Image.new("RGBA", base.size, clear)
    ImageDraw.Draw(shadow_layer).rounded_rectangle(
        (left, top + 18, right, bottom + 18), radius=34, fill=(0, 0, 0, 120)
    )
    base.alpha_composite(shadow_layer.filter(ImageFilter.GaussianBlur(24)))

    card_layer = Image.new("RGBA", base.size, clear)
    pen = ImageDraw.Draw(card_layer)

    # Card body and title strip.
    pen.rounded_rectangle(
        (left, top, right, bottom),
        radius=34,
        fill=(14, 18, 30, 228),
        outline=(97, 127, 183, 88),
        width=2,
    )
    pen.rounded_rectangle(
        (left + 2, top + 2, right - 2, top + 38), radius=32, fill=(18, 23, 38, 245)
    )

    # Three 12 px dots spaced 22 px apart.
    dot_top = top + 20
    dot_colors = ((255, 95, 86), (255, 189, 46), (39, 201, 63))
    for position, rgb in enumerate(dot_colors):
        dot_left = left + 30 + position * 22
        pen.ellipse((dot_left, dot_top, dot_left + 12, dot_top + 12), fill=rgb)

    command_font = load_font(MONO_FONT, 40)
    caption_font = load_font(MONO_FONT, 24)

    # (x offset from the card's left edge, text, colour) for each command segment.
    text_top = top + 64
    segments = (
        (34, "$", (123, 217, 129)),
        (72, "npx argo", (110, 187, 255)),
        (330, "pipeline", (233, 239, 252)),
        (566, "showcase", (169, 146, 255)),
    )
    for dx, word, rgb in segments:
        pen.text((left + dx, text_top), word, font=command_font, fill=rgb)

    # Caption centred on the full canvas width, 24 px below the card.
    note = "local-first • webkit-friendly • retina-ready"
    x_min, _, x_max, _ = pen.textbbox((0, 0), note, font=caption_font)
    note_left = (WIDTH - (x_max - x_min)) / 2
    pen.text((note_left, bottom + 24), note, font=caption_font, fill=(100, 121, 160))

    base.alpha_composite(card_layer)
158
+
159
+
160
def main() -> None:
    """Render the thumbnail and write it to ``OUTPUT_PATH``.

    The mark is resolved before anything is written, because
    ``normalize_mark`` may read the existing file at ``OUTPUT_PATH``.
    """
    ASSETS.mkdir(exist_ok=True)
    mark = normalize_mark()
    canvas = make_background()

    # (center, radius, colour, alpha, blur) for each background glow.
    glows = (
        ((WIDTH // 2, 420), 420, (48, 99, 235), 48, 96),
        ((1470, 210), 220, (41, 91, 235), 42, 84),
        ((360, 910), 300, (43, 74, 170), 34, 92),
    )
    for center, radius, color, alpha, blur in glows:
        add_radial_glow(canvas, center, radius, color, alpha, blur)

    add_grid(canvas)
    draw_mark(canvas, mark)
    draw_command_card(canvas)

    flattened = canvas.convert("RGB")
    flattened.save(OUTPUT_PATH, quality=95)


if __name__ == "__main__":
    main()
@@ -0,0 +1,87 @@
1
#!/usr/bin/env bash
#
# Record and process a voice reference clip for mlx-audio voice cloning.
# Usage: ./scripts/record-voice-ref.sh [output_path]
#
#   output_path  Where to write the processed clip (default: assets/ref-voice.wav)
#
# Requirements: ffmpeg (brew install ffmpeg), macOS with built-in mic
#
# Exit status: 0 on success, 1 if a required tool is missing, the recording is
# shorter than 2 seconds, or processing fails.
#
set -euo pipefail

OUTPUT="${1:-assets/ref-voice.wav}"

# Fail early with a clear message instead of a bare "command not found".
for tool in ffmpeg ffprobe; do
  if ! command -v "$tool" >/dev/null 2>&1; then
    echo "Error: $tool not found. Install it with: brew install ffmpeg" >&2
    exit 1
  fi
done

# BSD mktemp (macOS) only substitutes trailing X's, so a template ending in
# ".wav" would not get a unique name. Create a private temp directory instead
# and keep the .wav extension on a file inside it.
WORK_DIR=$(mktemp -d "${TMPDIR:-/tmp}/voice-ref.XXXXXX")
RAW_FILE="$WORK_DIR/raw.wav"

# Remove the raw recording on every exit path, including failures under set -e.
trap 'rm -rf "$WORK_DIR"' EXIT

# Suggested text — covers a wide range of English phonemes
cat <<'PROMPT'
╔══════════════════════════════════════════════════════════════════╗
║ Voice Reference Recording ║
╠══════════════════════════════════════════════════════════════════╣
║ ║
║ Read the following text naturally, at your demo narration pace: ║
║ ║
║ "Hi, my name is [YOUR NAME]. I build developer tools and love ║
║ creating great product demos. The quick brown fox jumps over ║
║ the lazy dog. Pack my box with five dozen liquor jugs." ║
║ ║
║ Tips: ║
║ • Sit ~6 inches from the mic ║
║ • Use a quiet room (close windows, turn off fans) ║
║ • Speak clearly at your natural pace ║
║ • Aim for 5–15 seconds ║
║ ║
╚══════════════════════════════════════════════════════════════════╝

PROMPT

echo "Press ENTER to start recording (Ctrl+C to cancel)..."
read -r

echo "🎙 Recording... Press Ctrl+C when done."

# Record using macOS Core Audio (avfoundation) via ffmpeg
# Uses the default input device (built-in mic or whatever is selected in
# System Settings > Sound > Input)
#
# While recording, Ctrl+C is the normal way to stop. A no-op INT handler keeps
# this script running afterwards; trapped (non-ignored) signals are reset to
# their defaults in child processes, so ffmpeg still receives the interrupt.
# "|| true" accepts ffmpeg's non-zero exit status after being interrupted.
trap : INT
ffmpeg -y -f avfoundation -i ":default" \
  -acodec pcm_s16le -ar 44100 -ac 1 \
  "$RAW_FILE" 2>/dev/null || true
trap - INT

echo ""
echo "Processing audio..."

# Get duration
DURATION=$(ffprobe -v error -show_entries format=duration \
  -of csv=p=0 "$RAW_FILE" 2>/dev/null || echo "0")

# awk treats empty or non-numeric output (e.g. "N/A") as 0, so a failed probe
# is rejected here rather than slipping past the comparison.
if ! awk -v d="$DURATION" 'BEGIN { exit !(d + 0 >= 2) }'; then
  echo "Error: Recording too short or failed. Try again." >&2
  exit 1
fi

# Create output directory if needed
mkdir -p "$(dirname "$OUTPUT")"

# Process: mono, 24kHz, noise-reduced, normalized
if ! ffmpeg -y -i "$RAW_FILE" \
  -af "highpass=f=80,lowpass=f=12000,afftdn=nf=-25,loudnorm=I=-16:TP=-1.5:LRA=11" \
  -ar 24000 -ac 1 -acodec pcm_s16le \
  "$OUTPUT" 2>/dev/null; then
  echo "Error: ffmpeg failed to process the recording." >&2
  exit 1
fi

# The clip is already saved at this point; do not abort if the probe fails.
FINAL_DURATION=$(ffprobe -v error -show_entries format=duration \
  -of csv=p=0 "$OUTPUT" 2>/dev/null || echo "?")

echo ""
echo "Done! Saved to: $OUTPUT (${FINAL_DURATION}s)"
echo ""
echo "Next steps:"
echo " 1. Listen: ffplay $OUTPUT"
echo " 2. Add to your argo config:"
echo ""
echo " tts: {"
echo " engine: engines.mlxAudio({"
echo " model: 'mlx-community/Qwen3-TTS-12Hz-0.6B-Base-bf16',"
echo " refAudio: './$OUTPUT',"
echo " refText: 'YOUR EXACT TRANSCRIPT HERE',"
echo " }),"
echo " }"
@@ -0,0 +1,66 @@
1
#!/usr/bin/env bash
#
# Set up mlx-audio with all dependencies for Argo voice cloning.
# Usage: ./scripts/setup-mlx-audio.sh [venv_dir]
#
#   venv_dir  Directory for the virtual environment (default: .venv)
#
# Creates the venv with uv, then installs mlx-audio plus the transitive and
# server dependencies that aren't auto-installed.
#
set -euo pipefail

VENV_DIR="${1:-.venv}"

# Print a progress line, then install the given packages into the venv.
install_step() {
  local message=$1
  shift
  echo "★ ${message}"
  uv pip install -p "$VENV_DIR" "$@"
}

printf '%s\n' \
  "╔══════════════════════════════════════════════╗" \
  "║ Argo — mlx-audio setup (Apple Silicon) ║" \
  "╚══════════════════════════════════════════════╝" \
  ""

# Check for uv
command -v uv &>/dev/null || {
  echo "✗ uv not found. Install it: curl -LsSf https://astral.sh/uv/install.sh | sh"
  exit 1
}

# Check for Apple Silicon (warning only; setup continues either way)
case "$(uname -m)" in
  arm64) ;;
  *)
    echo "! Warning: mlx-audio is optimized for Apple Silicon (arm64)."
    echo " Current arch: $(uname -m). Performance may be poor."
    ;;
esac

# Create venv
echo "★ Creating venv at ${VENV_DIR}..."
uv venv "$VENV_DIR"

# Install mlx-audio + missing transitive deps
install_step "Installing mlx-audio..." mlx-audio
install_step "Installing missing transitive dependencies..." "misaki[en]" num2words pip

# setuptools < 70 needed for webrtcvad's pkg_resources import
install_step "Installing setuptools (< 70 for pkg_resources compat)..." "setuptools<70"

# Server deps (for OpenAI-compatible HTTP API)
install_step "Installing server dependencies..." uvicorn fastapi python-multipart webrtcvad

printf '%s\n' \
  "" \
  "✓ Done! mlx-audio installed at ${VENV_DIR}" \
  "" \
  "Usage:" \
  " # Start the TTS server" \
  " ${VENV_DIR}/bin/python3 -m mlx_audio.server --port 8000" \
  "" \
  " # Record a voice reference clip" \
  " ./scripts/record-voice-ref.sh assets/ref-voice.wav" \
  "" \
  " # Preview cloned voice" \
  " ./scripts/voice-clone-preview.sh \\" \
  " --ref-audio assets/ref-voice.wav \\" \
  " --ref-text 'Your transcript here.' \\" \
  " --voiceover demos/showcase.voiceover.json --play" \
  "" \
  " # Use in argo config" \
  " engines.mlxAudio({ model: 'mlx-community/Qwen3-TTS-12Hz-0.6B-Base-bf16' })"