speechflow 0.9.4 → 0.9.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -17,14 +17,19 @@ About
17
17
  **SpeechFlow** is a command-line interface based tool for establishing a
18
18
  directed data flow graph of audio and text processing nodes. This way,
19
19
  it allows performing various speech processing tasks in a flexible way.
20
- Currently, **SpeechFlow** comes with graph nodes for local file I/O, local audio
20
+
21
+ **SpeechFlow** comes with built-in graph nodes for local file I/O, local audio
21
22
  device I/O, local/remote WebSocket network I/O, cloud-based [Deepgram](https://deepgram.com)
22
23
  speech-to-text conversion, cloud-based [DeepL](https://deepl.com) text-to-text
23
24
  translation, local [Gemma/Ollama](https://ollama.com/library/gemma3)
24
25
  text-to-text translation, cloud-based [ElevenLabs](https://elevenlabs.io/)
25
26
  text-to-speech conversion, and local [FFmpeg](https://ffmpeg.org/)
26
- speech-to-speech encoding. **SpeechFlow** is written in TypeScript and
27
- ships as a package for the Node Package Manager (NPM).
27
+ speech-to-speech encoding. Additional SpeechFlow graph nodes can be provided externally
28
+ by NPM packages named `speechflow-node-xxx` which expose a class
29
+ derived from the exported `SpeechFlowNode` class of the `speechflow` package.
30
+
31
+ **SpeechFlow** is written in TypeScript and
32
+ ships as an installable package for the Node Package Manager (NPM).
28
33
 
29
34
  Installation
30
35
  ------------
@@ -2,6 +2,7 @@ import SpeechFlowNode from "./speechflow-node";
2
2
  export default class SpeechFlowNodeElevenlabs extends SpeechFlowNode {
3
3
  static name: string;
4
4
  private elevenlabs;
5
+ private static speexInitialized;
5
6
  constructor(id: string, opts: {
6
7
  [id: string]: any;
7
8
  }, args: any[]);
@@ -47,6 +47,7 @@ const node_events_1 = require("node:events");
47
47
  /* external dependencies */
48
48
  const ElevenLabs = __importStar(require("elevenlabs"));
49
49
  const get_stream_1 = require("get-stream");
50
+ const speex_resampler_1 = __importDefault(require("speex-resampler"));
50
51
  /* internal dependencies */
51
52
  const speechflow_node_1 = __importDefault(require("./speechflow-node"));
52
53
  /*
@@ -68,14 +69,17 @@ class SpeechFlowNodeElevenlabs extends speechflow_node_1.default {
68
69
  static name = "elevenlabs";
69
70
  /* internal state */
70
71
  elevenlabs = null;
72
+ static speexInitialized = false;
71
73
  /* construct node */
72
74
  constructor(id, opts, args) {
73
75
  super(id, opts, args);
74
76
  /* declare node configuration parameters */
75
77
  this.configure({
76
78
  key: { type: "string", val: process.env.SPEECHFLOW_KEY_ELEVENLABS },
77
- voice: { type: "string", val: "Brian", pos: 0 },
78
- language: { type: "string", val: "de", pos: 1 }
79
+ voice: { type: "string", val: "Brian", pos: 0, match: /^(?:.+)$/ },
80
+ language: { type: "string", val: "en", pos: 1, match: /^(?:de|en)$/ },
81
+ speed: { type: "number", val: 1.05, pos: 2, match: (n) => n >= 0.7 && n <= 1.2 },
82
+ optimize: { type: "string", val: "latency", pos: 3, match: /^(?:latency|quality)$/ }
79
83
  });
80
84
  /* declare node input/output format */
81
85
  this.input = "text";
@@ -83,39 +87,76 @@ class SpeechFlowNodeElevenlabs extends speechflow_node_1.default {
83
87
  }
84
88
  /* open node */
85
89
  async open() {
90
+ /* establish ElevenLabs API connection */
86
91
  this.elevenlabs = new ElevenLabs.ElevenLabsClient({
87
92
  apiKey: this.params.key
88
93
  });
94
+ /* determine maximum sample rate of ElevenLabs tier */
95
+ const maxSampleRates = {
96
+ "free": 16000,
97
+ "starter": 22050,
98
+ "creator": 24000,
99
+ "independent_publisher": 44100,
100
+ "growing_business": 44100,
101
+ "enterprise": 44100
102
+ };
103
+ const sub = await this.elevenlabs.user.getSubscription();
104
+ const tier = (sub.tier ?? "free");
105
+ this.log("info", `determined ElevenLabs tier: "${tier}"`);
106
+ let maxSampleRate = 16000;
107
+ if (maxSampleRates[tier] !== undefined)
108
+ maxSampleRate = maxSampleRates[tier];
109
+ this.log("info", `determined maximum audio sample rate: ${maxSampleRate}`);
110
+ /* determine voice for text-to-speech operation
111
+ (for details see https://elevenlabs.io/text-to-speech) */
89
112
  const voices = await this.elevenlabs.voices.getAll();
90
- const voice = voices.voices.find((voice) => voice.name === this.params.voice);
91
- if (voice === undefined)
92
- throw new Error(`invalid ElevenLabs voice "${this.params.voice}"`);
113
+ let voice = voices.voices.find((voice) => voice.name === this.params.voice);
114
+ if (voice === undefined) {
115
+ voice = voices.voices.find((voice) => voice.name.startsWith(this.params.voice));
116
+ if (voice === undefined)
117
+ throw new Error(`invalid ElevenLabs voice "${this.params.voice}"`);
118
+ }
119
+ const info = Object.keys(voice.labels ?? {}).length > 0 ?
120
+ (", " + Object.entries(voice.labels)
121
+ .map(([key, val]) => `${key}: "${val}"`).join(", ")) : "";
122
+ this.log("info", `selected voice: name: "${voice.name}"${info}`);
123
+ /* perform text-to-speech operation with Elevenlabs API */
124
+ const model = this.params.optimize === "quality" ?
125
+ "eleven_multilingual_v2" :
126
+ "eleven_flash_v2_5";
93
127
  const speechStream = (text) => {
94
128
  return this.elevenlabs.textToSpeech.convert(voice.voice_id, {
95
129
  text,
96
- optimize_streaming_latency: 2,
97
- output_format: "pcm_16000", // S16LE
98
- model_id: "eleven_flash_v2_5",
99
- /*
130
+ model_id: model,
131
+ language_code: this.params.language,
132
+ output_format: `pcm_${maxSampleRate}`,
133
+ seed: 815, /* arbitrary, but fixated by us */
100
134
  voice_settings: {
101
- stability: 0,
102
- similarity_boost: 0
135
+ speed: this.params.speed
103
136
  }
104
- */
105
137
  }, {
106
138
  timeoutInSeconds: 30,
107
139
  maxRetries: 10
108
140
  });
109
141
  };
142
+ /* internal queue of results */
110
143
  const queue = new node_events_1.EventEmitter();
144
+ /* establish resampler from the tier-dependent maximum ElevenLabs
145
+ output sample rate to our standard audio sample rate (48 kHz) */
146
+ if (!SpeechFlowNodeElevenlabs.speexInitialized) {
147
+ /* at least once initialize resampler */
148
+ await speex_resampler_1.default.initPromise;
149
+ SpeechFlowNodeElevenlabs.speexInitialized = true;
150
+ }
151
+ const resampler = new speex_resampler_1.default(1, maxSampleRate, this.config.audioSampleRate, 7);
152
+ /* create duplex stream and connect it to the ElevenLabs API */
111
153
  this.stream = new node_stream_1.default.Duplex({
112
154
  write(chunk, encoding, callback) {
113
- if (encoding !== "utf8" && encoding !== "utf-8")
114
- callback(new Error("only text input supported by Elevenlabs node"));
115
155
  const data = chunk.toString();
116
156
  speechStream(data).then((stream) => {
117
157
  (0, get_stream_1.getStreamAsBuffer)(stream).then((buffer) => {
118
- queue.emit("audio", buffer);
158
+ const bufferResampled = resampler.processChunk(buffer);
159
+ queue.emit("audio", bufferResampled);
119
160
  callback();
120
161
  }).catch((error) => {
121
162
  callback(error);
@@ -138,6 +179,9 @@ class SpeechFlowNodeElevenlabs extends speechflow_node_1.default {
138
179
  this.stream.destroy();
139
180
  this.stream = null;
140
181
  }
182
+ /* destroy ElevenLabs API */
183
+ if (this.elevenlabs !== null)
184
+ this.elevenlabs = null;
141
185
  }
142
186
  }
143
187
  exports.default = SpeechFlowNodeElevenlabs;
@@ -27,7 +27,7 @@ export default class SpeechFlowNode extends Events.EventEmitter {
27
27
  type: string;
28
28
  pos?: number;
29
29
  val?: any;
30
- match?: RegExp;
30
+ match?: RegExp | ((x: any) => boolean);
31
31
  };
32
32
  }): void;
33
33
  connect(other: SpeechFlowNode): void;
@@ -46,9 +46,11 @@ class SpeechFlowNode extends node_events_1.default.EventEmitter {
46
46
  throw new Error(`invalid type of named parameter "${name}" ` +
47
47
  `(has to be ${spec[name].type})`);
48
48
  if ("match" in spec[name]
49
- && this.opts[name].match(spec[name].match) === null)
50
- throw new Error(`invalid value of named parameter "${name}" ` +
51
- `(has to match ${spec[name].match})`);
49
+ && ((spec[name].match instanceof RegExp
50
+ && this.opts[name].match(spec[name].match) === null)
51
+ || (typeof spec[name].match === "function"
52
+ && !spec[name].match(this.opts[name]))))
53
+ throw new Error(`invalid value "${this.opts[name]}" of named parameter "${name}"`);
52
54
  this.params[name] = this.opts[name];
53
55
  }
54
56
  else if (this.opts[name] === undefined
@@ -63,6 +65,12 @@ class SpeechFlowNode extends node_events_1.default.EventEmitter {
63
65
  && this.args[spec[name].pos].match(spec[name].match) === null)
64
66
  throw new Error(`invalid value of positional parameter "${name}" ` +
65
67
  `(has to match ${spec[name].match})`);
68
+ if ("match" in spec[name]
69
+ && ((spec[name].match instanceof RegExp
70
+ && this.args[spec[name].pos].match(spec[name].match) === null)
71
+ || (typeof spec[name].match === "function"
72
+ && !spec[name].match(this.args[spec[name].pos]))))
73
+ throw new Error(`invalid value "${this.opts[name]}" of positional parameter "${name}"`);
66
74
  this.params[name] = this.args[spec[name].pos];
67
75
  }
68
76
  else if ("val" in spec[name] && spec[name].val !== undefined)
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "speechflow",
3
- "version": "0.9.4",
4
- "x-stdver": "0.9.4-EA",
3
+ "version": "0.9.5",
4
+ "x-stdver": "0.9.5-EA",
5
5
  "x-release": "2025-04-27",
6
6
  "homepage": "https://github.com/rse/speechflow",
7
7
  "description": "Speech Processing Flow Graph",
package/sample.yaml CHANGED
@@ -4,30 +4,36 @@
4
4
 
5
5
  # capture audio from microphone to file
6
6
  capture-microphone: |
7
- device(device: "wasapi:VoiceMeeter Output", mode: "r") |
8
- file(path: "capture.pcm", mode: "w", type: "audio")
7
+ device(device: "wasapi:VoiceMeeter Output", mode: "r") |
8
+ file(path: "capture.pcm", mode: "w", type: "audio")
9
9
 
10
10
  # generate audio file with narration of text file
11
11
  generate-narration: |
12
- file(path: argv.0, mode: "r", type: "audio") |
13
- deepgram(key: env.SPEECHFLOW_KEY_DEEPGRAM) |
14
- file(path: argv.1, mode: "w", type: "text")
12
+ file(path: argv.0, mode: "r", type: "audio") |
13
+ deepgram(key: env.SPEECHFLOW_KEY_DEEPGRAM) |
14
+ file(path: argv.1, mode: "w", type: "text")
15
15
 
16
16
  # pass-through audio from microphone to speaker and in parallel record it to file
17
17
  microphone-to-speaker: |
18
- device(device: "wasapi:VoiceMeeter Output", mode: "r") | {
19
- file(path: "capture.pcm", mode: "w", type: "audio"),
20
- device(device: "wasapi:VoiceMeeter VAIO3 Input", mode: "w")
21
- }
18
+ device(device: "wasapi:VoiceMeeter Output", mode: "r") | {
19
+ file(path: "capture.pcm", mode: "w", type: "audio"),
20
+ device(device: "wasapi:VoiceMeeter VAIO3 Input", mode: "w")
21
+ }
22
22
 
23
23
  # translate stdin to stdout
24
24
  translation: |
25
- file(path: "-", mode: "r", type: "text") |
26
- deepl(key: env.SPEECHFLOW_KEY_DEEPL, src: "de", dst: "en-US") |
27
- file(path: "-", mode: "w", type: "text")
25
+ file(path: "-", mode: "r", type: "text") |
26
+ deepl(key: env.SPEECHFLOW_KEY_DEEPL, src: "de", dst: "en-US") |
27
+ file(path: "-", mode: "w", type: "text")
28
28
 
29
29
  # sample for development
30
30
  sample: |
31
- device(device: "coreaudio:Elgato Wave:3", mode: "r") |
32
- file(path: "capture.pcm", mode: "w", type: "audio")
31
+ file(path: "sample.txt", mode: "r", type: "text") |
32
+ elevenlabs(voice: "Mark", speed: 1.05) |
33
+ ffmpeg(dst: "wav") |
34
+ file(path: "sample.wav", mode: "w", type: "audio")
35
+ sample2: |
36
+ device(device: "coreaudio:Elgato Wave:3", mode: "r") |
37
+ ffmpeg(dst: "wav") |
38
+ file(path: "sample.wav", mode: "w", type: "audio")
33
39
 
@@ -11,6 +11,7 @@ import { EventEmitter } from "node:events"
11
11
  /* external dependencies */
12
12
  import * as ElevenLabs from "elevenlabs"
13
13
  import { getStreamAsBuffer } from "get-stream"
14
+ import SpeexResampler from "speex-resampler"
14
15
 
15
16
  /* internal dependencies */
16
17
  import SpeechFlowNode from "./speechflow-node"
@@ -36,6 +37,7 @@ export default class SpeechFlowNodeElevenlabs extends SpeechFlowNode {
36
37
 
37
38
  /* internal state */
38
39
  private elevenlabs: ElevenLabs.ElevenLabsClient | null = null
40
+ private static speexInitialized = false
39
41
 
40
42
  /* construct node */
41
43
  constructor (id: string, opts: { [ id: string ]: any }, args: any[]) {
@@ -44,8 +46,10 @@ export default class SpeechFlowNodeElevenlabs extends SpeechFlowNode {
44
46
  /* declare node configuration parameters */
45
47
  this.configure({
46
48
  key: { type: "string", val: process.env.SPEECHFLOW_KEY_ELEVENLABS },
47
- voice: { type: "string", val: "Brian", pos: 0 },
48
- language: { type: "string", val: "de", pos: 1 }
49
+ voice: { type: "string", val: "Brian", pos: 0, match: /^(?:.+)$/ },
50
+ language: { type: "string", val: "en", pos: 1, match: /^(?:de|en)$/ },
51
+ speed: { type: "number", val: 1.05, pos: 2, match: (n: number) => n >= 0.7 && n <= 1.2 },
52
+ optimize: { type: "string", val: "latency", pos: 3, match: /^(?:latency|quality)$/ }
49
53
  })
50
54
 
51
55
  /* declare node input/output format */
@@ -55,39 +59,82 @@ export default class SpeechFlowNodeElevenlabs extends SpeechFlowNode {
55
59
 
56
60
  /* open node */
57
61
  async open () {
62
+ /* establish ElevenLabs API connection */
58
63
  this.elevenlabs = new ElevenLabs.ElevenLabsClient({
59
64
  apiKey: this.params.key
60
65
  })
66
+
67
+ /* determine maximum sample rate of ElevenLabs tier */
68
+ const maxSampleRates = {
69
+ "free": 16000,
70
+ "starter": 22050,
71
+ "creator": 24000,
72
+ "independent_publisher": 44100,
73
+ "growing_business": 44100,
74
+ "enterprise": 44100
75
+ }
76
+ const sub = await this.elevenlabs.user.getSubscription()
77
+ const tier = (sub.tier ?? "free") as keyof typeof maxSampleRates
78
+ this.log("info", `determined ElevenLabs tier: "${tier}"`)
79
+ let maxSampleRate = 16000
80
+ if (maxSampleRates[tier] !== undefined)
81
+ maxSampleRate = maxSampleRates[tier]
82
+ this.log("info", `determined maximum audio sample rate: ${maxSampleRate}`)
83
+
84
+ /* determine voice for text-to-speech operation
85
+ (for details see https://elevenlabs.io/text-to-speech) */
61
86
  const voices = await this.elevenlabs.voices.getAll()
62
- const voice = voices.voices.find((voice) => voice.name === this.params.voice)
63
- if (voice === undefined)
64
- throw new Error(`invalid ElevenLabs voice "${this.params.voice}"`)
87
+ let voice = voices.voices.find((voice) => voice.name === this.params.voice)
88
+ if (voice === undefined) {
89
+ voice = voices.voices.find((voice) => voice.name!.startsWith(this.params.voice))
90
+ if (voice === undefined)
91
+ throw new Error(`invalid ElevenLabs voice "${this.params.voice}"`)
92
+ }
93
+ const info = Object.keys(voice.labels ?? {}).length > 0 ?
94
+ (", " + Object.entries(voice.labels!)
95
+ .map(([ key, val ]) => `${key}: "${val}"`).join(", ")) : ""
96
+ this.log("info", `selected voice: name: "${voice.name}"${info}`)
97
+
98
+ /* perform text-to-speech operation with Elevenlabs API */
99
+ const model = this.params.optimize === "quality" ?
100
+ "eleven_multilingual_v2" :
101
+ "eleven_flash_v2_5"
65
102
  const speechStream = (text: string) => {
66
103
  return this.elevenlabs!.textToSpeech.convert(voice.voice_id, {
67
104
  text,
68
- optimize_streaming_latency: 2,
69
- output_format: "pcm_16000", // S16LE
70
- model_id: "eleven_flash_v2_5",
71
- /*
105
+ model_id: model,
106
+ language_code: this.params.language,
107
+ output_format: `pcm_${maxSampleRate}` as ElevenLabs.ElevenLabs.OutputFormat,
108
+ seed: 815, /* arbitrary, but fixated by us */
72
109
  voice_settings: {
73
- stability: 0,
74
- similarity_boost: 0
110
+ speed: this.params.speed
75
111
  }
76
- */
77
112
  }, {
78
113
  timeoutInSeconds: 30,
79
- maxRetries: 10
114
+ maxRetries: 10
80
115
  })
81
116
  }
117
+
118
+ /* internal queue of results */
82
119
  const queue = new EventEmitter()
120
+
121
+ /* establish resampler from the tier-dependent maximum ElevenLabs
122
+ output sample rate to our standard audio sample rate (48 kHz) */
123
+ if (!SpeechFlowNodeElevenlabs.speexInitialized) {
124
+ /* at least once initialize resampler */
125
+ await SpeexResampler.initPromise
126
+ SpeechFlowNodeElevenlabs.speexInitialized = true
127
+ }
128
+ const resampler = new SpeexResampler(1, maxSampleRate, this.config.audioSampleRate, 7)
129
+
130
+ /* create duplex stream and connect it to the ElevenLabs API */
83
131
  this.stream = new Stream.Duplex({
84
- write (chunk: Buffer, encoding: BufferEncoding, callback: (error?: Error | null | undefined) => void) {
85
- if (encoding !== "utf8" && encoding !== "utf-8")
86
- callback(new Error("only text input supported by Elevenlabs node"))
132
+ write (chunk: Buffer, encoding, callback) {
87
133
  const data = chunk.toString()
88
134
  speechStream(data).then((stream) => {
89
135
  getStreamAsBuffer(stream).then((buffer) => {
90
- queue.emit("audio", buffer)
136
+ const bufferResampled = resampler.processChunk(buffer)
137
+ queue.emit("audio", bufferResampled)
91
138
  callback()
92
139
  }).catch((error) => {
93
140
  callback(error)
@@ -96,7 +143,7 @@ export default class SpeechFlowNodeElevenlabs extends SpeechFlowNode {
96
143
  callback(error)
97
144
  })
98
145
  },
99
- read (size: number) {
146
+ read (size) {
100
147
  queue.once("audio", (buffer: Buffer) => {
101
148
  this.push(buffer, "binary")
102
149
  })
@@ -111,6 +158,10 @@ export default class SpeechFlowNodeElevenlabs extends SpeechFlowNode {
111
158
  this.stream.destroy()
112
159
  this.stream = null
113
160
  }
161
+
162
+ /* destroy ElevenLabs API */
163
+ if (this.elevenlabs !== null)
164
+ this.elevenlabs = null
114
165
  }
115
166
  }
116
167
 
@@ -37,7 +37,7 @@ export default class SpeechFlowNode extends Events.EventEmitter {
37
37
  }
38
38
 
39
39
  /* INTERNAL: utility function: create "params" attribute from constructor of sub-classes */
40
- configure (spec: { [ id: string ]: { type: string, pos?: number, val?: any, match?: RegExp } }) {
40
+ configure (spec: { [ id: string ]: { type: string, pos?: number, val?: any, match?: RegExp | ((x: any) => boolean) } }) {
41
41
  for (const name of Object.keys(spec)) {
42
42
  if (this.opts[name] !== undefined) {
43
43
  /* named parameter */
@@ -45,9 +45,11 @@ export default class SpeechFlowNode extends Events.EventEmitter {
45
45
  throw new Error(`invalid type of named parameter "${name}" ` +
46
46
  `(has to be ${spec[name].type})`)
47
47
  if ("match" in spec[name]
48
- && this.opts[name].match(spec[name].match) === null)
49
- throw new Error(`invalid value of named parameter "${name}" ` +
50
- `(has to match ${spec[name].match})`)
48
+ && ( ( spec[name].match instanceof RegExp
49
+ && this.opts[name].match(spec[name].match) === null)
50
+ || ( typeof spec[name].match === "function"
51
+ && !spec[name].match(this.opts[name]) ) ))
52
+ throw new Error(`invalid value "${this.opts[name]}" of named parameter "${name}"`)
51
53
  this.params[name] = this.opts[name]
52
54
  }
53
55
  else if (this.opts[name] === undefined
@@ -55,14 +57,20 @@ export default class SpeechFlowNode extends Events.EventEmitter {
55
57
  && typeof spec[name].pos === "number"
56
58
  && spec[name].pos < this.args.length) {
57
59
  /* positional argument */
58
- if (typeof this.args[spec[name].pos!] !== spec[name].type)
60
+ if (typeof this.args[spec[name].pos] !== spec[name].type)
59
61
  throw new Error(`invalid type of positional parameter "${name}" ` +
60
62
  `(has to be ${spec[name].type})`)
61
63
  if ("match" in spec[name]
62
- && this.args[spec[name].pos!].match(spec[name].match) === null)
64
+ && this.args[spec[name].pos].match(spec[name].match) === null)
63
65
  throw new Error(`invalid value of positional parameter "${name}" ` +
64
66
  `(has to match ${spec[name].match})`)
65
- this.params[name] = this.args[spec[name].pos!]
67
+ if ("match" in spec[name]
68
+ && ( ( spec[name].match instanceof RegExp
69
+ && this.args[spec[name].pos].match(spec[name].match) === null)
70
+ || ( typeof spec[name].match === "function"
71
+ && !spec[name].match(this.args[spec[name].pos]) ) ))
72
+ throw new Error(`invalid value "${this.opts[name]}" of positional parameter "${name}"`)
73
+ this.params[name] = this.args[spec[name].pos]
66
74
  }
67
75
  else if ("val" in spec[name] && spec[name].val !== undefined)
68
76
  /* default argument */