speechflow 0.9.4 → 0.9.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. package/CHANGELOG.md +19 -0
  2. package/README.md +227 -54
  3. package/dst/speechflow-node-a2a-ffmpeg.d.ts +13 -0
  4. package/dst/speechflow-node-a2a-ffmpeg.js +152 -0
  5. package/dst/speechflow-node-a2a-wav.d.ts +11 -0
  6. package/dst/speechflow-node-a2a-wav.js +170 -0
  7. package/dst/speechflow-node-a2t-deepgram.d.ts +12 -0
  8. package/dst/speechflow-node-a2t-deepgram.js +220 -0
  9. package/dst/speechflow-node-deepgram.d.ts +3 -1
  10. package/dst/speechflow-node-deepgram.js +86 -22
  11. package/dst/speechflow-node-deepl.d.ts +3 -1
  12. package/dst/speechflow-node-deepl.js +25 -20
  13. package/dst/speechflow-node-device.d.ts +3 -1
  14. package/dst/speechflow-node-device.js +53 -2
  15. package/dst/speechflow-node-elevenlabs.d.ts +4 -1
  16. package/dst/speechflow-node-elevenlabs.js +88 -49
  17. package/dst/speechflow-node-ffmpeg.d.ts +3 -1
  18. package/dst/speechflow-node-ffmpeg.js +42 -4
  19. package/dst/speechflow-node-file.d.ts +3 -1
  20. package/dst/speechflow-node-file.js +84 -13
  21. package/dst/speechflow-node-format.d.ts +11 -0
  22. package/dst/speechflow-node-format.js +80 -0
  23. package/dst/speechflow-node-gemma.d.ts +3 -1
  24. package/dst/speechflow-node-gemma.js +84 -23
  25. package/dst/speechflow-node-mqtt.d.ts +13 -0
  26. package/dst/speechflow-node-mqtt.js +181 -0
  27. package/dst/speechflow-node-opus.d.ts +12 -0
  28. package/dst/speechflow-node-opus.js +135 -0
  29. package/dst/speechflow-node-subtitle.d.ts +12 -0
  30. package/dst/speechflow-node-subtitle.js +96 -0
  31. package/dst/speechflow-node-t2a-elevenlabs.d.ts +13 -0
  32. package/dst/speechflow-node-t2a-elevenlabs.js +182 -0
  33. package/dst/speechflow-node-t2t-deepl.d.ts +12 -0
  34. package/dst/speechflow-node-t2t-deepl.js +133 -0
  35. package/dst/speechflow-node-t2t-format.d.ts +11 -0
  36. package/dst/speechflow-node-t2t-format.js +80 -0
  37. package/dst/speechflow-node-t2t-gemma.d.ts +13 -0
  38. package/dst/speechflow-node-t2t-gemma.js +213 -0
  39. package/dst/speechflow-node-t2t-opus.d.ts +12 -0
  40. package/dst/speechflow-node-t2t-opus.js +135 -0
  41. package/dst/speechflow-node-t2t-subtitle.d.ts +12 -0
  42. package/dst/speechflow-node-t2t-subtitle.js +96 -0
  43. package/dst/speechflow-node-trace.d.ts +11 -0
  44. package/dst/speechflow-node-trace.js +88 -0
  45. package/dst/speechflow-node-wav.d.ts +11 -0
  46. package/dst/speechflow-node-wav.js +170 -0
  47. package/dst/speechflow-node-websocket.d.ts +3 -1
  48. package/dst/speechflow-node-websocket.js +149 -49
  49. package/dst/speechflow-node-whisper-common.d.ts +34 -0
  50. package/dst/speechflow-node-whisper-common.js +7 -0
  51. package/dst/speechflow-node-whisper-ggml.d.ts +1 -0
  52. package/dst/speechflow-node-whisper-ggml.js +97 -0
  53. package/dst/speechflow-node-whisper-onnx.d.ts +1 -0
  54. package/dst/speechflow-node-whisper-onnx.js +131 -0
  55. package/dst/speechflow-node-whisper-worker-ggml.d.ts +1 -0
  56. package/dst/speechflow-node-whisper-worker-ggml.js +97 -0
  57. package/dst/speechflow-node-whisper-worker-onnx.d.ts +1 -0
  58. package/dst/speechflow-node-whisper-worker-onnx.js +131 -0
  59. package/dst/speechflow-node-whisper-worker.d.ts +1 -0
  60. package/dst/speechflow-node-whisper-worker.js +116 -0
  61. package/dst/speechflow-node-whisper-worker2.d.ts +1 -0
  62. package/dst/speechflow-node-whisper-worker2.js +82 -0
  63. package/dst/speechflow-node-whisper.d.ts +19 -0
  64. package/dst/speechflow-node-whisper.js +604 -0
  65. package/dst/speechflow-node-x2x-trace.d.ts +11 -0
  66. package/dst/speechflow-node-x2x-trace.js +88 -0
  67. package/dst/speechflow-node-xio-device.d.ts +13 -0
  68. package/dst/speechflow-node-xio-device.js +205 -0
  69. package/dst/speechflow-node-xio-file.d.ts +11 -0
  70. package/dst/speechflow-node-xio-file.js +176 -0
  71. package/dst/speechflow-node-xio-mqtt.d.ts +13 -0
  72. package/dst/speechflow-node-xio-mqtt.js +181 -0
  73. package/dst/speechflow-node-xio-websocket.d.ts +13 -0
  74. package/dst/speechflow-node-xio-websocket.js +275 -0
  75. package/dst/speechflow-node.d.ts +25 -7
  76. package/dst/speechflow-node.js +74 -9
  77. package/dst/speechflow-utils.d.ts +23 -0
  78. package/dst/speechflow-utils.js +194 -0
  79. package/dst/speechflow.js +146 -43
  80. package/etc/biome.jsonc +12 -4
  81. package/etc/stx.conf +65 -0
  82. package/package.d/@ericedouard+vad-node-realtime+0.2.0.patch +18 -0
  83. package/package.json +49 -31
  84. package/sample.yaml +61 -23
  85. package/src/lib.d.ts +6 -1
  86. package/src/{speechflow-node-ffmpeg.ts → speechflow-node-a2a-ffmpeg.ts} +10 -4
  87. package/src/speechflow-node-a2a-wav.ts +143 -0
  88. package/src/speechflow-node-a2t-deepgram.ts +199 -0
  89. package/src/speechflow-node-t2a-elevenlabs.ts +160 -0
  90. package/src/{speechflow-node-deepl.ts → speechflow-node-t2t-deepl.ts} +36 -25
  91. package/src/speechflow-node-t2t-format.ts +85 -0
  92. package/src/{speechflow-node-gemma.ts → speechflow-node-t2t-gemma.ts} +89 -25
  93. package/src/speechflow-node-t2t-opus.ts +111 -0
  94. package/src/speechflow-node-t2t-subtitle.ts +101 -0
  95. package/src/speechflow-node-x2x-trace.ts +92 -0
  96. package/src/{speechflow-node-device.ts → speechflow-node-xio-device.ts} +25 -3
  97. package/src/speechflow-node-xio-file.ts +153 -0
  98. package/src/speechflow-node-xio-mqtt.ts +154 -0
  99. package/src/speechflow-node-xio-websocket.ts +248 -0
  100. package/src/speechflow-node.ts +78 -13
  101. package/src/speechflow-utils.ts +212 -0
  102. package/src/speechflow.ts +150 -43
  103. package/etc/nps.yaml +0 -40
  104. package/src/speechflow-node-deepgram.ts +0 -133
  105. package/src/speechflow-node-elevenlabs.ts +0 -116
  106. package/src/speechflow-node-file.ts +0 -108
  107. package/src/speechflow-node-websocket.ts +0 -179
package/package.json CHANGED
@@ -1,8 +1,8 @@
 {
   "name": "speechflow",
-  "version": "0.9.4",
-  "x-stdver": "0.9.4-EA",
-  "x-release": "2025-04-27",
+  "version": "0.9.7",
+  "x-stdver": "0.9.7-EA",
+  "x-release": "2025-07-12",
   "homepage": "https://github.com/rse/speechflow",
   "description": "Speech Processing Flow Graph",
   "license": "GPL-3.0-only",
@@ -17,58 +17,76 @@
   },
   "dependencies": {
     "cli-io": "0.9.13",
-    "yargs": "17.7.2",
-    "flowlink": "0.9.9",
+    "yargs": "18.0.0",
+    "flowlink": "0.9.11",
     "js-yaml": "4.1.0",
-    "@gpeng/naudiodon": "2.4.0",
-    "@deepgram/sdk": "3.12.1",
-    "deepl-node": "1.17.3",
-    "elevenlabs": "1.57.0",
-    "stream-transform": "3.3.3",
+    "@gpeng/naudiodon": "2.4.1",
+    "@deepgram/sdk": "4.9.1",
+    "deepl-node": "1.19.0",
+    "@elevenlabs/elevenlabs-js": "2.6.0",
+    "stream-transform": "3.4.0",
     "get-stream": "9.0.1",
-    "@dotenvx/dotenvx": "1.41.0",
+    "@dotenvx/dotenvx": "1.47.5",
     "speex-resampler": "3.0.1",
     "pcm-convert": "1.6.5",
     "object-path": "0.11.8",
-    "ws": "8.18.1",
+    "ws": "8.18.3",
     "bufferutil": "4.0.9",
     "utf-8-validate": "6.0.5",
     "@opensumi/reconnecting-websocket": "4.4.0",
-    "ollama": "0.5.15",
+    "ollama": "0.5.16",
     "@rse/ffmpeg": "1.4.2",
-    "ffmpeg-stream": "1.0.0",
-    "installed-packages": "1.0.13"
+    "ffmpeg-stream": "1.0.1",
+    "installed-packages": "1.0.13",
+    "syspath": "1.0.8",
+    "wav": "1.0.2",
+    "mqtt": "5.13.2",
+    "cbor2": "2.0.1",
+    "pure-uuid": "1.8.1",
+    "wavefile": "11.0.0",
+    "@huggingface/transformers": "3.6.3",
+    "@ericedouard/vad-node-realtime": "0.2.0",
+    "luxon": "3.7.1",
+    "wrap-text": "1.0.10",
+    "smart-whisper": "0.8.1"
   },
   "devDependencies": {
-    "eslint": "9.25.1",
-    "@eslint/js": "9.25.1",
-    "neostandard": "0.12.1",
+    "eslint": "9.31.0",
+    "@eslint/js": "9.31.0",
+    "neostandard": "0.12.2",
     "eslint-plugin-promise": "7.2.1",
-    "eslint-plugin-import": "2.31.0",
+    "eslint-plugin-import": "2.32.0",
     "eslint-plugin-node": "11.1.0",
-    "@typescript-eslint/eslint-plugin": "8.31.0",
-    "@typescript-eslint/parser": "8.31.0",
-    "oxlint": "0.16.8",
-    "eslint-plugin-oxlint": "0.16.8",
-    "@biomejs/biome": "1.9.4",
+    "@typescript-eslint/eslint-plugin": "8.36.0",
+    "@typescript-eslint/parser": "8.36.0",
+    "oxlint": "1.6.0",
+    "eslint-plugin-oxlint": "1.6.0",
+    "@biomejs/biome": "2.0.6",
     "eslint-config-biome": "1.9.4",
 
-    "@types/node": "22.15.2",
+    "@types/node": "24.0.13",
     "@types/yargs": "17.0.33",
     "@types/js-yaml": "4.0.9",
     "@types/object-path": "0.11.4",
     "@types/ws": "8.18.1",
     "@types/resolve": "1.20.6",
+    "@types/wav": "1.0.4",
+    "@types/luxon": "3.6.2",
+    "@types/wrap-text": "1.0.2",
 
-    "stmux": "1.8.10",
+    "patch-package": "8.0.0",
+    "stmux": "1.8.11",
     "nodemon": "3.1.10",
-    "rimraf": "6.0.1",
+    "shx": "0.4.0",
     "typescript": "5.8.3",
     "delay-cli": "2.0.0",
-    "nps": "5.10.0",
+    "@rse/stx": "1.0.2",
     "cross-env": "7.0.3"
   },
-  "upd": [],
+  "overrides": {
+    "onnxruntime-node": "1.22.0-dev.20250418-c19a49615b"
+  },
+  "upd": [ "!@biomejs/biome" ],
   "engines": {
     "node": ">=22.0.0"
   },
@@ -83,7 +101,7 @@
     }
   },
   "scripts": {
-    "start": "nps -c etc/nps.yaml",
-    "speechflow": "node dst/speechflow.js"
+    "postinstall": "npm start patch-apply",
+    "start": "stx -v4 -c etc/stx.conf"
   }
 }
package/sample.yaml CHANGED
@@ -2,32 +2,70 @@
 ## sample.yaml -- Speechflow Sample Audio Processing Graphs
 ##
 
-# capture audio from microphone to file
-capture-microphone: |
-    device(device: "wasapi:VoiceMeeter Output", mode: "r") |
-    file(path: "capture.pcm", mode: "w", type: "audio")
+# Capture audio from microphone device into WAV audio file
+capturing: |
+    device(device: "wasapi:VoiceMeeter Out B1", mode: "r") |
+    wav(mode: "encode") |
+    file(path: "capture.wav", mode: "w", type: "audio")
 
-# generate audio file with narration of text file
-generate-narration: |
-    file(path: argv.0, mode: "r", type: "audio") |
-    deepgram(key: env.SPEECHFLOW_KEY_DEEPGRAM) |
-    file(path: argv.1, mode: "w", type: "text")
+# Pass-through audio from microphone device to speaker
+# device and in parallel record it to WAV audio file
+pass-through: |
+    device(device: "wasapi:VoiceMeeter Out B1", mode: "r") | {
+        wav(mode: "encode") |
+        file(path: "capture.wav", mode: "w", type: "audio"),
+        device(device: "wasapi:VoiceMeeter VAIO3 Input", mode: "w")
+    }
 
-# pass-through audio from microphone to speaker and in parallel record it to file
-microphone-to-speaker: |
-    device(device: "wasapi:VoiceMeeter Output", mode: "r") | {
-        file(path: "capture.pcm", mode: "w", type: "audio"),
-        device(device: "wasapi:VoiceMeeter VAIO3 Input", mode: "w")
-    }
+# Generate text file with German narration of MP3 audio file
+narration: |
+    file(path: argv.0, mode: "r", type: "audio") |
+    ffmpeg(src: "mp3", dst: "pcm") |
+    deepgram(language: "de", key: env.SPEECHFLOW_KEY_DEEPGRAM) |
+    format(width: 80) |
+    file(path: argv.1, mode: "w", type: "text")
 
-# translate stdin to stdout
+# Generate text file with German subtitles of MP3 audio file
+subtitling: |
+    file(path: argv.0, mode: "r", type: "audio") |
+    ffmpeg(src: "mp3", dst: "pcm") |
+    deepgram(language: "de", key: env.SPEECHFLOW_KEY_DEEPGRAM) |
+    subtitle(format: "vtt") |
+    file(path: argv.1, mode: "w", type: "text")
+
+# Ad-hoc text translation from German to English
 translation: |
-    file(path: "-", mode: "r", type: "text") |
-    deepl(key: env.SPEECHFLOW_KEY_DEEPL, src: "de", dst: "en-US") |
-    file(path: "-", mode: "w", type: "text")
+    file(path: "-", mode: "r", type: "text") |
+    deepl(src: "de", dst: "en") |
+    file(path: "-", mode: "w", type: "text")
 
-# sample for development
-sample: |
-    device(device: "coreaudio:Elgato Wave:3", mode: "r") |
-    file(path: "capture.pcm", mode: "w", type: "audio")
+# Real-time studio translation from German to English,
+# including the capturing of all involved inputs and outputs:
+studio: |
+    device(device: "coreaudio:Elgato Wave:3", mode: "r") | {
+        wav(mode: "encode") |
+        file(path: "program-de.wav", mode: "w", type: "audio"),
+        deepgram(key: env.SPEECHFLOW_KEY_DEEPGRAM, language: "de") | {
+            format(width: 80) |
+            file(path: "program-de.txt", mode: "w", type: "text"),
+            deepl(key: env.SPEECHFLOW_KEY_DEEPL, src: "de", dst: "en") | {
+                format(width: 80) |
+                file(path: "program-en.txt", mode: "w", type: "text"),
+                subtitle(format: "vtt") | {
+                    file(path: "program-en.vtt", mode: "w", type: "text"),
+                    mqtt(url: "mqtt://10.1.0.10:1883",
+                        username: env.SPEECHFLOW_MQTT_USER,
+                        password: env.SPEECHFLOW_MQTT_PASS,
+                        topicWrite: "stream/studio/sender")
+                },
+                subtitle(format: "srt") |
+                file(path: "program-en.srt", mode: "w", type: "text"),
+                elevenlabs(voice: "Mark", speed: 1.05, language: "en") | {
+                    wav(mode: "encode") |
+                    file(path: "program-en.wav", mode: "w", type: "audio"),
+                    device(device: "coreaudio:USBAudio2.0", mode: "w")
+                }
+            }
+        }
+    }
 
package/src/lib.d.ts CHANGED
@@ -15,6 +15,11 @@ declare module "pcm-convert" {
         data: Buffer,
         srcFormat: Format,
         dstFormat: Format
-    ): Buffer
+    ): any
+}
+
+declare module "node:stream" {
+    import { Stream, Duplex } from "node:stream"
+    export function compose (...streams: Stream[]): Duplex
 }
 
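The ambient `node:stream` declaration added above exists because `stream.compose()` is still flagged experimental in Node.js and is missing from some `@types/node` releases. For illustration, a minimal self-contained sketch of the "sandwich" pattern the nodes below build with it (the stream names here are made up for the example):

```ts
// stream.compose() glues Transforms/Duplexes into one Duplex: writes enter
// the first stream, reads leave the last one (available since Node.js 16.9).
import Stream from "node:stream"

const upper = new Stream.Transform({
    transform (chunk, _enc, cb) { cb(null, chunk.toString().toUpperCase()) }
})
const inner = new Stream.PassThrough()   /* stands in for an FFmpeg/WAV duplex */
const exclaim = new Stream.Transform({
    transform (chunk, _enc, cb) { cb(null, chunk.toString() + "!") }
})

const duplex = Stream.compose(upper, inner, exclaim)
duplex.on("data", (chunk) => console.log(chunk.toString()))   /* prints "HELLO!" */
duplex.end("hello")
```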
package/src/{speechflow-node-ffmpeg.ts → speechflow-node-a2a-ffmpeg.ts} RENAMED
@@ -13,6 +13,7 @@ import { Converter as FFmpegStream } from "ffmpeg-stream"
 
 /* internal dependencies */
 import SpeechFlowNode from "./speechflow-node"
+import * as utils from "./speechflow-utils"
 
 /* SpeechFlow node for FFmpeg */
 export default class SpeechFlowNodeFFmpeg extends SpeechFlowNode {
@@ -24,8 +25,8 @@ export default class SpeechFlowNodeFFmpeg extends SpeechFlowNode {
     private ffmpeg: FFmpegStream | null = null
 
     /* construct node */
-    constructor (id: string, opts: { [ id: string ]: any }, args: any[]) {
-        super(id, opts, args)
+    constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
+        super(id, cfg, opts, args)
 
         /* declare node configuration parameters */
         this.configure({
@@ -93,9 +94,14 @@ export default class SpeechFlowNodeFFmpeg extends SpeechFlowNode {
 
         /* establish a duplex stream and connect it to FFmpeg */
         this.stream = Stream.Duplex.from({
-            readable: streamOutput,
-            writable: streamInput
+            writable: streamInput,
+            readable: streamOutput
         })
+
+        /* wrap streams with conversions for chunk vs plain audio */
+        const wrapper1 = utils.createTransformStreamForWritableSide()
+        const wrapper2 = utils.createTransformStreamForReadableSide("audio", () => this.timeZero)
+        this.stream = Stream.compose(wrapper1, this.stream, wrapper2)
     }
 
     /* close node */
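The two wrapper factories come from the new speechflow-utils.ts, whose body is not part of this excerpt. The following is only a guess at their shape, inferred from how they are called: the writable-side wrapper unwraps object-mode chunks into plain Buffers for the inner byte stream, and the readable-side wrapper re-wraps emitted Buffers into timestamped chunk objects.

```ts
// Hypothetical sketch (NOT the actual speechflow-utils.ts implementation):
// bridge between object-mode SpeechFlowChunk streams and plain byte streams.
import Stream from "node:stream"

const createTransformStreamForWritableSide = () =>
    new Stream.Transform({
        writableObjectMode: true,
        transform (chunk: any, _enc, cb) {
            cb(null, chunk.payload)   /* chunk object -> raw Buffer */
        }
    })

const createTransformStreamForReadableSide = (type: string, timeZero: () => unknown) =>
    new Stream.Transform({
        readableObjectMode: true,
        transform (payload: Buffer, _enc, cb) {
            /* raw Buffer -> chunk-like object, stamped relative to timeZero() */
            cb(null, { type, payload, timeZero: timeZero() })
        }
    })
```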
package/src/speechflow-node-a2a-wav.ts ADDED
@@ -0,0 +1,143 @@
+/*
+**  SpeechFlow - Speech Processing Flow Graph
+**  Copyright (c) 2024-2025 Dr. Ralf S. Engelschall <rse@engelschall.com>
+**  Licensed under GPL 3.0 <https://spdx.org/licenses/GPL-3.0-only>
+*/
+
+/* standard dependencies */
+import Stream from "node:stream"
+
+/* external dependencies */
+import wav from "wav"
+
+/* internal dependencies */
+import SpeechFlowNode from "./speechflow-node"
+import * as utils from "./speechflow-utils"
+
+/* utility class for wrapping a custom stream into a regular Transform stream */
+class StreamWrapper extends Stream.Transform {
+    private foreignStream: any
+    constructor (foreignStream: any, options: Stream.TransformOptions = {}) {
+        options.readableObjectMode = true
+        options.writableObjectMode = true
+        super(options)
+        this.foreignStream = foreignStream
+        this.foreignStream.on("data", (chunk: any) => {
+            this.push(chunk)
+        })
+        this.foreignStream.on("error", (err: Error) => {
+            this.emit("error", err)
+        })
+        this.foreignStream.on("end", () => {
+            this.push(null)
+        })
+    }
+    _transform (chunk: any, encoding: BufferEncoding, callback: Stream.TransformCallback): void {
+        try {
+            const canContinue = this.foreignStream.write(chunk)
+            if (canContinue)
+                callback()
+            else
+                this.foreignStream.once("drain", callback)
+        }
+        catch (err) {
+            callback(err as Error)
+        }
+    }
+    _flush (callback: Stream.TransformCallback): void {
+        try {
+            if (typeof this.foreignStream.end === "function")
+                this.foreignStream.end()
+            callback()
+        }
+        catch (err) {
+            callback(err as Error)
+        }
+    }
+}
+
+/* SpeechFlow node for WAV format conversion */
+export default class SpeechFlowNodeWAV extends SpeechFlowNode {
+    /* declare official node name */
+    public static name = "wav"
+
+    /* construct node */
+    constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
+        super(id, cfg, opts, args)
+
+        /* declare node configuration parameters */
+        this.configure({
+            mode: { type: "string", pos: 1, val: "encode", match: /^(?:encode|decode)$/ }
+        })
+
+        /* declare node input/output format */
+        this.input  = "audio"
+        this.output = "audio"
+    }
+
+    /* open node */
+    async open () {
+        if (this.params.mode === "encode") {
+            /* convert raw/PCM to WAV/PCM */
+            /* NOTICE: as this is a continuous stream, the resulting WAV header is not 100%
+               conforming to the WAV standard, as it has to use a zero duration information.
+               This cannot be changed in a stream-based processing. */
+            const writer = new wav.Writer({
+                format:     0x0001 /* PCM */,
+                channels:   this.config.audioChannels,
+                sampleRate: this.config.audioSampleRate,
+                bitDepth:   this.config.audioBitDepth
+            })
+            this.stream = new StreamWrapper(writer)
+        }
+        else if (this.params.mode === "decode") {
+            /* convert WAV/PCM to raw/PCM */
+            const reader = new wav.Reader()
+            reader.on("format", (format: any) => {
+                this.log("info", `WAV audio stream: format=${format.audioFormat === 0x0001 ? "PCM" :
+                    "0x" + (format.audioFormat as number).toString(16).padStart(4, "0")} ` +
+                    `bitDepth=${format.bitDepth} ` +
+                    `signed=${format.signed ? "yes" : "no"} ` +
+                    `endian=${format.endianness} ` +
+                    `sampleRate=${format.sampleRate} ` +
+                    `channels=${format.channels}`)
+                if (format.audioFormat !== 0x0001 /* PCM */)
+                    throw new Error("WAV not based on PCM format")
+                if (format.bitDepth !== 16)
+                    throw new Error("WAV not based on 16 bit samples")
+                if (!format.signed)
+                    throw new Error("WAV not based on signed integers")
+                if (format.endianness !== "LE")
+                    throw new Error("WAV not based on little endianness")
+                if (format.sampleRate !== 48000)
+                    throw new Error("WAV not based on 48 kHz sample rate")
+                if (format.channels !== 1)
+                    throw new Error("WAV not based on mono channel")
+            })
+            this.stream = new StreamWrapper(reader)
+        }
+        else
+            throw new Error(`invalid operation mode "${this.params.mode}"`)
+
+        /* convert regular stream into object-mode stream */
+        const wrapper1 = utils.createTransformStreamForWritableSide()
+        const wrapper2 = utils.createTransformStreamForReadableSide("audio", () => this.timeZero)
+        this.stream = Stream.compose(wrapper1, this.stream, wrapper2)
+    }
+
+    /* close node */
+    async close () {
+        /* shutdown stream */
+        if (this.stream !== null) {
+            await new Promise<void>((resolve) => {
+                if (this.stream instanceof Stream.Duplex)
+                    this.stream.end(() => { resolve() })
+                else
+                    resolve()
+            })
+            this.stream.destroy()
+            this.stream = null
+        }
+    }
+}
+
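For orientation, the `wav` package the node wraps is itself stream-based. A standalone decode sketch using only its public API (the file names are placeholders for the example):

```ts
// wav.Reader parses the RIFF/WAV header, emits a "format" event, and then
// streams the remaining raw PCM samples like any other readable stream.
import fs from "node:fs"
import wav from "wav"

const reader = new wav.Reader()
reader.on("format", (format: any) => {
    console.log(`decoding: ${format.sampleRate} Hz, ` +
        `${format.channels} channel(s), ${format.bitDepth} bit`)
})
fs.createReadStream("capture.wav")               /* e.g. result of the "capturing" graph */
    .pipe(reader)                                /* strips the WAV header */
    .pipe(fs.createWriteStream("capture.pcm"))   /* leaves raw PCM samples */
```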
package/src/speechflow-node-a2t-deepgram.ts ADDED
@@ -0,0 +1,199 @@
+/*
+**  SpeechFlow - Speech Processing Flow Graph
+**  Copyright (c) 2024-2025 Dr. Ralf S. Engelschall <rse@engelschall.com>
+**  Licensed under GPL 3.0 <https://spdx.org/licenses/GPL-3.0-only>
+*/
+
+/* standard dependencies */
+import Stream from "node:stream"
+
+/* external dependencies */
+import * as Deepgram from "@deepgram/sdk"
+import { DateTime, Duration } from "luxon"
+
+/* internal dependencies */
+import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
+import * as utils from "./speechflow-utils"
+
+/* SpeechFlow node for Deepgram speech-to-text conversion */
+export default class SpeechFlowNodeDeepgram extends SpeechFlowNode {
+    /* declare official node name */
+    public static name = "deepgram"
+
+    /* internal state */
+    private dg: Deepgram.LiveClient | null = null
+
+    /* construct node */
+    constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
+        super(id, cfg, opts, args)
+
+        /* declare node configuration parameters */
+        this.configure({
+            key:      { type: "string", val: process.env.SPEECHFLOW_KEY_DEEPGRAM },
+            model:    { type: "string", val: "nova-3", pos: 0 },
+            version:  { type: "string", val: "latest", pos: 1 },
+            language: { type: "string", val: "multi",  pos: 2 }
+        })
+
+        /* declare node input/output format */
+        this.input  = "audio"
+        this.output = "text"
+    }
+
+    /* open node */
+    async open () {
+        /* sanity check situation */
+        if (this.config.audioBitDepth !== 16 || !this.config.audioLittleEndian)
+            throw new Error("Deepgram node currently supports PCM-S16LE audio only")
+
+        /* create queue for results */
+        const queue = new utils.SingleQueue<SpeechFlowChunk>()
+
+        /* connect to Deepgram API */
+        const deepgram = Deepgram.createClient(this.params.key)
+        let language = "en"
+        if (this.params.model.match(/^nova-2/) && this.params.language !== "en")
+            language = this.params.language
+        else if (this.params.model.match(/^nova-3/) && this.params.language !== "en")
+            language = "multi"
+        this.dg = deepgram.listen.live({
+            mip_opt_out:      true,
+            model:            this.params.model,
+            version:          this.params.version,
+            language,
+            channels:         this.config.audioChannels,
+            sample_rate:      this.config.audioSampleRate,
+            encoding:         "linear16",
+            multichannel:     false,
+            endpointing:      10,
+            interim_results:  false,
+            smart_format:     true,
+            punctuate:        true,
+            filler_words:     true,
+            diarize:          true, /* still not used by us */
+            numerals:         true,
+            profanity_filter: false
+        })
+
+        /* hook onto Deepgram API events */
+        this.dg.on(Deepgram.LiveTranscriptionEvents.Transcript, async (data) => {
+            const text = (data.channel?.alternatives[0].transcript as string) ?? ""
+            if (text === "")
+                this.log("info", `Deepgram: empty/dummy text received (start: ${data.start}s, duration: ${data.duration}s)`)
+            else {
+                this.log("info", `Deepgram: text received (start: ${data.start}s, duration: ${data.duration}s): "${text}"`)
+                const start = Duration.fromMillis(data.start * 1000).plus(this.timeZeroOffset)
+                const end   = start.plus({ seconds: data.duration })
+                const chunk = new SpeechFlowChunk(start, end, "final", "text", text)
+                queue.write(chunk)
+            }
+        })
+        this.dg.on(Deepgram.LiveTranscriptionEvents.Metadata, (data) => {
+            this.log("info", "Deepgram: metadata received")
+        })
+        this.dg.on(Deepgram.LiveTranscriptionEvents.Close, () => {
+            this.log("info", "Deepgram: connection close")
+        })
+        this.dg.on(Deepgram.LiveTranscriptionEvents.Error, (error: Error) => {
+            this.log("error", `Deepgram: ${error.message}`)
+            this.emit("error")
+        })
+
+        /* wait for Deepgram API to be available */
+        await new Promise((resolve, reject) => {
+            let timer: ReturnType<typeof setTimeout> | null = setTimeout(() => {
+                if (timer !== null) {
+                    timer = null
+                    reject(new Error("Deepgram: timeout waiting for connection open"))
+                }
+            }, 3000)
+            this.dg!.once(Deepgram.LiveTranscriptionEvents.Open, () => {
+                this.log("info", "Deepgram: connection open")
+                if (timer !== null) {
+                    clearTimeout(timer)
+                    timer = null
+                }
+                resolve(true)
+            })
+        })
+
+        /* remember opening time to receive time zero offset */
+        this.timeOpen = DateTime.now()
+
+        /* workaround Deepgram initialization problems */
+        let initDone = false
+        let initTimeout: ReturnType<typeof setTimeout> | null = null
+        const initTimeoutStart = () => {
+            if (initDone || initTimeout !== null)
+                return
+            initTimeout = setTimeout(async () => {
+                if (initTimeout === null)
+                    return
+                initTimeout = null
+                this.log("warning", "Deepgram: initialization timeout -- restarting service usage")
+                await this.close()
+                this.open()
+            }, 3000)
+        }
+        const initTimeoutStop = () => {
+            if (initDone)
+                return
+            initDone = true
+            if (initTimeout !== null) {
+                clearTimeout(initTimeout)
+                initTimeout = null
+            }
+        }
+
+        /* provide Duplex stream and internally attach to Deepgram API */
+        const dg = this.dg
+        const log = (level: string, msg: string) => {
+            this.log(level, msg)
+        }
+        const encoding = this.config.textEncoding
+        this.stream = new Stream.Duplex({
+            writableObjectMode: true,
+            readableObjectMode: true,
+            decodeStrings: false,
+            write (chunk: SpeechFlowChunk, encoding, callback) {
+                if (chunk.type !== "audio")
+                    callback(new Error("expected audio input chunk"))
+                else if (!Buffer.isBuffer(chunk.payload))
+                    callback(new Error("expected Buffer input chunk"))
+                else {
+                    if (chunk.payload.byteLength > 0) {
+                        log("info", `Deepgram: send data (${chunk.payload.byteLength} bytes)`)
+                        initTimeoutStart()
+                        dg.send(chunk.payload) /* intentionally discard all time information */
+                    }
+                    callback()
+                }
+            },
+            read (size) {
+                queue.read().then((chunk) => {
+                    log("info", `Deepgram: receive data (${chunk.payload.length} bytes)`)
+                    initTimeoutStop()
+                    this.push(chunk, encoding)
+                })
+            },
+            final (callback) {
+                dg.requestClose()
+                this.push(null)
+                callback()
+            }
+        })
+    }
+
+    /* close node */
+    async close () {
+        /* close stream */
+        if (this.stream !== null) {
+            this.stream.destroy()
+            this.stream = null
+        }
+
+        /* shutdown Deepgram API */
+        if (this.dg !== null)
+            this.dg.requestClose()
+    }
+}
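The node bridges Deepgram's push-style transcript events into the pull-style `read()` of the Duplex through `utils.SingleQueue`. Its real implementation lives in the new speechflow-utils.ts, which is not shown in this excerpt; a minimal sketch of the contract the node relies on:

```ts
// Minimal sketch of the assumed SingleQueue contract (NOT the actual
// speechflow-utils.ts code): a FIFO whose read() returns a Promise that
// resolves once an item is available, so event callbacks can feed a Duplex.
class SingleQueue<T> {
    private items:   T[] = []
    private waiters: Array<(item: T) => void> = []
    write (item: T): void {
        const waiter = this.waiters.shift()
        if (waiter !== undefined)
            waiter(item)        /* hand item directly to a pending read() */
        else
            this.items.push(item)
    }
    read (): Promise<T> {
        if (this.items.length > 0)
            return Promise.resolve(this.items.shift()!)
        return new Promise<T>((resolve) => { this.waiters.push(resolve) })
    }
}
```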