npm - speechflow - Versions diffs - 1.5.1 → 1.6.1 - Mend

speechflow 1.5.1 → 1.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (232)

package/etc/claude.md CHANGED Viewed

@@ -1,70 +1,107 @@
 # CLAUDE.md
-This file provides guidance to Claude Code (claude.ai/code) when working
-with code in this repository.
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
 ## Project Overview
-SpeechFlow is a command-line interface tool for establishing directed
-data flow graphs of audio and text processing nodes. It enables flexible
-speech processing tasks including capturing audio, text-to-speech,
-speech-to-text, and speech-to-speech translation.
+SpeechFlow is a command-line interface tool for establishing directed data flow graphs of audio and text processing nodes. It enables flexible speech processing tasks including capturing audio, text-to-speech, speech-to-text, and speech-to-speech translation.
 ## Architecture
-SpeechFlow uses a modular node-based architecture:
+SpeechFlow uses a modular node-based architecture with three main components:
-- **Core Engine**: TypeScript-based CLI tool that orchestrates processing flows
-- **Processing Nodes**: Modular components for different speech processing tasks (see `src/speechflow-node-*.ts`)
-- **Flow Expression Language**: Based on FlowLink for defining processing graphs
-- **Web Interfaces**: Two Vue.js applications for dashboard and subtitle display
-- **REST/WebSocket API**: External control interface for nodes
+- **speechflow-cli**: Core TypeScript-based CLI engine that orchestrates processing flows
+- **speechflow-ui-db**: Dashboard UI component for real-time visualization
+- **speechflow-ui-st**: Subtitle UI component for displaying live subtitles
-### Key Components
+### Processing Node Categories
-- **Main CLI**:
-  `src/speechflow.ts` - Entry point and CLI parsing
-- **Nodes**:
-  - Input/Output: `file`, `device`, `websocket`, `mqtt`
-  - Audio-to-Audio: `ffmpeg`, `wav`, `mute`, `meter`, `vad`, `gender`
-  - Audio-to-Text: `deepgram`
-  - Text-to-Text: `deepl`, `openai`, `ollama`, `transformers`, `subtitle`, `format`, `sentence`
-  - Text-to-Audio: `elevenlabs`, `kokoro`
-  - Any-to-Any: `filter`, `trace`
+- **Input/Output (xio)**: file, device, websocket, mqtt
+- **Audio-to-Audio (a2a)**: ffmpeg, wav, mute, meter, vad, gender, gain, filler, compressor, expander, rnnoise, speex
+- **Audio-to-Text (a2t)**: deepgram, amazon, openai
+- **Text-to-Text (t2t)**: deepl, google, amazon, openai, ollama, transformers, subtitle, format, sentence, modify
+- **Text-to-Audio (t2a)**: elevenlabs, kokoro, amazon
+- **Any-to-Any (x2x)**: filter, trace
 ## Development Commands
-The project uses STX (Simple Task eXecutor) for build automation. Main commands:
-### Core Project
 ```bash
-npm start lint          # Static code analysis (TypeScript, ESLint, Biome, Oxlint)
-npm start build         # Compile TypeScript to JavaScript in dst/
-npm start dev           # Multi-pane development dashboard with linting, building, and server
-npm start server        # Run the main speechflow program
+# Top-level commands (from root directory)
+npm start lint          # Lint all components (TypeScript, ESLint, Biome, Oxlint)
+npm start build         # Build all components (full production build)
 npm start clean         # Remove generated files
+npm start upd           # Update all NPM dependencies
+# Component-specific development (from speechflow-cli/)
+npm start dev           # Multi-pane dashboard with linting, building, and server
+npm start lint          # Static code analysis
+npm start build         # Compile TypeScript to JavaScript
+npm start server        # Run the main speechflow program
+npm start clean         # Clean generated files
+# Testing
+npm start test          # Run test configuration with sample pipeline
 ```
-## Project Structure
+## Key Implementation Files
+### Core Engine
+- `speechflow-cli/src/speechflow.ts` - Main CLI entry point and orchestration
+- `speechflow-cli/src/speechflow-node.ts` - Base node class with stream processing
+- `speechflow-cli/src/speechflow-utils.ts` - Utility functions and helpers
+### Node Implementations
+All node implementations follow the pattern `speechflow-node-{category}-{name}.ts` in `speechflow-cli/src/`.
-- `src/` - Main TypeScript source files
-- `dst/` - Compiled JavaScript output
-- `etc/` - Configuration files (TypeScript, ESLint, Biome, etc.)
-- `package.d/` - NPM package patches
+### Stream Processing Architecture
+- Uses Node.js object-mode streams with timestamp metadata
+- Audio chunks: PCM format, 16-bit, 16kHz, mono
+- Text chunks: Include timing information and metadata (gender, final/interim)
+- All streams maintain chronological timestamps for synchronization
-## Development Notes
+## API Integration
-- Node.js 22+ required
-- Uses object-mode streaming with timestamps for audio/text processing
-- External services integration: Deepgram, ElevenLabs, DeepL, OpenAI, Ollama
-- Supports local processing: FFmpeg, WAV, Voice Activity Detection, Gender Detection
-- REST/WebSocket API on port 8484 for external control
+REST/WebSocket API available on port 8484 (configurable) for:
+- External node control (muting, configuration)
+- Real-time metrics (audio levels, text flow)
+- Dashboard and UI connectivity
+## Environment Configuration
+Key environment variables for service integrations:
+- `SPEECHFLOW_DEEPGRAM_KEY` - Deepgram API key
+- `SPEECHFLOW_ELEVENLABS_KEY` - ElevenLabs API key
+- `SPEECHFLOW_DEEPL_KEY` - DeepL API key
+- `SPEECHFLOW_OPENAI_KEY` - OpenAI API key
+- `SPEECHFLOW_GOOGLE_KEY` - Google Cloud API key
+- `SPEECHFLOW_AWS_ACCESS_KEY_ID` - AWS access key
+- `SPEECHFLOW_AWS_SECRET_ACCESS_KEY` - AWS secret key
+- `SPEECHFLOW_AWS_REGION` - AWS region
+- `SPEECHFLOW_DEVICE_MIC` - Microphone device identifier
+- `SPEECHFLOW_DEVICE_SPK` - Speaker device identifier
+## Flow Expression Language
+Based on FlowLink with support for:
+- Sequential pipelines: `node1 | node2 | node3`
+- Parallel branches: `node1, node2, node3`
+- Grouping: `{ node1 | node2 }`
+- Parameters: `node(param1: value, param2: "string")`
+- Environment variables: `env.VARIABLE_NAME`
+- Command arguments: `argv.0`, `argv.1`
+## Testing Approach
+Run tests using the test configuration:
+```bash
+npm start test
+```
-## Configuration
+This executes a sample pipeline defined in `etc/speechflow.yaml` with dashboard visualization.
-Main configuration in `etc/speechflow.yaml` with example
-processing graphs. Environment variables used for API keys (e.g.,
-`SPEECHFLOW_DEEPGRAM_KEY`, `SPEECHFLOW_ELEVENLABS_KEY`).
+## Important Patterns
+1. **Stream Processing**: All nodes extend `SpeechFlowNode` and implement `process()` method for stream transformation
+2. **Error Handling**: Nodes emit errors via stream events, captured and logged centrally
+3. **Timestamp Preservation**: Audio/text chunks maintain timing for synchronization across pipeline
+4. **Meta Information**: Chunks carry metadata (gender, final/interim status) for downstream filtering

package/etc/speechflow.yaml CHANGED Viewed

@@ -4,74 +4,74 @@
 #   Capture and meter audio from microphone device into WAV audio file
 capturing: |
-    device(device: "coreaudio:Elgato Wave:3", mode: "r") |
-        vad() |
-            meter(1000) |
-                wav(mode: "encode") |
-                    file(path: "capture.wav", mode: "w", type: "audio")
+    xio-device(device: env.SPEECHFLOW_DEVICE_MIC, mode: "r") |
+        a2a-vad() |
+            a2a-meter(1000) |
+                a2a-wav(mode: "encode") |
+                    xio-file(path: "capture.wav", mode: "w", type: "audio")
 #   Pass-through audio from microphone device to speaker
 #   device and in parallel record it to WAV audio file
 pass-through: |
-    device(device: "wasapi:VoiceMeeter Out B1", mode: "r") | {
-        wav(mode: "encode") |
-            file(path: "capture.wav", mode: "w", type: "audio"),
-        device(device: "wasapi:VoiceMeeter VAIO3 Input", mode: "w")
+    xio-device(device: env.SPEECHFLOW_DEVICE_MIC, mode: "r") | {
+        a2a-wav(mode: "encode") |
+            xio-file(path: "capture.wav", mode: "w", type: "audio"),
+        xio-device(device: env.SPEECHFLOW_DEVICE_SPK, mode: "w")
     }
 #   Generate text file with German transcription of German MP3 audio file
 transcription: |
-    file(path: argv.0, mode: "r", type: "audio") |
-        ffmpeg(src: "mp3", dst: "pcm") |
-            deepgram(language: "de") |
-                format(width: 80) |
-                    file(path: argv.1, mode: "w", type: "text")
+    xio-file(path: argv.0, mode: "r", type: "audio") |
+        a2a-ffmpeg(src: "mp3", dst: "pcm") |
+            a2t-deepgram(language: "de") |
+                t2t-format(width: 80) |
+                    xio-file(path: argv.1, mode: "w", type: "text")
 #   Generate WebVTT file with German closed captions of German MP3 audio file
 captioning: |
-    file(path: argv.0, mode: "r", type: "audio") |
-        ffmpeg(src: "mp3", dst: "pcm") |
-            deepgram(language: "de") |
-                subtitle(format: "vtt") |
-                    file(path: argv.1, mode: "w", type: "text")
+    xio-file(path: argv.0, mode: "r", type: "audio") |
+        a2a-ffmpeg(src: "mp3", dst: "pcm") |
+            a2t-deepgram(language: "de") |
+                t2t-subtitle(format: "vtt") |
+                    xio-file(path: argv.1, mode: "w", type: "text")
 #   Generate WebVTT file with English subtitles of German MP3 audio file
 subtitling: |
-    file(path: argv.0, mode: "r", type: "audio") |
-        ffmpeg(src: "mp3", dst: "pcm") |
-            deepgram(language: "de") |
-                deepl(src: "de", dst: "en") |
-                    subtitle(format: "vtt") |
-                        file(path: argv.1, mode: "w", type: "text")
+    xio-file(path: argv.0, mode: "r", type: "audio") |
+        a2a-ffmpeg(src: "mp3", dst: "pcm") |
+            a2t-deepgram(language: "de") |
+                t2t-deepl(src: "de", dst: "en") |
+                    t2t-subtitle(format: "vtt") |
+                        xio-file(path: argv.1, mode: "w", type: "text")
 #   Ad-Hoc text translation from German to English
 translation: |
-    file(path: "-", mode: "r", type: "text") |
-        deepl(src: "de", dst: "en") |
-            file(path: "-", mode: "w", type: "text")
+    xio-file(path: "-", mode: "r", type: "text") |
+        t2t-deepl(src: "de", dst: "en") |
+            xio-file(path: "-", mode: "w", type: "text")
 #   Generate audio file with English voice for a text file
 speaking: |
-    file(path: argv.0, mode: "r", type: "text") |
-        kokoro(language: "en") |
-            wav(mode: "encode") |
-                file(path: argv.1, mode: "w", type: "audio")
+    xio-file(path: argv.0, mode: "r", type: "text") |
+        t2a-kokoro(language: "en") |
+            a2a-wav(mode: "encode") |
+                xio-file(path: argv.1, mode: "w", type: "audio")
 #   Batch studio transcription from German to English,
 #   including the capturing of all involved inputs and outputs:
 studio-transcription: |
-    file(path: argv.0, mode: "r", type: "audio") | {
-        ffmpeg(src: "mp3", dst: "pcm") | {
-            deepgram(language: "de") | {
-                format(width: 80) |
-                    file(path: argv.1, mode: "w", type: "text"),
-                subtitle(format: "vtt") |
-                    file(path: argv.2, mode: "w", type: "text"),
-                subtitle(format: "srt") |
-                    file(path: argv.3, mode: "w", type: "text"),
-                elevenlabs(voice: "Mark", optimize: "quality", speed: 1.05, language: "en") |
-                    wav(mode: "encode") |
-                        file(path: argv.4, mode: "w", type: "audio")
+    xio-file(path: argv.0, mode: "r", type: "audio") | {
+        a2a-ffmpeg(src: "mp3", dst: "pcm") | {
+            a2t-deepgram(language: "de") | {
+                t2t-format(width: 80) |
+                    xio-file(path: argv.1, mode: "w", type: "text"),
+                t2t-subtitle(format: "vtt") |
+                    xio-file(path: argv.2, mode: "w", type: "text"),
+                t2t-subtitle(format: "srt") |
+                    xio-file(path: argv.3, mode: "w", type: "text"),
+                t2a-elevenlabs(voice: "Mark", optimize: "quality", speed: 1.05, language: "en") |
+                    a2a-wav(mode: "encode") |
+                        xio-file(path: argv.4, mode: "w", type: "audio")
             }
         }
     }
@@ -79,35 +79,35 @@ studio-transcription: |
 #   Real-time studio translation from German to English,
 #   including the capturing of all involved inputs and outputs:
 studio-translation: |
-    device(device: "coreaudio:Elgato Wave:3", mode: "r") | {
-        gender() | {
-            meter(interval: 250, dashboard: "meter1") |
-                wav(mode: "encode") |
-                    file(path: "program-de.wav", mode: "w", type: "audio"),
-            deepgram(language: "de", key: interim: true) | {
-                trace(name: "trace1", type: "text", dashboard: "text1")
-                subtitle(format: "vtt", words: true) |
-                    file(path: "program-de.vtt", mode: "w", type: "text"),
-                sentence() | {
-                    trace(name: "trace2", type: "text", notify: true, dashboard: "text2") |
-                        format(width: 80) |
-                            file(path: "program-de.txt", mode: "w", type: "text"),
-                    deepl(src: "de", dst: "en") | {
-                        trace(name: "trace3", type: "text", dashboard: "text3") | {
-                            format(width: 80) |
-                                file(path: "program-en.txt", mode: "w", type: "text"),
-                            subtitle(format: "vtt", words: false) |
-                                file(path: "program-en.vtt", mode: "w", type: "text"),
+    xio-device(device: env.SPEECHFLOW_DEVICE_MIC, mode: "r") | {
+        a2a-gender() | {
+            a2a-meter(interval: 250, dashboard: "meter1") |
+                a2a-wav(mode: "encode") |
+                    xio-file(path: "program-de.wav", mode: "w", type: "audio"),
+            a2t-deepgram(language: "de", interim: true) | {
+                x2x-trace(name: "trace1", type: "text", dashboard: "text1") |
+                t2t-subtitle(format: "vtt", words: true) |
+                    xio-file(path: "program-de.vtt", mode: "w", type: "text"),
+                t2t-sentence() | {
+                    x2x-trace(name: "trace2", type: "text", notify: true, dashboard: "text2") |
+                        t2t-format(width: 80) |
+                            xio-file(path: "program-de.txt", mode: "w", type: "text"),
+                    t2t-deepl(src: "de", dst: "en") | {
+                        x2x-trace(name: "trace3", type: "text", dashboard: "text3") | {
+                            t2t-format(width: 80) |
+                                xio-file(path: "program-en.txt", mode: "w", type: "text"),
+                            t2t-subtitle(format: "vtt", words: false) |
+                                xio-file(path: "program-en.vtt", mode: "w", type: "text"),
                             {
-                                filter(name: "S2T-male", type: "text", var: "meta:gender", op: "==", val: "male") |
-                                    elevenlabs(voice: "Mark", optimize: "latency", speed: 1.05, language: "en"),
-                                filter(name: "S2T-female", type: "text", var: "meta:gender", op: "==", val: "female") |
-                                    elevenlabs(voice: "Brittney", optimize: "latency", speed: 1.05, language: "en")
+                                x2x-filter(name: "S2T-male", type: "text", var: "meta:gender", op: "==", val: "male") |
+                                    t2a-elevenlabs(voice: "Mark", optimize: "latency", speed: 1.05, language: "en"),
+                                x2x-filter(name: "S2T-female", type: "text", var: "meta:gender", op: "==", val: "female") |
+                                    t2a-elevenlabs(voice: "Brittney", optimize: "latency", speed: 1.05, language: "en")
                             } | {
-                                meter(interval: 250, dashboard: "meter2"),
-                                wav(mode: "encode") |
-                                    file(path: "program-en.wav", mode: "w", type: "audio"),
-                                device(device: "coreaudio:USBAudio2.0", mode: "w")
+                                a2a-meter(interval: 250, dashboard: "meter2"),
+                                a2a-wav(mode: "encode") |
+                                    xio-file(path: "program-en.wav", mode: "w", type: "audio"),
+                                xio-device(device: env.SPEECHFLOW_DEVICE_SPK, mode: "w")
                             }
                         }
                     }
@@ -118,18 +118,18 @@ studio-translation: |
 #   Test-drive for development
 test: |
-    device(device: "coreaudio:Elgato Wave:3", mode: "r") |
-        meter(interval: 50, dashboard: "meter1") |
-            deepgram(language: "de", model: "nova-2", interim: true) |
-                trace(type: "text", dashboard: "text1") | {
-                    subtitle(mode: "render", addr: "127.0.0.1", port: 8585),
-                    filter(name: "final", type: "text", var: "kind", op: "==", val: "final") |
-                        sentence() |
-                            trace(type: "text", dashboard: "text2") |
-                                deepl(src: "de", dst: "en") |
-                                    trace(type: "text", dashboard: "text3") |
-                                        elevenlabs(voice: "Mark", optimize: "latency", speed: 1.05, language: "en") |
-                                            meter(interval: 50, dashboard: "meter2") |
-                                                device(device: "coreaudio:USBAudio2.0", mode: "w")
+    xio-device(device: env.SPEECHFLOW_DEVICE_MIC, mode: "r") |
+        a2a-meter(interval: 50, dashboard: "meter1") |
+            a2t-deepgram(language: "de", model: "nova-2", interim: true) |
+                x2x-trace(type: "text", dashboard: "text1") | {
+                    t2t-subtitle(mode: "render", addr: "127.0.0.1", port: 8585),
+                    x2x-filter(name: "final", type: "text", var: "kind", op: "==", val: "final") |
+                        t2t-sentence() |
+                            x2x-trace(type: "text", dashboard: "text2") |
+                                t2t-deepl(src: "de", dst: "en") |
+                                    x2x-trace(type: "text", dashboard: "text3") |
+                                        t2a-elevenlabs(voice: "Mark", optimize: "latency", speed: 1.05, language: "en") |
+                                            a2a-meter(interval: 50, dashboard: "meter2") |
+                                                xio-device(device: env.SPEECHFLOW_DEVICE_SPK, mode: "w")
                 }

package/package.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
     "name":             "speechflow",
-    "version":          "1.5.1",
-    "x-stdver":         "1.5.1-GA",
-    "x-release":        "2025-09-02",
+    "version":          "1.6.1",
+    "x-stdver":         "1.6.1-GA",
+    "x-release":        "2025-09-06",
     "homepage":         "https://github.com/rse/speechflow",
     "description":      "Speech Processing Flow Graph",
     "keywords":         [ "speech", "audio", "flow", "graph" ],

package/speechflow-cli/dst/speechflow-main-api.d.ts ADDED Viewed

@@ -0,0 +1,12 @@
+import CLIio from "cli-io";
+import { CLIOptions } from "./speechflow-main-cli";
+import { NodeGraph } from "./speechflow-main-graph";
+export declare class APIServer {
+    private cli;
+    private wsPeers;
+    private hapi;
+    private sendOSC;
+    constructor(cli: CLIio);
+    start(args: CLIOptions, graph: NodeGraph): Promise<void>;
+    stop(args: CLIOptions): Promise<void>;
+}