zerg-ztc 0.1.10 → 0.1.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. package/dist/App.d.ts.map +1 -1
  2. package/dist/App.js +63 -2
  3. package/dist/App.js.map +1 -1
  4. package/dist/agent/commands/dictation.d.ts +3 -0
  5. package/dist/agent/commands/dictation.d.ts.map +1 -0
  6. package/dist/agent/commands/dictation.js +10 -0
  7. package/dist/agent/commands/dictation.js.map +1 -0
  8. package/dist/agent/commands/index.d.ts.map +1 -1
  9. package/dist/agent/commands/index.js +2 -1
  10. package/dist/agent/commands/index.js.map +1 -1
  11. package/dist/agent/commands/types.d.ts +7 -0
  12. package/dist/agent/commands/types.d.ts.map +1 -1
  13. package/dist/components/InputArea.d.ts +1 -0
  14. package/dist/components/InputArea.d.ts.map +1 -1
  15. package/dist/components/InputArea.js +591 -43
  16. package/dist/components/InputArea.js.map +1 -1
  17. package/dist/components/SingleMessage.d.ts.map +1 -1
  18. package/dist/components/SingleMessage.js +157 -7
  19. package/dist/components/SingleMessage.js.map +1 -1
  20. package/dist/config/types.d.ts +6 -0
  21. package/dist/config/types.d.ts.map +1 -1
  22. package/dist/ui/views/status_bar.js +2 -2
  23. package/dist/ui/views/status_bar.js.map +1 -1
  24. package/dist/utils/dictation.d.ts +46 -0
  25. package/dist/utils/dictation.d.ts.map +1 -0
  26. package/dist/utils/dictation.js +409 -0
  27. package/dist/utils/dictation.js.map +1 -0
  28. package/dist/utils/dictation_native.d.ts +51 -0
  29. package/dist/utils/dictation_native.d.ts.map +1 -0
  30. package/dist/utils/dictation_native.js +216 -0
  31. package/dist/utils/dictation_native.js.map +1 -0
  32. package/dist/utils/path_format.d.ts +20 -0
  33. package/dist/utils/path_format.d.ts.map +1 -0
  34. package/dist/utils/path_format.js +90 -0
  35. package/dist/utils/path_format.js.map +1 -0
  36. package/dist/utils/table.d.ts +38 -0
  37. package/dist/utils/table.d.ts.map +1 -0
  38. package/dist/utils/table.js +133 -0
  39. package/dist/utils/table.js.map +1 -0
  40. package/dist/utils/tool_trace.d.ts +7 -2
  41. package/dist/utils/tool_trace.d.ts.map +1 -1
  42. package/dist/utils/tool_trace.js +156 -51
  43. package/dist/utils/tool_trace.js.map +1 -1
  44. package/package.json +4 -1
  45. package/packages/ztc-dictation/Cargo.toml +43 -0
  46. package/packages/ztc-dictation/README.md +65 -0
  47. package/packages/ztc-dictation/bin/.gitkeep +0 -0
  48. package/packages/ztc-dictation/index.d.ts +16 -0
  49. package/packages/ztc-dictation/index.js +74 -0
  50. package/packages/ztc-dictation/package.json +41 -0
  51. package/packages/ztc-dictation/src/main.rs +430 -0
  52. package/src/App.tsx +98 -1
  53. package/src/agent/commands/dictation.ts +11 -0
  54. package/src/agent/commands/index.ts +2 -0
  55. package/src/agent/commands/types.ts +8 -0
  56. package/src/components/InputArea.tsx +606 -42
  57. package/src/components/SingleMessage.tsx +248 -9
  58. package/src/config/types.ts +7 -0
  59. package/src/ui/views/status_bar.ts +2 -2
  60. package/src/utils/dictation.ts +467 -0
  61. package/src/utils/dictation_native.ts +258 -0
  62. package/src/utils/path_format.ts +99 -0
  63. package/src/utils/table.ts +171 -0
  64. package/src/utils/tool_trace.ts +184 -54
package/packages/ztc-dictation/src/main.rs ADDED
@@ -0,0 +1,430 @@
+ use clap::Parser;
+ use cpal::traits::{DeviceTrait, HostTrait, StreamTrait};
+ use ringbuf::{HeapRb, traits::{Consumer, Producer, Split}};
+ use serde::Serialize;
+ use std::io::{self, Read, Write};
+ use std::path::PathBuf;
+ use std::sync::atomic::{AtomicBool, Ordering};
+ use std::sync::{Arc, Mutex};
+ use std::time::{Duration, Instant};
+ use whisper_rs::{FullParams, SamplingStrategy, WhisperContext, WhisperContextParameters};
+
+ const SAMPLE_RATE: u32 = 16000;
+ const CHANNELS: u16 = 1;
+
+ #[derive(Parser, Debug)]
+ #[command(name = "ztc-audio")]
+ #[command(about = "Real-time audio capture and transcription for ZTC")]
+ struct Args {
+     /// Whisper model to use: tiny, base, small, medium, large
+     #[arg(short, long, default_value = "base")]
+     model: String,
+
+     /// Just list audio devices and exit
+     #[arg(long)]
+     list_devices: bool,
+
+     /// Download model if not present and exit
+     #[arg(long)]
+     download_model: bool,
+
+     /// Audio input device name (uses default if not specified)
+     #[arg(short, long)]
+     device: Option<String>,
+ }
+
+ #[derive(Serialize)]
+ #[serde(tag = "type")]
+ enum OutputMessage {
+     #[serde(rename = "ready")]
+     Ready { device: String, model: String },
+
+     #[serde(rename = "level")]
+     Level { db: f32, rms: f32 },
+
+     #[serde(rename = "text")]
+     Text { content: String, partial: bool },
+
+     #[serde(rename = "error")]
+     Error { message: String },
+
+     #[serde(rename = "device")]
+     Device { name: String, is_default: bool },
+ }
+
+ fn emit(msg: &OutputMessage) {
+     if let Ok(json) = serde_json::to_string(msg) {
+         println!("{}", json);
+         let _ = io::stdout().flush();
+     }
+ }
+
+ fn get_model_path(model_name: &str) -> PathBuf {
+     let models_dir = dirs::home_dir()
+         .unwrap_or_else(|| PathBuf::from("."))
+         .join(".ztc")
+         .join("models");
+
+     std::fs::create_dir_all(&models_dir).ok();
+     models_dir.join(format!("ggml-{}.bin", model_name))
+ }
+
+ fn download_model(model_name: &str) -> Result<PathBuf, String> {
+     let path = get_model_path(model_name);
+
+     if path.exists() {
+         return Ok(path);
+     }
+
+     let url = format!(
+         "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-{}.bin",
+         model_name
+     );
+
+     eprintln!("Downloading whisper model '{}' from {}", model_name, url);
+
+     let response = ureq::get(&url)
+         .call()
+         .map_err(|e| format!("Failed to download model: {}", e))?;
+
+     let total_size = response
+         .header("Content-Length")
+         .and_then(|s| s.parse::<u64>().ok())
+         .unwrap_or(0);
+
+     let pb = indicatif::ProgressBar::new(total_size);
+     pb.set_style(
+         indicatif::ProgressStyle::default_bar()
+             .template("{spinner:.green} [{elapsed_precise}] [{bar:40.cyan/blue}] {bytes}/{total_bytes} ({eta})")
+             .unwrap()
+             .progress_chars("#>-"),
+     );
+
+     let mut reader = response.into_reader();
+     let mut file = std::fs::File::create(&path)
+         .map_err(|e| format!("Failed to create model file: {}", e))?;
+
+     let mut buffer = [0u8; 8192];
+     let mut downloaded = 0u64;
+
+     loop {
+         let bytes_read = reader.read(&mut buffer)
+             .map_err(|e| format!("Failed to read from network: {}", e))?;
+
+         if bytes_read == 0 {
+             break;
+         }
+
+         file.write_all(&buffer[..bytes_read])
+             .map_err(|e| format!("Failed to write to file: {}", e))?;
+
+         downloaded += bytes_read as u64;
+         pb.set_position(downloaded);
+     }
+
+     pb.finish_with_message("Download complete");
+     eprintln!("Model saved to {:?}", path);
+
+     Ok(path)
+ }
+
+ fn list_devices() {
+     let host = cpal::default_host();
+
+     if let Some(device) = host.default_input_device() {
+         if let Ok(name) = device.name() {
+             emit(&OutputMessage::Device {
+                 name,
+                 is_default: true,
+             });
+         }
+     }
+
+     if let Ok(devices) = host.input_devices() {
+         for device in devices {
+             if let Ok(name) = device.name() {
+                 emit(&OutputMessage::Device {
+                     name,
+                     is_default: false,
+                 });
+             }
+         }
+     }
+ }
+
+ fn calculate_db(samples: &[f32]) -> (f32, f32) {
+     if samples.is_empty() {
+         return (-60.0, 0.0);
+     }
+
+     let sum_squares: f32 = samples.iter().map(|s| s * s).sum();
+     let rms = (sum_squares / samples.len() as f32).sqrt();
+
+     // Convert to dB (with floor at -60dB)
+     let db = if rms > 0.0 {
+         20.0 * rms.log10()
+     } else {
+         -60.0
+     };
+
+     (db.max(-60.0), rms)
+ }
+
+ fn run_audio_capture(
+     device_name: Option<&str>,
+     model_path: PathBuf,
+ ) -> Result<(), String> {
+     let host = cpal::default_host();
+
+     // Find the audio device
+     let device = if let Some(name) = device_name {
+         host.input_devices()
+             .map_err(|e| format!("Failed to enumerate devices: {}", e))?
+             .find(|d| d.name().map(|n| n.contains(name)).unwrap_or(false))
+             .ok_or_else(|| format!("Device '{}' not found", name))?
+     } else {
+         host.default_input_device()
+             .ok_or_else(|| "No default input device".to_string())?
+     };
+
+     let device_name_str = device.name().unwrap_or_else(|_| "Unknown".to_string());
+
+     // Initialize Whisper
+     let ctx = WhisperContext::new_with_params(
+         model_path.to_str().unwrap(),
+         WhisperContextParameters::default(),
+     )
+     .map_err(|e| format!("Failed to load whisper model: {}", e))?;
+
+     // Get the device's default config and use its sample rate
+     let default_config = device.default_input_config()
+         .map_err(|e| format!("Failed to get default input config: {}", e))?;
+
+     let device_sample_rate = default_config.sample_rate().0;
+     let device_channels = default_config.channels();
+
+     emit(&OutputMessage::Ready {
+         device: device_name_str.clone(),
+         model: model_path.file_stem()
+             .and_then(|s| s.to_str())
+             .unwrap_or("unknown")
+             .to_string(),
+     });
+
+     // Use device's native sample rate and resample to 16kHz for Whisper
+     let config = cpal::StreamConfig {
+         channels: device_channels,
+         sample_rate: cpal::SampleRate(device_sample_rate),
+         buffer_size: cpal::BufferSize::Default,
+     };
+
+     let resample_ratio = SAMPLE_RATE as f32 / device_sample_rate as f32;
+
+     // Shared state
+     let running = Arc::new(AtomicBool::new(true));
+     let running_clone = running.clone();
+
+     // Ring buffer for audio samples (5 seconds of audio at 16kHz)
+     let ring_buffer = HeapRb::<f32>::new(SAMPLE_RATE as usize * 5);
+     let (mut producer, mut consumer) = ring_buffer.split();
+
+     // Buffer for level calculation (100ms chunks)
+     let level_buffer = Arc::new(Mutex::new(Vec::with_capacity(SAMPLE_RATE as usize / 10)));
+     let level_buffer_clone = level_buffer.clone();
+
+     // Resampling state - accumulate fractional samples
+     let resample_accum = Arc::new(Mutex::new(0.0f32));
+     let resample_accum_clone = resample_accum.clone();
+
+     // Handle Ctrl+C
+     ctrlc::set_handler(move || {
+         running_clone.store(false, Ordering::SeqCst);
+     })
+     .map_err(|e| format!("Failed to set signal handler: {}", e))?;
+
+     // Start audio capture
+     let stream = device
+         .build_input_stream(
+             &config,
+             move |data: &[f32], _: &cpal::InputCallbackInfo| {
+                 // Convert to mono and resample to 16kHz
+                 let num_channels = device_channels as usize;
+
+                 // First convert to mono by averaging channels
+                 let mono_samples: Vec<f32> = data
+                     .chunks(num_channels)
+                     .map(|frame| frame.iter().sum::<f32>() / num_channels as f32)
+                     .collect();
+
+                 // Simple linear resampling to 16kHz
+                 // For better quality, could use a proper resampler crate
+                 let mut accum = resample_accum_clone.lock().unwrap();
+                 let mut resampled = Vec::new();
+
+                 for &sample in &mono_samples {
+                     *accum += resample_ratio;
+                     while *accum >= 1.0 {
+                         resampled.push(sample);
+                         *accum -= 1.0;
+                     }
+                 }
+
+                 // Push resampled samples to ring buffer for transcription
+                 for sample in &resampled {
+                     let _ = producer.try_push(*sample);
+                 }
+
+                 // Also collect for level metering (use original mono samples for responsiveness)
+                 if let Ok(mut buf) = level_buffer_clone.lock() {
+                     buf.extend_from_slice(&mono_samples);
+                 }
+             },
+             |err| {
+                 emit(&OutputMessage::Error {
+                     message: format!("Audio stream error: {}", err),
+                 });
+             },
+             None,
+         )
+         .map_err(|e| format!("Failed to build input stream: {}", e))?;
+
+     stream.play().map_err(|e| format!("Failed to start stream: {}", e))?;
+
+     // Main processing loop
+     let mut accumulated_audio: Vec<f32> = Vec::new();
+     let mut last_level_time = Instant::now();
+     let mut last_transcribe_time = Instant::now();
+     let level_interval = Duration::from_millis(50); // 20 Hz level updates
+     let transcribe_interval = Duration::from_millis(500); // Transcribe every 500ms
+
+     while running.load(Ordering::SeqCst) {
+         // Read stdin for commands (non-blocking would be better but this works)
+         // For now, just rely on Ctrl+C / signal handling
+
+         // Emit audio levels
+         if last_level_time.elapsed() >= level_interval {
+             if let Ok(mut buf) = level_buffer.lock() {
+                 if !buf.is_empty() {
+                     let (db, rms) = calculate_db(&buf);
+                     emit(&OutputMessage::Level { db, rms });
+                     buf.clear();
+                 }
+             }
+             last_level_time = Instant::now();
+         }
+
+         // Collect audio for transcription
+         while let Some(sample) = consumer.try_pop() {
+             accumulated_audio.push(sample);
+         }
+
+         // Transcribe periodically
+         if last_transcribe_time.elapsed() >= transcribe_interval && !accumulated_audio.is_empty() {
+             // Run whisper on accumulated audio
+             let mut params = FullParams::new(SamplingStrategy::Greedy { best_of: 1 });
+             params.set_language(Some("en"));
+             params.set_print_special(false);
+             params.set_print_progress(false);
+             params.set_print_realtime(false);
+             params.set_print_timestamps(false);
+             params.set_single_segment(true);
+             params.set_no_context(true);
+
+             let mut state = ctx.create_state()
+                 .map_err(|e| format!("Failed to create whisper state: {}", e))?;
+
+             if state.full(params, &accumulated_audio).is_ok() {
+                 let num_segments = state.full_n_segments()
+                     .unwrap_or(0);
+
+                 let mut text = String::new();
+                 for i in 0..num_segments {
+                     if let Ok(segment) = state.full_get_segment_text(i) {
+                         text.push_str(&segment);
+                     }
+                 }
+
+                 let text = text.trim().to_string();
+                 if !text.is_empty() {
+                     emit(&OutputMessage::Text {
+                         content: text,
+                         partial: true,
+                     });
+                 }
+             }
+
+             last_transcribe_time = Instant::now();
+         }
+
+         std::thread::sleep(Duration::from_millis(10));
+     }
+
+     // Final transcription
+     if !accumulated_audio.is_empty() {
+         let mut params = FullParams::new(SamplingStrategy::Greedy { best_of: 1 });
+         params.set_language(Some("en"));
+         params.set_print_special(false);
+         params.set_print_progress(false);
+         params.set_print_realtime(false);
+         params.set_print_timestamps(false);
+
+         let mut state = ctx.create_state()
+             .map_err(|e| format!("Failed to create whisper state: {}", e))?;
+
+         if state.full(params, &accumulated_audio).is_ok() {
+             let num_segments = state.full_n_segments().unwrap_or(0);
+
+             let mut text = String::new();
+             for i in 0..num_segments {
+                 if let Ok(segment) = state.full_get_segment_text(i) {
+                     text.push_str(&segment);
+                 }
+             }
+
+             let text = text.trim().to_string();
+             emit(&OutputMessage::Text {
+                 content: text,
+                 partial: false,
+             });
+         }
+     }
+
+     Ok(())
+ }
+
+ fn main() {
+     let args = Args::parse();
+
+     if args.list_devices {
+         list_devices();
+         return;
+     }
+
+     if args.download_model {
+         match download_model(&args.model) {
+             Ok(path) => {
+                 eprintln!("Model ready at {:?}", path);
+             }
+             Err(e) => {
+                 emit(&OutputMessage::Error { message: e });
+                 std::process::exit(1);
+             }
+         }
+         return;
+     }
+
+     // Ensure model is downloaded
+     let model_path = match download_model(&args.model) {
+         Ok(path) => path,
+         Err(e) => {
+             emit(&OutputMessage::Error { message: e });
+             std::process::exit(1);
+         }
+     };
+
+     // Run audio capture and transcription
+     if let Err(e) = run_audio_capture(args.device.as_deref(), model_path) {
+         emit(&OutputMessage::Error { message: e });
+         std::process::exit(1);
+     }
+ }
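
The sidecar binary communicates with the Node process over newline-delimited JSON on stdout: each OutputMessage above serializes to one line, discriminated by its "type" field. The sketch below shows how a consumer might read that stream. The helper name listen and the hard-coded --model flag are illustrative only; the shipped dictation_native.ts wires these events into Ink state and may differ in detail.

// Sketch of a consumer for the binary's line-delimited JSON protocol.
// Hypothetical: listen() is not an API of this package.
import { spawn } from 'node:child_process';
import { createInterface } from 'node:readline';

type DictationMessage =
  | { type: 'ready'; device: string; model: string }
  | { type: 'level'; db: number; rms: number }
  | { type: 'text'; content: string; partial: boolean }
  | { type: 'error'; message: string }
  | { type: 'device'; name: string; is_default: boolean };

function listen(binaryPath: string, onMessage: (msg: DictationMessage) => void): () => void {
  const child = spawn(binaryPath, ['--model', 'base']);
  const rl = createInterface({ input: child.stdout });
  rl.on('line', (line) => {
    try {
      onMessage(JSON.parse(line) as DictationMessage); // one JSON object per line
    } catch {
      // ignore non-JSON noise; download progress goes to stderr anyway
    }
  });
  // SIGINT mirrors Ctrl+C: the binary emits a final non-partial "text" message, then exits
  return () => child.kill('SIGINT');
}
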
package/src/App.tsx CHANGED
@@ -1,5 +1,5 @@
  import React, { useState, useCallback, useMemo, useRef } from 'react';
- import { Box, useApp, useInput, Static } from 'ink';
+ import { Box, Text, useApp, useInput, Static } from 'ink';
  import { Header, MessageList, SingleMessage, InputArea, StatusBar, FullScreen, ActivityLine, useScreenSize } from './components/index.js';
  import { buildAppView } from './ui/views/app.js';
  import { useMirror } from './web/mirror_hook.js';
@@ -25,11 +25,35 @@ import { checkForUpdate } from './utils/update.js';
  import { getVersion } from './utils/version.js';
  import { DEFAULT_SPINNER_VERBS } from './utils/spinner_verbs.js';
  import { SPINNER_FRAMES } from './utils/spinner_frames.js';
+ import { getRecordingDeviceName } from './utils/dictation.js';
+ import {
+   isNativeDictationAvailable,
+   isNativeRecording,
+   onDictationEvent,
+   startNativeRecording,
+   stopNativeRecording,
+   cancelNativeRecording,
+   DictationEvent
+ } from './utils/dictation_native.js';

  // --- Utilities ---

  const generateId = () => Math.random().toString(36).slice(2, 11);

+ /**
+  * Render a VU meter from dB level
+  * @param db Audio level in dB (-60 to 0)
+  * @returns ASCII VU meter string
+  */
+ function renderVUMeter(db: number): string {
+   // Map -60dB..0dB to 0..10 bars
+   const normalized = Math.max(0, Math.min(1, (db + 60) / 60));
+   const bars = Math.round(normalized * 10);
+   const filled = '█'.repeat(bars);
+   const empty = '░'.repeat(10 - bars);
+   return `[${filled}${empty}]`;
+ }
+
  // --- Initial welcome message ---

  function getWelcomeMessage(): Message {
@@ -112,6 +136,12 @@ export const App: React.FC = () => {
    const [retryAvailable, setRetryAvailable] = useState(false);
    const [toast, setToast] = useState<string | null>(null);
    const toastTimerRef = useRef<NodeJS.Timeout | null>(null);
+   const [isRecording, setIsRecording] = useState(false);
+   const [recordingDevice, setRecordingDevice] = useState<string | null>(null);
+   const [isTranscribing, setIsTranscribing] = useState(false);
+   const [audioLevel, setAudioLevel] = useState<number>(-60); // dB
+   const [partialTranscript, setPartialTranscript] = useState<string>('');
+   const [useNativeDictation] = useState(isNativeDictationAvailable());
    const [spinnerLabel, setSpinnerLabel] = useState<string | null>(null);
    const [spinnerFrame, setSpinnerFrame] = useState<string | null>(null);
    const streamingMessageId = React.useRef<string | null>(null);
@@ -715,6 +745,48 @@ export const App: React.FC = () => {
      }, 2500);
    }, []);

+   const handleDictationStateChange = useCallback((state: 'idle' | 'recording' | 'transcribing') => {
+     setIsRecording(state === 'recording');
+     setIsTranscribing(state === 'transcribing');
+     if (state === 'recording') {
+       setRecordingDevice(getRecordingDeviceName());
+     } else if (state === 'idle') {
+       setRecordingDevice(null);
+       setAudioLevel(-60);
+       setPartialTranscript('');
+     }
+   }, []);
+
+   // Native dictation event handler
+   React.useEffect(() => {
+     if (!useNativeDictation) return;
+
+     const cleanup = onDictationEvent((event: DictationEvent) => {
+       switch (event.type) {
+         case 'ready':
+           setRecordingDevice(event.device || 'Unknown');
+           break;
+         case 'level':
+           setAudioLevel(event.db ?? -60);
+           break;
+         case 'text':
+           if (event.partial) {
+             setPartialTranscript(event.content || '');
+           }
+           break;
+         case 'stopped':
+           setIsRecording(false);
+           setAudioLevel(-60);
+           break;
+         case 'error':
+           showToast(`Dictation error: ${event.message}`);
+           break;
+       }
+     });
+
+     return cleanup;
+   }, [useNativeDictation, showToast]);
+
    useMirror(layoutTree, inputBus);

    // Compute messages for Static (completed) vs live (streaming)
@@ -772,12 +844,37 @@ export const App: React.FC = () => {
        inputMode={inputMode}
      />

+     {isRecording && (
+       <Box flexDirection="column" paddingLeft={2}>
+         <Box>
+           <Text color="red">● Recording</Text>
+           {recordingDevice && (
+             <Text color="gray"> [{recordingDevice}]</Text>
+           )}
+           <Text color="gray"> </Text>
+           <Text color="cyan">{renderVUMeter(audioLevel)}</Text>
+           <Text color="gray"> — Ctrl+R to stop</Text>
+         </Box>
+         {partialTranscript && (
+           <Box>
+             <Text color="gray" italic>"{partialTranscript}"</Text>
+           </Box>
+         )}
+       </Box>
+     )}
+     {isTranscribing && !useNativeDictation && (
+       <Box paddingLeft={2}>
+         <Text color="yellow">⠋ Transcribing...</Text>
+       </Box>
+     )}
+
      <InputArea
        onSubmit={handleSubmit}
        onCommand={handleCommand}
        commands={commands}
        onStateChange={setInputSnapshot}
        onToast={showToast}
+       onDictationStateChange={handleDictationStateChange}
        cols={columns}
        inputBus={inputBus}
        disabled={false}
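
Worked example for the new VU meter: at a level of -30 dB, (db + 60) / 60 normalizes to 0.5, so renderVUMeter fills 5 of 10 cells and returns [█████░░░░░]; the -60 dB idle floor emitted by the Rust side renders as a fully empty bar.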
package/src/agent/commands/dictation.ts ADDED
@@ -0,0 +1,11 @@
+ import { Command } from './types.js';
+ import { getDictationStatus } from '../../utils/dictation.js';
+
+ export const dictationStatusCommand: Command = {
+   name: 'dictation',
+   description: 'Check voice dictation status and availability',
+   handler: async (args, ctx) => {
+     const status = getDictationStatus();
+     ctx.addMessage({ role: 'system', content: status });
+   }
+ };
package/src/agent/commands/index.ts CHANGED
@@ -15,6 +15,7 @@ import { retryCommand } from './retry.js';
  import { inputModeCommand } from './input_mode.js';
  import { keybindingsCommand } from './keybindings.js';
  import { updateCommand } from './update.js';
+ import { dictationStatusCommand } from './dictation.js';
  import { Command } from './types.js';

  const commandList: Command[] = [];
@@ -39,6 +40,7 @@ commandList.push(
    updateCommand,
    inputModeCommand,
    retryCommand,
+   dictationStatusCommand,
    exitCommand
  );

package/src/agent/commands/types.ts CHANGED
@@ -53,6 +53,12 @@ export interface SkillsController
    list: () => Promise<Skill[]>;
  }

+ export interface DictationController {
+   startRecording: () => void;
+   stopRecording: () => Promise<string>; // Returns transcribed text
+   isRecording: () => boolean;
+ }
+
  export interface CommandContext {
    addMessage: (msg: Omit<Message, 'id' | 'timestamp'>) => void;
    clearMessages: () => void;
@@ -68,8 +74,10 @@ export interface CommandContext
    clipboard: ClipboardController;
    models: ModelsController;
    skills: SkillsController;
+   dictation?: DictationController;
    getInputMode: () => 'queue' | 'interrupt';
    setInputMode: (mode: 'queue' | 'interrupt') => void;
+   setInputText?: (text: string) => void; // Set input field text
  }

  export interface Command {
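
For illustration, the new optional dictation controller and setInputText setter compose into a toggle-style command. The following sketch is hypothetical and is not part of the 0.1.11 release.

// Hypothetical sketch only: not shipped in 0.1.11.
import { Command } from './types.js';

export const transcribeCommand: Command = {
  name: 'transcribe',
  description: 'Toggle dictation and place the transcript in the input field',
  handler: async (_args, ctx) => {
    const dictation = ctx.dictation;
    if (!dictation) {
      ctx.addMessage({ role: 'system', content: 'Dictation is not available.' });
      return;
    }
    if (!dictation.isRecording()) {
      dictation.startRecording(); // run the command again to stop
    } else {
      const text = await dictation.stopRecording(); // resolves with the transcribed text
      ctx.setInputText?.(text); // optional setter introduced in this release
    }
  }
};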