PyPI - npcpy - Versions diffs - 1.3.21__py3-none-any.whl → 1.3.23__py3-none-any.whl - Mend

npcpy 1.3.21__py3-none-any.whl → 1.3.23__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

npcpy/data/audio.py +58 -286
npcpy/data/image.py +15 -15
npcpy/data/web.py +2 -2
npcpy/gen/audio_gen.py +172 -2
npcpy/gen/image_gen.py +113 -62
npcpy/gen/response.py +239 -0
npcpy/llm_funcs.py +73 -71
npcpy/memory/command_history.py +117 -69
npcpy/memory/kg_vis.py +74 -74
npcpy/npc_compiler.py +261 -26
npcpy/npc_sysenv.py +4 -1
npcpy/serve.py +393 -91
npcpy/work/desktop.py +31 -5
npcpy-1.3.23.dist-info/METADATA +416 -0
{npcpy-1.3.21.dist-info → npcpy-1.3.23.dist-info}/RECORD +18 -18
npcpy-1.3.21.dist-info/METADATA +0 -1039
{npcpy-1.3.21.dist-info → npcpy-1.3.23.dist-info}/WHEEL +0 -0
{npcpy-1.3.21.dist-info → npcpy-1.3.23.dist-info}/licenses/LICENSE +0 -0
{npcpy-1.3.21.dist-info → npcpy-1.3.23.dist-info}/top_level.txt +0 -0

npcpy/data/audio.py CHANGED Viewed

@@ -6,45 +6,22 @@ import time
 import queue
 import re
 import json
 import subprocess
+import logging
+from typing import Optional, List, Dict, Any
+logger = logging.getLogger(__name__)
+# Audio constants
 try:
-    import torch
     import pyaudio
-    import wave
-    from typing import Optional, List, Dict, Any
-    from gtts import gTTS
-    from faster_whisper import WhisperModel
-    os.environ['PYGAME_HIDE_SUPPORT_PROMPT'] = "hide"
-    import pygame
     FORMAT = pyaudio.paInt16
-    CHANNELS = 1
-    RATE = 16000
-    CHUNK = 512
-    is_speaking = False
-    should_stop_speaking = False
-    tts_sequence = 0
-    recording_data = []
-    buffer_data = []
-    is_recording = False
-    last_speech_time = 0
-    running = True
-    audio_queue = queue.Queue()
-    tts_queue = queue.PriorityQueue()
-    cleanup_files = []
-    pygame.mixer.quit()
-    pygame.mixer.init(frequency=44100, size=-16, channels=2, buffer=512)
-except:
-    print("audio dependencies not installed")
+except ImportError:
+    FORMAT = 8  # paInt16 value fallback
+CHANNELS = 1
+RATE = 16000
+CHUNK = 512
 def convert_mp3_to_wav(mp3_file, wav_file):
@@ -90,49 +67,9 @@ def check_ffmpeg():
         return False
-def get_context_string():
-    context = []
-    for exchange in history:
-        context.append(f"User: {exchange['user']}")
-        context.append(f"Assistant: {exchange['assistant']}")
-    return "\n".join(context)
-def cleanup_temp_files():
-    global cleanup_files
-    for file in list(cleanup_files):
-        try:
-            if os.path.exists(file):
-                os.remove(file)
-                cleanup_files.remove(file)
-        except Exception:
-            pass
-def interrupt_speech():
-    global should_stop_speaking
-    should_stop_speaking = True
-    pygame.mixer.music.stop()
-    pygame.mixer.music.unload()
-    while not tts_queue.empty():
-        try:
-            _, temp_filename = tts_queue.get_nowait()
-            try:
-                if os.path.exists(temp_filename):
-                    os.remove(temp_filename)
-            except:
-                if temp_filename not in cleanup_files:
-                    cleanup_files.append(temp_filename)
-        except queue.Empty:
-            break
-    global tts_sequence
-    tts_sequence = 0
 def audio_callback(in_data, frame_count, time_info, status):
+    import pyaudio
+    audio_queue = queue.Queue()
     audio_queue.put(in_data)
     return (in_data, pyaudio.paContinue)
@@ -571,218 +508,67 @@ def get_available_stt_engines() -> dict:
-def load_history():
-    global history
-    try:
-        if os.path.exists(memory_file):
-            with open(memory_file, "r") as f:
-                history = json.load(f)
-    except Exception as e:
-        print(f"Error loading conversation history: {e}")
-        history = []
-def save_history():
-    try:
-        with open(memory_file, "w") as f:
-            json.dump(history, f)
-    except Exception as e:
-        print(f"Error saving conversation history: {e}")
-def add_exchange(user_input, assistant_response):
-    global history
-    exchange = {
-        "user": user_input,
-        "assistant": assistant_response,
-        "timestamp": time.time(),
-    }
-    history.append(exchange)
-    if len(history) > max_history:
-        history.pop(0)
-    save_history()
-def get_context_string():
-    context = []
-    for exchange in history:
-        context.append(f"User: {exchange['user']}")
-        context.append(f"Assistant: {exchange['assistant']}")
-    return "\n".join(context)
-def cleanup_temp_files():
-    global cleanup_files
-    for file in list(cleanup_files):
-        try:
-            if os.path.exists(file):
-                os.remove(file)
-                cleanup_files.remove(file)
-        except Exception:
-            pass
-def interrupt_speech():
-    global should_stop_speaking, response_generator, is_speaking, tts_sequence
-    should_stop_speaking = True
-    pygame.mixer.music.stop()
-    pygame.mixer.music.unload()
-    while not tts_queue.empty():
-        try:
-            _, temp_filename = tts_queue.get_nowait()
-            try:
-                if os.path.exists(temp_filename):
-                    os.remove(temp_filename)
-            except:
-                if temp_filename not in cleanup_files:
-                    cleanup_files.append(temp_filename)
-        except queue.Empty:
-            break
-    tts_sequence = 0
-    is_speaking = False
-def audio_callback(in_data, frame_count, time_info, status):
-    audio_queue.put(in_data)
-    return (in_data, pyaudio.paContinue)
-def play_audio_from_queue():
-    global is_speaking, cleanup_files, should_stop_speaking
-    next_sequence = 0
-    while True:
-        if should_stop_speaking:
-            pygame.mixer.music.stop()
-            pygame.mixer.music.unload()
-            while not tts_queue.empty():
-                try:
-                    _, temp_filename = tts_queue.get_nowait()
-                    try:
-                        if os.path.exists(temp_filename):
-                            os.remove(temp_filename)
-                    except:
-                        if temp_filename not in cleanup_files:
-                            cleanup_files.append(temp_filename)
-                except queue.Empty:
-                    break
-            next_sequence = 0
-            is_speaking = False
-            should_stop_speaking = False
-            time.sleep(0.1)
-            continue
-        try:
-            if not tts_queue.empty():
-                sequence, temp_filename = tts_queue.queue[0]
-                if sequence == next_sequence:
-                    sequence, temp_filename = tts_queue.get()
-                    is_speaking = True
-                    try:
-                        if len(cleanup_files) > 0 and not pygame.mixer.music.get_busy():
-                            cleanup_temp_files()
-                        if should_stop_speaking:
-                            continue
-                        pygame.mixer.music.load(temp_filename)
-                        pygame.mixer.music.play()
-                        while (
-                            pygame.mixer.music.get_busy() and not should_stop_speaking
-                        ):
-                            pygame.time.wait(50)
-                        pygame.mixer.music.unload()
-                    except Exception as e:
-                        print(f"Audio playback error: {str(e)}")
-                    finally:
-                        try:
-                            if os.path.exists(temp_filename):
-                                os.remove(temp_filename)
-                        except:
-                            if temp_filename not in cleanup_files:
-                                cleanup_files.append(temp_filename)
-                        if not should_stop_speaking:
-                            next_sequence += 1
-                        is_speaking = False
-            time.sleep(0.05)
-        except Exception:
-            time.sleep(0.05)
-import pygame
-from gtts import gTTS
-import tempfile
-import os
-import logging
-logging.basicConfig(level=logging.ERROR)
-logger = logging.getLogger(__name__)
-import pyaudio
-import wave
-from gtts import gTTS
-import tempfile
-import os
-import logging
+# =============================================================================
+# TTS Playback Helpers (use unified audio_gen.text_to_speech)
+# =============================================================================
-import tempfile
-import uuid
+def create_and_queue_audio(text, state, engine="kokoro", voice=None):
+    """Create and play TTS audio using the unified engine interface.
+    Args:
+        text: Text to speak
+        state: Dict with 'tts_is_speaking', 'tts_just_finished', 'running' keys
+        engine: TTS engine name (kokoro, qwen3, elevenlabs, openai, gemini, gtts)
+        voice: Voice ID (engine-specific)
+    """
+    import wave
+    import uuid
-def create_and_queue_audio(text, state):
-    """Create and queue audio with state awareness for TTS/recording coordination"""
     state["tts_is_speaking"] = True
     if not text.strip():
-        print("Empty text, skipping TTS")
         state["tts_is_speaking"] = False
         return
     try:
-        unique_id = uuid.uuid4()
-        with tempfile.TemporaryDirectory() as temp_dir:
-            mp3_file = os.path.join(temp_dir, f"temp_{unique_id}.mp3")
-            wav_file = os.path.join(temp_dir, f"temp_{unique_id}.wav")
+        from npcpy.gen.audio_gen import text_to_speech
+        audio_bytes = text_to_speech(text, engine=engine, voice=voice)
-            tts = gTTS(text=text, lang="en", slow=False)
-            tts.save(mp3_file)
+        # Write to temp file and play
+        suffix = '.mp3' if engine in ('elevenlabs', 'gtts') else '.wav'
+        tmp_path = os.path.join(tempfile.gettempdir(), f"npc_tts_{uuid.uuid4()}{suffix}")
+        with open(tmp_path, 'wb') as f:
+            f.write(audio_bytes)
-            convert_mp3_to_wav(mp3_file, wav_file)
+        play_path = tmp_path
+        if suffix == '.mp3':
+            wav_path = tmp_path.replace('.mp3', '.wav')
+            convert_mp3_to_wav(tmp_path, wav_path)
+            play_path = wav_path
-            play_audio(wav_file, state)
+        play_audio(play_path, state)
+        for p in set([tmp_path, play_path]):
+            try:
+                if os.path.exists(p):
+                    os.remove(p)
+            except Exception:
+                pass
     except Exception as e:
-        print(f"Error in TTS process: {e}")
+        logger.error(f"TTS error: {e}")
     finally:
         state["tts_is_speaking"] = False
         state["tts_just_finished"] = True
-        for file in [mp3_file, wav_file]:
-            try:
-                if os.path.exists(file):
-                    os.remove(file)
-            except Exception as e:
-                print(f"Error removing temporary file {file}: {e}")
 def play_audio(filename, state):
-    """Play audio with state awareness for TTS/recording coordination"""
-    CHUNK = 4096
+    """Play a WAV file via pyaudio with state awareness."""
+    import pyaudio
+    import wave
+    PLAY_CHUNK = 4096
     wf = wave.open(filename, "rb")
     p = pyaudio.PyAudio()
@@ -794,33 +580,19 @@ def play_audio(filename, state):
         output=True,
     )
-    data = wf.readframes(CHUNK)
-    while data and state["running"]:
+    data = wf.readframes(PLAY_CHUNK)
+    while data and state.get("running", True):
         stream.write(data)
-        data = wf.readframes(CHUNK)
+        data = wf.readframes(PLAY_CHUNK)
     stream.stop_stream()
     stream.close()
     p.terminate()
-    try:
-        os.unlink(filename)
-    except:
-        pass
-def process_response_chunk(text_chunk):
-    if not text_chunk.strip():
-        return
-    processed_text = process_text_for_tts(text_chunk)
-    create_and_queue_audio(processed_text)
 def process_text_for_tts(text):
-    text = re.sub(r"[*<>{}()\[\]&%")
+    """Clean text for TTS consumption."""
+    text = re.sub(r"[*<>{}()\[\]&%#@^~`]", "", text)
     text = text.strip()
     text = re.sub(r"(\w)\.(\w)\.", r"\1 \2 ", text)
     text = re.sub(r"([.!?])(\w)", r"\1 \2", text)

npcpy/data/image.py CHANGED Viewed

@@ -85,21 +85,21 @@ def capture_screenshot( full=False) -> Dict[str, str]:
             subprocess.run(["screencapture", file_path], capture_output=True)
         elif system == "Linux":
-            if (
-                subprocess.run(
-                    ["which", "gnome-screenshot"], capture_output=True
-                ).returncode
-                == 0
-            ):
-                subprocess.Popen(["gnome-screenshot", "-f", file_path])
-                while not os.path.exists(file_path):
-                    time.sleep(0.5)
-            elif (
-                subprocess.run(["which", "scrot"], capture_output=True).returncode == 0
-            ):
-                subprocess.Popen(["scrot", file_path])
-                while not os.path.exists(file_path):
-                    time.sleep(0.5)
+            _took = False
+            # Try non-interactive tools first
+            for _cmd, _args in [
+                ("grim", [file_path]),                    # Wayland
+                ("scrot", [file_path]),                   # X11, non-interactive full
+                ("import", ["-window", "root", file_path]),  # ImageMagick X11
+                ("gnome-screenshot", ["-f", file_path]),  # GNOME (may show dialog on newer versions)
+            ]:
+                if subprocess.run(["which", _cmd], capture_output=True).returncode == 0:
+                    subprocess.run([_cmd] + _args, capture_output=True, timeout=10)
+                    if os.path.exists(file_path):
+                        _took = True
+                        break
+            if not _took:
+                print("No supported screenshot tool found. Install scrot, grim, or imagemagick.")
         elif system == "Windows":

npcpy/data/web.py CHANGED Viewed

@@ -146,8 +146,8 @@ def search_perplexity(
 ):
     if api_key is None:
         api_key = os.environ.get("PERPLEXITY_API_KEY")
-        if api_key is None:
-            raise
+        if api_key is None:
+            raise ValueError("PERPLEXITY_API_KEY not set. Set it in your environment or ~/.npcshrc.")
     url = "https://api.perplexity.ai/chat/completions"

npcpy/gen/audio_gen.py CHANGED Viewed

@@ -4,6 +4,7 @@ Supports multiple TTS engines including real-time voice APIs.
 TTS Engines:
 - Kokoro: Local neural TTS (default)
+- Qwen3-TTS: Local high-quality multilingual TTS (0.6B/1.7B)
 - ElevenLabs: Cloud TTS with streaming
 - OpenAI: Realtime voice API
 - Gemini: Live API for real-time voice
@@ -13,6 +14,7 @@ Usage:
     from npcpy.gen.audio_gen import text_to_speech
     audio = text_to_speech("Hello world", engine="kokoro", voice="af_heart")
+    audio = text_to_speech("Hello world", engine="qwen3", voice="ryan")
 For STT, see npcpy.data.audio
 """
@@ -477,6 +479,155 @@ def get_gemini_voices() -> list:
     ]
+# =============================================================================
+# Qwen3-TTS (Local High-Quality Multilingual)
+# =============================================================================
+_qwen3_model_cache = {}
+def _get_qwen3_model(
+    model_size: str = "1.7B",
+    model_type: str = "custom_voice",
+    device: str = "auto",
+):
+    """Load and cache a Qwen3-TTS model."""
+    cache_key = (model_size, model_type, device)
+    if cache_key in _qwen3_model_cache:
+        return _qwen3_model_cache[cache_key]
+    import torch
+    from huggingface_hub import snapshot_download
+    if device == "auto":
+        if torch.cuda.is_available():
+            device = "cuda"
+        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+            device = "mps"
+        else:
+            device = "cpu"
+    dtype = torch.bfloat16 if device != "cpu" else torch.float32
+    size_tag = "0.6B" if "0.6" in model_size else "1.7B"
+    type_map = {
+        "custom_voice": f"Qwen/Qwen3-TTS-12Hz-{size_tag}-CustomVoice",
+        "base": f"Qwen/Qwen3-TTS-12Hz-{size_tag}-Base",
+        "voice_design": f"Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign",
+    }
+    repo_id = type_map.get(model_type, type_map["custom_voice"])
+    # Try local cache first, then download
+    cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "qwen-tts")
+    model_dir = os.path.join(cache_dir, repo_id.split("/")[-1])
+    if not os.path.exists(os.path.join(model_dir, "config.json")):
+        os.makedirs(cache_dir, exist_ok=True)
+        snapshot_download(repo_id=repo_id, local_dir=model_dir)
+    # Import the model class
+    try:
+        from qwen_tts import Qwen3TTSModel
+    except ImportError:
+        raise ImportError(
+            "qwen_tts package not found. Install from: "
+            "https://github.com/QwenLM/Qwen3-TTS or pip install qwen-tts"
+        )
+    model = Qwen3TTSModel.from_pretrained(
+        model_dir, device_map=device, dtype=dtype
+    )
+    # Clear old entries if switching configs
+    _qwen3_model_cache.clear()
+    _qwen3_model_cache[cache_key] = model
+    return model
+def tts_qwen3(
+    text: str,
+    voice: str = "ryan",
+    language: str = "auto",
+    model_size: str = "1.7B",
+    device: str = "auto",
+    speed: float = 1.0,
+    ref_audio: str = None,
+    ref_text: str = None,
+    instruct: str = None,
+) -> bytes:
+    """
+    Generate speech using Qwen3-TTS local model.
+    Supports three modes based on arguments:
+    - Custom voice (default): Use a preset speaker name
+    - Voice clone: Provide ref_audio (path) to clone a voice
+    - Voice design: Provide instruct (text description) to design a voice
+    Args:
+        text: Text to synthesize
+        voice: Speaker name for custom voice mode
+            (aiden, dylan, eric, ono_anna, ryan, serena, sohee, uncle_fu, vivian)
+        language: Language (auto, chinese, english, japanese, korean, french, etc.)
+        model_size: '0.6B' or '1.7B'
+        device: 'auto', 'cuda', 'mps', 'cpu'
+        speed: Speech speed (not directly supported, reserved)
+        ref_audio: Path to reference audio for voice cloning
+        ref_text: Transcript of reference audio (recommended for cloning)
+        instruct: Natural language voice description for voice design mode
+    Returns:
+        WAV audio bytes
+    """
+    import numpy as np
+    import soundfile as sf
+    if ref_audio:
+        model = _get_qwen3_model(model_size, "base", device)
+        wavs, sr = model.generate_voice_clone(
+            text=text,
+            language=language,
+            ref_audio=ref_audio,
+            ref_text=ref_text,
+        )
+    elif instruct:
+        model = _get_qwen3_model(model_size, "voice_design", device)
+        wavs, sr = model.generate_voice_design(
+            text=text,
+            language=language,
+            instruct=instruct,
+        )
+    else:
+        model = _get_qwen3_model(model_size, "custom_voice", device)
+        wavs, sr = model.generate_custom_voice(
+            text=text,
+            language=language,
+            speaker=voice.lower().replace(" ", "_"),
+        )
+    if not wavs:
+        raise ValueError("Qwen3-TTS generated no audio")
+    wav_buffer = io.BytesIO()
+    sf.write(wav_buffer, wavs[0], sr, format='WAV')
+    wav_buffer.seek(0)
+    return wav_buffer.read()
+def get_qwen3_voices() -> list:
+    """Get available Qwen3-TTS preset voices."""
+    return [
+        {"id": "aiden", "name": "Aiden", "gender": "male"},
+        {"id": "dylan", "name": "Dylan", "gender": "male"},
+        {"id": "eric", "name": "Eric", "gender": "male"},
+        {"id": "ryan", "name": "Ryan", "gender": "male"},
+        {"id": "serena", "name": "Serena", "gender": "female"},
+        {"id": "vivian", "name": "Vivian", "gender": "female"},
+        {"id": "sohee", "name": "Sohee", "gender": "female"},
+        {"id": "ono_anna", "name": "Ono Anna", "gender": "female"},
+        {"id": "uncle_fu", "name": "Uncle Fu", "gender": "male"},
+    ]
 # =============================================================================
 # gTTS (Google Text-to-Speech) - Fallback
 # =============================================================================
@@ -527,7 +678,7 @@ def text_to_speech(
     Args:
         text: Text to synthesize
-        engine: TTS engine (kokoro, elevenlabs, openai, gemini, gtts)
+        engine: TTS engine (kokoro, qwen3, elevenlabs, openai, gemini, gtts)
         voice: Voice ID (engine-specific)
         **kwargs: Engine-specific options
@@ -542,6 +693,10 @@ def text_to_speech(
         lang_code = voices.get(voice, {}).get("lang", "a")
         return tts_kokoro(text, voice=voice, lang_code=lang_code, **kwargs)
+    elif engine in ("qwen3", "qwen3-tts", "qwen"):
+        voice = voice or "ryan"
+        return tts_qwen3(text, voice=voice, **kwargs)
     elif engine == "elevenlabs":
         voice = voice or "JBFqnCBsd6RMkjVDRZzb"
         return tts_elevenlabs(text, voice_id=voice, **kwargs)
@@ -568,6 +723,8 @@ def get_available_voices(engine: str = "kokoro") -> list:
     if engine == "kokoro":
         return get_kokoro_voices()
+    elif engine in ("qwen3", "qwen3-tts", "qwen"):
+        return get_qwen3_voices()
     elif engine == "elevenlabs":
         return get_elevenlabs_voices()
     elif engine == "openai":
@@ -590,6 +747,13 @@ def get_available_engines() -> dict:
             "description": "Local neural TTS (82M params)",
             "install": "pip install kokoro soundfile"
         },
+        "qwen3": {
+            "name": "Qwen3-TTS",
+            "type": "local",
+            "available": False,
+            "description": "Local high-quality multilingual TTS (0.6B/1.7B)",
+            "install": "pip install qwen-tts torch torchaudio transformers"
+        },
         "elevenlabs": {
             "name": "ElevenLabs",
             "type": "cloud",
@@ -615,7 +779,7 @@ def get_available_engines() -> dict:
             "name": "Google TTS",
             "type": "cloud",
             "available": False,
-            "description": "Free Google TTS"
+            "description": "Free Google TTS (fallback)"
         }
     }
@@ -625,6 +789,12 @@ def get_available_engines() -> dict:
     except ImportError:
         pass
+    try:
+        from qwen_tts import Qwen3TTSModel
+        engines["qwen3"]["available"] = True
+    except ImportError:
+        pass
     if os.environ.get('ELEVENLABS_API_KEY'):
         engines["elevenlabs"]["available"] = True

npcpy 1.3.21__py3-none-any.whl → 1.3.23__py3-none-any.whl

npcpy 1.3.21__py3-none-any.whl → 1.3.23__py3-none-any.whl