npcsh 0.3.31-py3-none-any.whl → 0.3.32-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- npcsh/audio.py +540 -181
- npcsh/audio_gen.py +1 -0
- npcsh/cli.py +8 -10
- npcsh/conversation.py +14 -251
- npcsh/dataframes.py +13 -5
- npcsh/helpers.py +5 -0
- npcsh/image.py +2 -2
- npcsh/image_gen.py +38 -38
- npcsh/knowledge_graph.py +4 -4
- npcsh/llm_funcs.py +517 -349
- npcsh/npc_compiler.py +32 -23
- npcsh/npc_sysenv.py +5 -0
- npcsh/plonk.py +2 -2
- npcsh/response.py +131 -482
- npcsh/search.py +5 -1
- npcsh/serve.py +210 -203
- npcsh/shell.py +11 -25
- npcsh/shell_helpers.py +489 -99
- npcsh/stream.py +87 -554
- npcsh/video.py +5 -2
- npcsh/video_gen.py +69 -0
- npcsh-0.3.32.dist-info/METADATA +779 -0
- {npcsh-0.3.31.dist-info → npcsh-0.3.32.dist-info}/RECORD +49 -47
- npcsh-0.3.31.dist-info/METADATA +0 -1853
- {npcsh-0.3.31.data → npcsh-0.3.32.data}/data/npcsh/npc_team/bash_executer.tool +0 -0
- {npcsh-0.3.31.data → npcsh-0.3.32.data}/data/npcsh/npc_team/calculator.tool +0 -0
- {npcsh-0.3.31.data → npcsh-0.3.32.data}/data/npcsh/npc_team/celona.npc +0 -0
- {npcsh-0.3.31.data → npcsh-0.3.32.data}/data/npcsh/npc_team/code_executor.tool +0 -0
- {npcsh-0.3.31.data → npcsh-0.3.32.data}/data/npcsh/npc_team/corca.npc +0 -0
- {npcsh-0.3.31.data → npcsh-0.3.32.data}/data/npcsh/npc_team/eriane.npc +0 -0
- {npcsh-0.3.31.data → npcsh-0.3.32.data}/data/npcsh/npc_team/foreman.npc +0 -0
- {npcsh-0.3.31.data → npcsh-0.3.32.data}/data/npcsh/npc_team/generic_search.tool +0 -0
- {npcsh-0.3.31.data → npcsh-0.3.32.data}/data/npcsh/npc_team/image_generation.tool +0 -0
- {npcsh-0.3.31.data → npcsh-0.3.32.data}/data/npcsh/npc_team/lineru.npc +0 -0
- {npcsh-0.3.31.data → npcsh-0.3.32.data}/data/npcsh/npc_team/local_search.tool +0 -0
- {npcsh-0.3.31.data → npcsh-0.3.32.data}/data/npcsh/npc_team/maurawa.npc +0 -0
- {npcsh-0.3.31.data → npcsh-0.3.32.data}/data/npcsh/npc_team/npcsh.ctx +0 -0
- {npcsh-0.3.31.data → npcsh-0.3.32.data}/data/npcsh/npc_team/npcsh_executor.tool +0 -0
- {npcsh-0.3.31.data → npcsh-0.3.32.data}/data/npcsh/npc_team/raone.npc +0 -0
- {npcsh-0.3.31.data → npcsh-0.3.32.data}/data/npcsh/npc_team/screen_cap.tool +0 -0
- {npcsh-0.3.31.data → npcsh-0.3.32.data}/data/npcsh/npc_team/sibiji.npc +0 -0
- {npcsh-0.3.31.data → npcsh-0.3.32.data}/data/npcsh/npc_team/slean.npc +0 -0
- {npcsh-0.3.31.data → npcsh-0.3.32.data}/data/npcsh/npc_team/sql_executor.tool +0 -0
- {npcsh-0.3.31.data → npcsh-0.3.32.data}/data/npcsh/npc_team/test_pipeline.py +0 -0
- {npcsh-0.3.31.data → npcsh-0.3.32.data}/data/npcsh/npc_team/turnic.npc +0 -0
- {npcsh-0.3.31.data → npcsh-0.3.32.data}/data/npcsh/npc_team/welxor.npc +0 -0
- {npcsh-0.3.31.dist-info → npcsh-0.3.32.dist-info}/WHEEL +0 -0
- {npcsh-0.3.31.dist-info → npcsh-0.3.32.dist-info}/entry_points.txt +0 -0
- {npcsh-0.3.31.dist-info → npcsh-0.3.32.dist-info}/licenses/LICENSE +0 -0
- {npcsh-0.3.31.dist-info → npcsh-0.3.32.dist-info}/top_level.txt +0 -0
npcsh/shell_helpers.py
CHANGED
@@ -1,6 +1,8 @@
 import os
 import pandas as pd
 
+import threading
+
 from typing import Dict, Any, List, Optional, Union
 import numpy as np
 import readline
@@ -25,12 +27,46 @@ import signal
 import platform
 import time
 
+import tempfile
+
+
+# Global variables
+running = True
+is_recording = False
+recording_data = []
+buffer_data = []
+last_speech_time = 0
+
 
 try:
     import whisper
-
+    from faster_whisper import WhisperModel
+    from gtts import gTTS
+    import torch
+    import pyaudio
+    import wave
+    import queue
+
+    from npcsh.audio import (
+        cleanup_temp_files,
+        FORMAT,
+        CHANNELS,
+        RATE,
+        device,
+        vad_model,
+        CHUNK,
+        whisper_model,
+        transcribe_recording,
+        convert_mp3_to_wav,
+    )
+
+
+except Exception as e:
     print(
-        "
+        "Exception: "
+        + str(e)
+        + "\n"
+        + "Could not load the whisper package. If you want to use tts/stt features, please run `pip install npcsh[audio]` and follow the instructions in the npcsh github readme to ensure your OS can handle the audio dependencies."
     )
 try:
     from sentence_transformers import SentenceTransformer
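Note: the whole audio stack (faster-whisper, gTTS, torch, pyaudio, and the npcsh.audio helpers) now loads inside a single try/except, so the shell still imports cleanly when the optional `npcsh[audio]` extra is absent. A minimal sketch of the same guarded-import pattern; the `AUDIO_AVAILABLE` flag and `require_audio` helper below are illustrative names, not part of npcsh:

    # Optional-dependency import guard (sketch, not npcsh's actual API).
    AUDIO_AVAILABLE = True
    try:
        import pyaudio                      # heavy, optional dependency
        from faster_whisper import WhisperModel
    except Exception as exc:
        AUDIO_AVAILABLE = False
        _AUDIO_IMPORT_ERROR = exc           # keep the cause for diagnostics

    def require_audio() -> None:
        # Call at the top of any tts/stt entry point.
        if not AUDIO_AVAILABLE:
            raise RuntimeError(
                f"Audio dependencies missing ({_AUDIO_IMPORT_ERROR}); "
                "install with `pip install npcsh[audio]`."
            )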
@@ -40,8 +76,15 @@ except:
         "Could not load the sentence-transformers package. If you want to use it or other local AI features, please run `pip install npcsh[local]` ."
     )
 
-from .load_data import
-
+from npcsh.load_data import (
+    load_pdf,
+    load_csv,
+    load_json,
+    load_excel,
+    load_txt,
+    load_image,
+)
+from npcsh.npc_sysenv import (
     get_model_and_provider,
     get_available_models,
     get_system_message,
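Note: this hunk and the import hunks that follow switch every relative import (`from .load_data import ...`) to an absolute one (`from npcsh.load_data import ...`). Relative imports only resolve when the module is loaded as part of the package; a file executed directly has no parent package and raises ImportError. An illustrative contrast, using a path from this diff:

    # `python -m npcsh.shell_helpers` gives the module a package context,
    # so both styles resolve. `python npcsh/shell_helpers.py` does not:
    #   from .load_data import load_csv
    #   -> ImportError: attempted relative import with no known parent package
    from npcsh.load_data import load_csv   # resolves in both invocations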
@@ -53,16 +96,18 @@ from .npc_sysenv import (
     NPCSH_VISION_PROVIDER,
     NPCSH_IMAGE_GEN_MODEL,
     NPCSH_IMAGE_GEN_PROVIDER,
+    NPCSH_VIDEO_GEN_MODEL,
+    NPCSH_VIDEO_GEN_PROVIDER,
 )
-from .command_history import (
+from npcsh.command_history import (
     CommandHistory,
     save_attachment_to_message,
     save_conversation_message,
     start_new_conversation,
 )
-from .embeddings import search_similar_texts, chroma_client
+from npcsh.embeddings import search_similar_texts, chroma_client
 
-from .llm_funcs import (
+from npcsh.llm_funcs import (
     execute_llm_command,
     execute_llm_question,
     get_stream,
@@ -70,13 +115,14 @@ from .llm_funcs import (
     get_llm_response,
     check_llm_command,
     generate_image,
+    generate_video,
     get_embeddings,
     get_stream,
 )
-from .plonk import plonk, action_space
-from .helpers import get_db_npcs, get_npc_path
+from npcsh.plonk import plonk, action_space
+from npcsh.helpers import get_db_npcs, get_npc_path
 
-from .npc_compiler import (
+from npcsh.npc_compiler import (
     NPCCompiler,
     NPC,
     load_npc_from_file,
@@ -86,10 +132,10 @@ from .npc_compiler import (
 )
 
 
-from .search import rag_search, search_web
-from .image import capture_screenshot, analyze_image
+from npcsh.search import rag_search, search_web
+from npcsh.image import capture_screenshot, analyze_image
 
-from .audio import calibrate_silence, record_audio, speak_text
+# from npcsh.audio import calibrate_silence, record_audio, speak_text
 from rich.console import Console
 from rich.markdown import Markdown
 from rich.syntax import Syntax
@@ -566,11 +612,11 @@ def setup_readline() -> str:
 
     readline.set_history_length(1000)
     readline.parse_and_bind("set enable-bracketed-paste on")  # Enable paste mode
-    readline.parse_and_bind('"\e[A": history-search-backward')
-    readline.parse_and_bind('"\e[B": history-search-forward')
-    readline.parse_and_bind('"\C-r": reverse-search-history')
-    readline.parse_and_bind(
-    readline.parse_and_bind(
+    readline.parse_and_bind(r'"\e[A": history-search-backward')
+    readline.parse_and_bind(r'"\e[B": history-search-forward')
+    readline.parse_and_bind(r'"\C-r": reverse-search-history')
+    readline.parse_and_bind(r'\C-e: end-of-line')
+    readline.parse_and_bind(r'\C-a: beginning-of-line')
 
     return history_file
 
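Note: the rebound key sequences are now raw strings. `\e` and `\C` are not valid Python escapes, so a normal string keeps the backslash but triggers a compile-time warning (DeprecationWarning, a visible SyntaxWarning on recent Pythons); the raw string produces identical bytes without the warning. The two truncated `readline.parse_and_bind(` calls from the old version are also replaced with complete Ctrl-E/Ctrl-A bindings. Illustration:

    import readline

    # Identical byte sequences; only the raw-string form compiles cleanly.
    readline.parse_and_bind('"\e[A": history-search-backward')   # warns: invalid escape '\e'
    readline.parse_and_bind(r'"\e[A": history-search-backward')  # no warning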
@@ -1788,8 +1834,8 @@ def execute_slash_command(
     log_action("Command Executed", command)
 
     command_parts = command.split()
-    command_name = command_parts[0]
-    args = command_parts[1:]
+    command_name = command_parts[0] if len(command_parts) >= 1 else None
+    args = command_parts[1:] if len(command_parts) >= 1 else []
 
     current_npc = npc
     if command_name in valid_npcs:
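Note: `str.split()` on an empty or all-whitespace command returns `[]`, so the old unconditional `command_parts[0]` raised IndexError for a bare `/`. The guarded form yields None and [] instead. Strictly only the first guard is needed, since slicing an empty list is already safe:

    parts = "   ".split()       # -> []
    # parts[0]                  # -> IndexError: list index out of range
    name = parts[0] if len(parts) >= 1 else None
    args = parts[1:]            # [] even without a guard: slicing never raises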
@@ -1972,6 +2018,7 @@ def execute_slash_command(
             command_parts, model=model, provider=provider, npc=npc, api_url=api_url
         )
     elif command_name == "help":  # New help command
+        print(get_help())
         return {
             "messages": messages,
             "output": get_help(),
@@ -2003,6 +2050,17 @@ def execute_slash_command(
         output = execute_rag_command(command, messages=messages)
         messages = output["messages"]
         output = output["output"]
+    elif command_name == "roll":
+
+        output = generate_video(
+            command,
+            model=NPCSH_VIDEO_GEN_MODEL,
+            provider=NPCSH_VIDEO_GEN_PROVIDER,
+            npc=npc,
+            messages=messages,
+        )
+        messages = output["messages"]
+        output = output["output"]
 
     elif command_name == "set":
         parts = command.split()
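Note: the new `/roll` branch wires the shell to the video-generation path added in `npcsh/video_gen.py` (the `+69 -0` entry in the file list) and follows the same handler contract as the other slash commands: the call returns a dict whose `messages` key carries the updated history and whose `output` key carries the printable result. A hedged sketch of consuming that contract; the keyword signature is taken from this hunk, while the prompt and surrounding code are illustrative:

    # Sketch: how execute_slash_command consumes generate_video's contract.
    result = generate_video(
        "/roll a timelapse of a harbor at dusk",   # hypothetical prompt
        model=NPCSH_VIDEO_GEN_MODEL,
        provider=NPCSH_VIDEO_GEN_PROVIDER,
        npc=npc,
        messages=messages,
    )
    messages = result["messages"]   # history including the new exchange
    print(result["output"])         # path or status of the generated video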
@@ -2073,13 +2131,15 @@ def execute_slash_command(
         files = None
 
         if len(command_parts) >= 2 and command_parts[1] == "reattach":
+            command_history = CommandHistory()
             last_conversation = command_history.get_last_conversation_by_path(
                 os.getcwd()
             )
             print(last_conversation)
             if last_conversation:
                 spool_context = [
-                    {"role": part[
+                    {"role": part["role"], "content": part["content"]}
+                    for part in last_conversation
                 ]
 
                 print(f"Reattached to previous conversation:\n\n")
@@ -2742,98 +2802,428 @@ def enter_whisper_mode(
     npc: Any = None,
     spool=False,
     continuous=False,
-    stream=
-
-    ""
-
-
-
-
-
-
-
-    """
+    stream=True,
+    tts_model="kokoro",
+    voice="af_heart",  # Default voice,
+) -> Dict[str, Any]:
+    # Initialize state
+    running = True
+    is_recording = False
+    recording_data = []
+    buffer_data = []
+    last_speech_time = 0
 
-
-    model = whisper.load_model("base")
-    except Exception as e:
-        return f"Error: Unable to load Whisper model due to {str(e)}"
+    print("Entering whisper mode. Initializing...")
 
-
-
-
-    else
-
+    # Update the system message to encourage concise responses
+    concise_instruction = "Please provide brief responses of 1-2 sentences unless the user specifically asks for more detailed information. Keep responses clear and concise."
+
+    model = select_model() if npc is None else npc.model or NPCSH_CHAT_MODEL
+    provider = (
+        NPCSH_CHAT_PROVIDER if npc is None else npc.provider or NPCSH_CHAT_PROVIDER
+    )
+    api_url = NPCSH_API_URL if npc is None else npc.api_url or NPCSH_API_URL
+
+    print(f"\nUsing model: {model} with provider: {provider}")
+
+    system_message = get_system_message(npc) if npc else "You are a helpful assistant."
+
+    # Add conciseness instruction to the system message
+    system_message = system_message + " " + concise_instruction
 
     if messages is None:
-        messages = [
+        messages = [{"role": "system", "content": system_message}]
+    elif messages and messages[0]["role"] == "system":
+        # Update the existing system message
+        messages[0]["content"] = messages[0]["content"] + " " + concise_instruction
+    else:
+        messages.insert(0, {"role": "system", "content": system_message})
 
-
-
-
-
+    kokoro_pipeline = None
+    if tts_model == "kokoro":
+        try:
+            from kokoro import KPipeline
+            import soundfile as sf
+
+            kokoro_pipeline = KPipeline(lang_code="a")
+            print("Kokoro TTS model initialized")
+        except ImportError:
+            print("Kokoro not installed, falling back to gTTS")
+            tts_model = "gtts"
+
+    # Initialize PyAudio
+    pyaudio_instance = pyaudio.PyAudio()
+    audio_stream = None  # We'll open and close as needed
+    transcription_queue = queue.Queue()
+
+    # Create and properly use the is_speaking event
+    is_speaking = threading.Event()
+    is_speaking.clear()  # Not speaking initially
+
+    speech_queue = queue.Queue(maxsize=20)
+    speech_thread_active = threading.Event()
+    speech_thread_active.set()
+
+    def speech_playback_thread():
+        nonlocal running, audio_stream
+
+        while running and speech_thread_active.is_set():
+            try:
+                # Get next speech item from queue
+                if not speech_queue.empty():
+                    text_to_speak = speech_queue.get(timeout=0.1)
+
+                    # Only process if there's text to speak
+                    if text_to_speak.strip():
+                        # IMPORTANT: Set is_speaking flag BEFORE starting audio output
+                        is_speaking.set()
+
+                        # Safely close the audio input stream before speaking
+                        current_audio_stream = audio_stream
+                        audio_stream = (
+                            None  # Set to None to prevent capture thread from using it
+                        )
+
+                        if current_audio_stream and current_audio_stream.is_active():
+                            current_audio_stream.stop_stream()
+                            current_audio_stream.close()
+
+                        print(f"Speaking full response...")
+
+                        # Generate and play speech
+                        generate_and_play_speech(text_to_speak)
+
+                        # Delay after speech to prevent echo
+                        time.sleep(0.005 * len(text_to_speak))
+                        print(len(text_to_speak))
+
+                        # Clear the speaking flag to allow listening again
+                        is_speaking.clear()
+                else:
+                    time.sleep(0.5)
+            except Exception as e:
+                print(f"Error in speech thread: {e}")
+                is_speaking.clear()  # Make sure to clear the flag if there's an error
+                time.sleep(0.1)
+
+    def safely_close_audio_stream(stream):
+        """Safely close an audio stream with error handling"""
+        if stream:
+            try:
+                if stream.is_active():
+                    stream.stop_stream()
+                stream.close()
+            except Exception as e:
+                print(f"Error closing audio stream: {e}")
+
+    # Start speech thread
+    speech_thread = threading.Thread(target=speech_playback_thread)
+    speech_thread.daemon = True
+    speech_thread.start()
+
+    def generate_and_play_speech(text):
+        try:
+            # Create a temporary file for audio
+            unique_id = str(time.time()).replace(".", "")
+            temp_dir = tempfile.gettempdir()
+            wav_file = os.path.join(temp_dir, f"temp_{unique_id}.wav")
+
+            # Generate speech based on selected TTS model
+            if tts_model == "kokoro" and kokoro_pipeline:
+                # Use Kokoro for generation
+                generator = kokoro_pipeline(text, voice=voice)
+
+                # Get the audio from the generator
+                for _, _, audio in generator:
+                    # Save audio to WAV file
+                    import soundfile as sf
+
+                    sf.write(wav_file, audio, 24000)
+                    break  # Just use the first chunk for now
+            else:
+                # Fall back to gTTS
+                mp3_file = os.path.join(temp_dir, f"temp_{unique_id}.mp3")
+                tts = gTTS(text=text, lang="en", slow=False)
+                tts.save(mp3_file)
+                convert_mp3_to_wav(mp3_file, wav_file)
+
+            # Play the audio
+            wf = wave.open(wav_file, "rb")
+            p = pyaudio.PyAudio()
+
+            stream = p.open(
+                format=p.get_format_from_width(wf.getsampwidth()),
+                channels=wf.getnchannels(),
+                rate=wf.getframerate(),
+                output=True,
+            )
+
+            data = wf.readframes(4096)
+            while data and running:
+                stream.write(data)
+                data = wf.readframes(4096)
+
+            stream.stop_stream()
+            stream.close()
+            p.terminate()
+
+            # Cleanup temp files
+            try:
+                if os.path.exists(wav_file):
+                    os.remove(wav_file)
+                if tts_model == "gtts" and "mp3_file" in locals():
+                    if os.path.exists(mp3_file):
+                        os.remove(mp3_file)
+            except Exception as e:
+                print(f"Error removing temp file: {e}")
+
+        except Exception as e:
+            print(f"Error in TTS process: {e}")
+
+    # Modified speak_text function that just queues text
+    def speak_text(text):
+        speech_queue.put(text)
+
+    def process_input(user_input):
+        nonlocal messages
+
+        # Add user message
+        messages.append({"role": "user", "content": user_input})
+
+        # Process with LLM and collect the ENTIRE response first
+        try:
+            full_response = ""
+
+            # Use get_stream for streaming response
+            check = check_llm_command(
+                user_input,
+                npc=npc,
+                messages=messages,
+                model=model,
+                provider=provider,
+                stream=True,
+                whisper=True,
+            )
+
+            # Collect the entire response first
+            for chunk in check:
+                if chunk:
+                    chunk_content = "".join(
+                        choice.delta.content
+                        for choice in chunk.choices
+                        if choice.delta.content is not None
+                    )
+
+                    full_response += chunk_content
+
+                    # Show progress in console
+                    print(chunk_content, end="", flush=True)
+
+            print("\n")  # End the progress display
+
+            # Process and speak the entire response at once
+            if full_response.strip():
+                processed_text = process_text_for_tts(full_response)
+                speak_text(processed_text)
+
+            # Add assistant's response to messages
+            messages.append({"role": "assistant", "content": full_response})
+
+        except Exception as e:
+            print(f"Error in LLM response: {e}")
+            speak_text("I'm sorry, there was an error processing your request.")
+
+    # Function to capture and process audio
+    def capture_audio():
+        nonlocal is_recording, recording_data, buffer_data, last_speech_time, running, is_speaking
+        nonlocal audio_stream, transcription_queue
+
+        # Don't try to record if we're speaking
+        if is_speaking.is_set():
+            return False
+
+        try:
+            # Only create a new audio stream if we don't have one
+            if audio_stream is None and not is_speaking.is_set():
+                audio_stream = pyaudio_instance.open(
+                    format=FORMAT,
+                    channels=CHANNELS,
+                    rate=RATE,
+                    input=True,
+                    frames_per_buffer=CHUNK,
+                )
+
+                # Initialize or reset the recording variables
+                is_recording = False
+                recording_data = []
+                buffer_data = []
+
+                print("\nListening for speech...")
+
+            while (
+                running
+                and audio_stream
+                and audio_stream.is_active()
+                and not is_speaking.is_set()
+            ):
+                try:
+                    data = audio_stream.read(CHUNK, exception_on_overflow=False)
+                    if data:
+                        audio_array = np.frombuffer(data, dtype=np.int16)
+                        audio_float = audio_array.astype(np.float32) / 32768.0
+
+                        tensor = torch.from_numpy(audio_float).to(device)
+                        speech_prob = vad_model(tensor, RATE).item()
+                        current_time = time.time()
+
+                        if speech_prob > 0.5:  # VAD threshold
+                            last_speech_time = current_time
+                            if not is_recording:
+                                is_recording = True
+                                print("\nSpeech detected, listening...")
+                                recording_data.extend(buffer_data)
+                                buffer_data = []
+                            recording_data.append(data)
+                        else:
+                            if is_recording:
+                                if (
+                                    current_time - last_speech_time > 1
+                                ):  # silence duration
+                                    is_recording = False
+                                    print("Speech ended, transcribing...")
+
+                                    # Stop stream before transcribing
+                                    safely_close_audio_stream(audio_stream)
+                                    audio_stream = None
+
+                                    # Transcribe in this thread to avoid race conditions
+                                    transcription = transcribe_recording(recording_data)
+                                    if transcription:
+                                        transcription_queue.put(transcription)
+                                    recording_data = []
+                                    return True  # Got speech
+                            else:
+                                buffer_data.append(data)
+                                if len(buffer_data) > int(
+                                    0.65 * RATE / CHUNK
+                                ):  # buffer duration
+                                    buffer_data.pop(0)
+
+                    # Check frequently if we need to stop capturing
+                    if is_speaking.is_set():
+                        safely_close_audio_stream(audio_stream)
+                        audio_stream = None
+                        return False
+
+                except Exception as e:
+                    print(f"Error processing audio frame: {e}")
+                    time.sleep(0.1)
+
+        except Exception as e:
+            print(f"Error in audio capture: {e}")
+
+        # Close stream if we exit without finding speech
+        safely_close_audio_stream(audio_stream)
+        audio_stream = None
+
+        return False
+
+    def process_text_for_tts(text):
+        # Remove special characters that might cause issues in TTS
+        text = re.sub(r"[*<>{}()\[\]&%#@^_=+~]", "", text)
+        text = text.strip()
+        # Add spaces after periods that are followed by words (for better pronunciation)
+        text = re.sub(r"(\w)\.(\w)\.", r"\1 \2 ", text)
+        text = re.sub(r"([.!?])(\w)", r"\1 \2", text)
+        return text
+
+    # Now that functions are defined, play welcome messages
+    speak_text("Entering whisper mode. Please wait.")
 
     try:
-        silence_threshold = calibrate_silence()
-    except Exception as e:
-        return f"Error: Unable to calibrate silence due to {str(e)}"
 
-
-        "Ready. Speak after seeing 'Listening...'. Say 'exit' or type '/wq' to quit."
-    )
-    speak_text("Whisper mode activated. Ready for your input.")
+        while running:
 
-
-
-
-            wf = wave.open(temp_audio.name, "wb")
-            wf.setnchannels(1)
-            wf.setsampwidth(2)
-            wf.setframerate(16000)
-            wf.writeframes(audio_data)
-            wf.close()
-
-            result = model.transcribe(temp_audio.name)
-            text = result["text"].strip()
-            print(f"You said: {text}")
-            os.unlink(temp_audio.name)
-
-            messages.append({"role": "user", "content": text})  # Add user message
-            if text.lower() in ["exit", "/wq"]:
-                whisper_output.append("Exiting whisper mode.")
-                speak_text("Exiting whisper mode. Goodbye!")
-                break
-            if not spool:
-                llm_response = check_llm_command(
-                    text, npc=npc, messages=messages, stream=stream
-                )  # Use
+            # First check for typed input (non-blocking)
+            import select
+            import sys
 
-
-
-
-
-
-
-                    model=model,
-                    provider=provider,
-                    npc=npc,
+            # Don't spam the console with prompts when speaking
+            if not is_speaking.is_set():
+                print(
+                    "\Speak or type your message (or 'exit' to quit): ",
+                    end="",
+                    flush=True,
                 )
+
+                rlist, _, _ = select.select([sys.stdin], [], [], 0.1)
+                if rlist:
+                    user_input = sys.stdin.readline().strip()
+                    if user_input.lower() in ("exit", "quit", "goodbye"):
+                        print("\nExiting whisper mode.")
+                        break
+                    if user_input:
+                        print(f"\nYou (typed): {user_input}")
+                        process_input(user_input)
+                        continue  # Skip audio capture this cycle
+
+                # Then try to capture some audio (if no typed input)
+                if not is_speaking.is_set():  # Only capture if not currently speaking
+                    got_speech = capture_audio()
+
+                    # If we got speech, process it
+                    if got_speech:
+                        try:
+                            transcription = transcription_queue.get_nowait()
+                            print(f"\nYou (spoke): {transcription}")
+                            process_input(transcription)
+                        except queue.Empty:
+                            pass
             else:
-
-
-                    model=model,
-                    provider=provider,
-                    npc=npc,
-                )
+                # If we're speaking, just wait a bit without spamming the console
+                time.sleep(0.1)
 
-
-        print(
-
-
-
-
+    except KeyboardInterrupt:
+        print("\nInterrupted by user.")
+
+    finally:
+        # Set running to False to signal threads to exit
+        running = False
+        speech_thread_active.clear()
+
+        # Clean up audio resources
+        safely_close_audio_stream(audio_stream)
 
-
+    if pyaudio_instance:
+        pyaudio_instance.terminate()
+
+    print("\nExiting whisper mode.")
+    speak_text("Exiting whisper mode. Goodbye!")
+    time.sleep(1)
+    cleanup_temp_files()
+
+    return {"messages": messages, "output": "Whisper mode session ended."}
+
+
+def get_context_string(messages):
+    context = []
+    for message in messages[-5:]:  # Get last 5 messages for context
+        role = message.get("role", "")
+        content = message.get("content", "")
+        context.append(f"{role.capitalize()}: {content}")
+    return "\n".join(context)
+
+
+def input_with_timeout(prompt, timeout=0.1):
+    """Non-blocking input function with a timeout."""
+    import select
+    import sys
+
+    print(prompt, end="", flush=True)
+    rlist, _, _ = select.select([sys.stdin], [], [], timeout)
+    if rlist:
+        return sys.stdin.readline().strip()
+    return None
 
 
 def enter_notes_mode(npc: Any = None) -> None:
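Note: the rewritten enter_whisper_mode replaces the old calibrate/record/transcribe loop with three cooperating pieces: a daemon playback thread fed by `speech_queue`, an `is_speaking` Event that gates capture so the assistant does not transcribe its own voice, and a pre-roll buffer in `capture_audio` that keeps roughly 0.65 s of audio from before the VAD fires, so the first syllable is not clipped. A standalone sketch of that pre-roll pattern; it uses `collections.deque` in place of the diff's list-plus-`pop(0)` (same behavior, O(1) eviction), the RATE/CHUNK values are assumed, and the silence timeout that ends a recording is omitted for brevity:

    from collections import deque

    RATE, CHUNK = 16000, 512                 # assumed capture settings
    PREROLL = int(0.65 * RATE / CHUNK)       # ~0.65 s worth of chunks

    preroll = deque(maxlen=PREROLL)          # bounded: oldest chunk falls off
    recording: list[bytes] = []
    is_recording = False

    def feed(chunk: bytes, is_speech: bool) -> None:
        """Feed one audio chunk together with its VAD decision."""
        global is_recording
        if is_speech:
            if not is_recording:
                is_recording = True
                recording.extend(preroll)    # prepend the buffered lead-in
                preroll.clear()
            recording.append(chunk)
        else:
            preroll.append(chunk)            # keep a sliding pre-speech window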
|