npcsh-0.3.30-py3-none-any.whl → npcsh-0.3.32-py3-none-any.whl
This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their public registries.
- npcsh/audio.py +540 -181
- npcsh/audio_gen.py +1 -0
- npcsh/cli.py +37 -19
- npcsh/conversation.py +14 -251
- npcsh/dataframes.py +13 -5
- npcsh/helpers.py +5 -0
- npcsh/image.py +2 -4
- npcsh/image_gen.py +38 -38
- npcsh/knowledge_graph.py +4 -4
- npcsh/llm_funcs.py +517 -349
- npcsh/npc_compiler.py +44 -23
- npcsh/npc_sysenv.py +5 -0
- npcsh/npc_team/npcsh.ctx +8 -2
- npcsh/npc_team/tools/generic_search.tool +9 -1
- npcsh/plonk.py +2 -2
- npcsh/response.py +131 -482
- npcsh/search.py +20 -9
- npcsh/serve.py +210 -203
- npcsh/shell.py +78 -80
- npcsh/shell_helpers.py +513 -102
- npcsh/stream.py +87 -554
- npcsh/video.py +5 -2
- npcsh/video_gen.py +69 -0
- {npcsh-0.3.30.data → npcsh-0.3.32.data}/data/npcsh/npc_team/generic_search.tool +9 -1
- {npcsh-0.3.30.data → npcsh-0.3.32.data}/data/npcsh/npc_team/npcsh.ctx +8 -2
- npcsh-0.3.32.dist-info/METADATA +779 -0
- npcsh-0.3.32.dist-info/RECORD +78 -0
- npcsh-0.3.30.dist-info/METADATA +0 -1862
- npcsh-0.3.30.dist-info/RECORD +0 -76
- {npcsh-0.3.30.data → npcsh-0.3.32.data}/data/npcsh/npc_team/bash_executer.tool +0 -0
- {npcsh-0.3.30.data → npcsh-0.3.32.data}/data/npcsh/npc_team/calculator.tool +0 -0
- {npcsh-0.3.30.data → npcsh-0.3.32.data}/data/npcsh/npc_team/celona.npc +0 -0
- {npcsh-0.3.30.data → npcsh-0.3.32.data}/data/npcsh/npc_team/code_executor.tool +0 -0
- {npcsh-0.3.30.data → npcsh-0.3.32.data}/data/npcsh/npc_team/corca.npc +0 -0
- {npcsh-0.3.30.data → npcsh-0.3.32.data}/data/npcsh/npc_team/eriane.npc +0 -0
- {npcsh-0.3.30.data → npcsh-0.3.32.data}/data/npcsh/npc_team/foreman.npc +0 -0
- {npcsh-0.3.30.data → npcsh-0.3.32.data}/data/npcsh/npc_team/image_generation.tool +0 -0
- {npcsh-0.3.30.data → npcsh-0.3.32.data}/data/npcsh/npc_team/lineru.npc +0 -0
- {npcsh-0.3.30.data → npcsh-0.3.32.data}/data/npcsh/npc_team/local_search.tool +0 -0
- {npcsh-0.3.30.data → npcsh-0.3.32.data}/data/npcsh/npc_team/maurawa.npc +0 -0
- {npcsh-0.3.30.data → npcsh-0.3.32.data}/data/npcsh/npc_team/npcsh_executor.tool +0 -0
- {npcsh-0.3.30.data → npcsh-0.3.32.data}/data/npcsh/npc_team/raone.npc +0 -0
- {npcsh-0.3.30.data → npcsh-0.3.32.data}/data/npcsh/npc_team/screen_cap.tool +0 -0
- {npcsh-0.3.30.data → npcsh-0.3.32.data}/data/npcsh/npc_team/sibiji.npc +0 -0
- {npcsh-0.3.30.data → npcsh-0.3.32.data}/data/npcsh/npc_team/slean.npc +0 -0
- {npcsh-0.3.30.data → npcsh-0.3.32.data}/data/npcsh/npc_team/sql_executor.tool +0 -0
- {npcsh-0.3.30.data → npcsh-0.3.32.data}/data/npcsh/npc_team/test_pipeline.py +0 -0
- {npcsh-0.3.30.data → npcsh-0.3.32.data}/data/npcsh/npc_team/turnic.npc +0 -0
- {npcsh-0.3.30.data → npcsh-0.3.32.data}/data/npcsh/npc_team/welxor.npc +0 -0
- {npcsh-0.3.30.dist-info → npcsh-0.3.32.dist-info}/WHEEL +0 -0
- {npcsh-0.3.30.dist-info → npcsh-0.3.32.dist-info}/entry_points.txt +0 -0
- {npcsh-0.3.30.dist-info → npcsh-0.3.32.dist-info}/licenses/LICENSE +0 -0
- {npcsh-0.3.30.dist-info → npcsh-0.3.32.dist-info}/top_level.txt +0 -0
npcsh/shell_helpers.py
CHANGED
@@ -1,6 +1,8 @@
 import os
 import pandas as pd
 
+import threading
+
 from typing import Dict, Any, List, Optional, Union
 import numpy as np
 import readline
@@ -25,12 +27,46 @@ import signal
 import platform
 import time
 
+import tempfile
+
+
+# Global variables
+running = True
+is_recording = False
+recording_data = []
+buffer_data = []
+last_speech_time = 0
+
 
 try:
     import whisper
-
+    from faster_whisper import WhisperModel
+    from gtts import gTTS
+    import torch
+    import pyaudio
+    import wave
+    import queue
+
+    from npcsh.audio import (
+        cleanup_temp_files,
+        FORMAT,
+        CHANNELS,
+        RATE,
+        device,
+        vad_model,
+        CHUNK,
+        whisper_model,
+        transcribe_recording,
+        convert_mp3_to_wav,
+    )
+
+
+except Exception as e:
     print(
-        "
+        "Exception: "
+        + str(e)
+        + "\n"
+        + "Could not load the whisper package. If you want to use tts/stt features, please run `pip install npcsh[audio]` and follow the instructions in the npcsh github readme to ensure your OS can handle the audio dependencies."
     )
 try:
     from sentence_transformers import SentenceTransformer
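The audio stack above is imported inside a single try block so that a missing optional dependency degrades to a printed install hint instead of crashing the shell at import time. A minimal sketch of the same pattern (the `AUDIO_AVAILABLE` flag is illustrative, not an npcsh symbol):

```python
# Sketch of the optional-dependency import pattern used above.
try:
    import whisper  # any of the optional audio imports may raise here

    AUDIO_AVAILABLE = True
except Exception as e:
    AUDIO_AVAILABLE = False
    print(
        f"Exception: {e}\n"
        "Could not load the whisper package. Install the audio extras "
        "with `pip install npcsh[audio]` to enable tts/stt features."
    )
```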
@@ -40,8 +76,15 @@ except:
         "Could not load the sentence-transformers package. If you want to use it or other local AI features, please run `pip install npcsh[local]` ."
     )
 
-from .load_data import
-from .npc_sysenv import (
+from npcsh.load_data import (
+    load_pdf,
+    load_csv,
+    load_json,
+    load_excel,
+    load_txt,
+    load_image,
+)
+from npcsh.npc_sysenv import (
     get_model_and_provider,
     get_available_models,
     get_system_message,
@@ -53,16 +96,18 @@ from .npc_sysenv import (
     NPCSH_VISION_PROVIDER,
     NPCSH_IMAGE_GEN_MODEL,
     NPCSH_IMAGE_GEN_PROVIDER,
+    NPCSH_VIDEO_GEN_MODEL,
+    NPCSH_VIDEO_GEN_PROVIDER,
 )
-from .command_history import (
+from npcsh.command_history import (
     CommandHistory,
     save_attachment_to_message,
     save_conversation_message,
     start_new_conversation,
 )
-from .embeddings import search_similar_texts, chroma_client
+from npcsh.embeddings import search_similar_texts, chroma_client
 
-from .llm_funcs import (
+from npcsh.llm_funcs import (
     execute_llm_command,
     execute_llm_question,
     get_stream,
@@ -70,13 +115,14 @@ from .llm_funcs import (
     get_llm_response,
     check_llm_command,
     generate_image,
+    generate_video,
     get_embeddings,
     get_stream,
 )
-from .plonk import plonk, action_space
-from .helpers import get_db_npcs, get_npc_path
+from npcsh.plonk import plonk, action_space
+from npcsh.helpers import get_db_npcs, get_npc_path
 
-from .npc_compiler import (
+from npcsh.npc_compiler import (
     NPCCompiler,
     NPC,
     load_npc_from_file,
@@ -86,10 +132,10 @@ from .npc_compiler import (
 )
 
 
-from .search import rag_search, search_web
-from .image import capture_screenshot, analyze_image
+from npcsh.search import rag_search, search_web
+from npcsh.image import capture_screenshot, analyze_image
 
-from .audio import calibrate_silence, record_audio, speak_text
+# from npcsh.audio import calibrate_silence, record_audio, speak_text
 from rich.console import Console
 from rich.markdown import Markdown
 from rich.syntax import Syntax
@@ -566,11 +612,11 @@ def setup_readline() -> str:
 
     readline.set_history_length(1000)
     readline.parse_and_bind("set enable-bracketed-paste on")  # Enable paste mode
-    readline.parse_and_bind('"\e[A": history-search-backward')
-    readline.parse_and_bind('"\e[B": history-search-forward')
-    readline.parse_and_bind('"\C-r": reverse-search-history')
-    readline.parse_and_bind(
-    readline.parse_and_bind(
+    readline.parse_and_bind(r'"\e[A": history-search-backward')
+    readline.parse_and_bind(r'"\e[B": history-search-forward')
+    readline.parse_and_bind(r'"\C-r": reverse-search-history')
+    readline.parse_and_bind(r'\C-e: end-of-line')
+    readline.parse_and_bind(r'\C-a: beginning-of-line')
 
     return history_file
 
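The switch to raw strings matters because `\e` and `\C` are not valid Python escape sequences: CPython currently preserves the backslash but emits a DeprecationWarning (a SyntaxWarning as of Python 3.12), and the behavior is documented to become an error in a future release. The `r` prefix hands readline the literal `\e[A` / `\C-r` sequences unambiguously. A small illustration:

```python
# With the r-prefix, the backslash sequences readline expects survive intact.
binding = r'"\e[A": history-search-backward'
print(binding)       # "\e[A": history-search-backward
print(len(r"\C-r"))  # 4 -- backslash, C, hyphen, r; nothing is consumed

# Without the prefix, '"\e[A": ...' happens to produce the same characters
# today, but only because the invalid escape is preserved with a warning.
```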
@@ -1631,6 +1677,9 @@ def ots(
         output = analyze_image(
             user_prompt, file_path, filename, npc=npc, model=model, provider=provider
         )
+        messages = [
+            {"role": "user", "content": user_prompt},
+        ]
 
     else:
         output = capture_screenshot(npc=npc)
@@ -1651,12 +1700,15 @@ def ots(
         # messages = output["messages"]
 
         output = output["response"]
-
+        messages = [
+            {"role": "user", "content": user_prompt},
+        ]
     if output:
         if isinstance(output, dict) and "filename" in output:
             message = f"Screenshot captured: {output['filename']}\nFull path: {output['file_path']}\nLLM-ready data available."
         else:  # This handles both LLM responses and error messages (both strings)
             message = output
+        messages.append({"role": "assistant", "content": message})
         return {"messages": messages, "output": message}  # Return the message
     else:  # Handle the case where capture_screenshot returns None
         print("Screenshot capture failed.")
@@ -1782,8 +1834,8 @@ def execute_slash_command(
     log_action("Command Executed", command)
 
     command_parts = command.split()
-    command_name = command_parts[0]
-    args = command_parts[1:]
+    command_name = command_parts[0] if len(command_parts) >= 1 else None
+    args = command_parts[1:] if len(command_parts) >= 1 else []
 
     current_npc = npc
     if command_name in valid_npcs:
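The guard fixes a crash on bare or whitespace-only slash commands: `str.split()` with no arguments collapses all whitespace, so it can return an empty list, and the old `command_parts[0]` then raised IndexError. (The `args` conditional is belt-and-braces; `[][1:]` is already `[]`.)

```python
# Why the guard is needed:
command_parts = "   ".split()
print(command_parts)  # [] -- nothing to index

command_name = command_parts[0] if len(command_parts) >= 1 else None
args = command_parts[1:] if len(command_parts) >= 1 else []
print(command_name, args)  # None []
```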
@@ -1966,6 +2018,7 @@ def execute_slash_command(
             command_parts, model=model, provider=provider, npc=npc, api_url=api_url
         )
     elif command_name == "help":  # New help command
+        print(get_help())
         return {
             "messages": messages,
             "output": get_help(),
@@ -1997,6 +2050,17 @@ def execute_slash_command(
         output = execute_rag_command(command, messages=messages)
         messages = output["messages"]
         output = output["output"]
+    elif command_name == "roll":
+
+        output = generate_video(
+            command,
+            model=NPCSH_VIDEO_GEN_MODEL,
+            provider=NPCSH_VIDEO_GEN_PROVIDER,
+            npc=npc,
+            messages=messages,
+        )
+        messages = output["messages"]
+        output = output["output"]
 
     elif command_name == "set":
         parts = command.split()
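The new `/roll` handler mirrors the existing generation paths: the raw command text goes to `generate_video` with the `NPCSH_VIDEO_GEN_MODEL` / `NPCSH_VIDEO_GEN_PROVIDER` defaults, and the returned dict's `messages`/`output` are unpacked the same way as for `/rag` above. A hypothetical invocation (prompt text is illustrative):

```
/roll a slow pan over a foggy mountain lake at dawn
```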
@@ -2042,6 +2106,18 @@ def execute_slash_command(
                 device = part.split("=")[1]
             if part.startswith("rag_similarity_threshold="):
                 rag_similarity_threshold = float(part.split("=")[1])
+            if part.startswith("model="):
+                model = part.split("=")[1]
+
+            if part.startswith("provider="):
+                provider = part.split("=")[1]
+            if part.startswith("api_url="):
+                api_url = part.split("=")[1]
+            if part.startswith("api_key="):
+                api_key = part.split("=")[1]
+
+        # load the npc properly
+
         match = re.search(r"files=\s*\[(.*?)\]", command)
         files = []
         if match:
|
|
|
2055
2131
|
files = None
|
|
2056
2132
|
|
|
2057
2133
|
if len(command_parts) >= 2 and command_parts[1] == "reattach":
|
|
2134
|
+
command_history = CommandHistory()
|
|
2058
2135
|
last_conversation = command_history.get_last_conversation_by_path(
|
|
2059
2136
|
os.getcwd()
|
|
2060
2137
|
)
|
|
2061
2138
|
print(last_conversation)
|
|
2062
2139
|
if last_conversation:
|
|
2063
2140
|
spool_context = [
|
|
2064
|
-
{"role": part[
|
|
2141
|
+
{"role": part["role"], "content": part["content"]}
|
|
2142
|
+
for part in last_conversation
|
|
2065
2143
|
]
|
|
2066
2144
|
|
|
2067
2145
|
print(f"Reattached to previous conversation:\n\n")
|
|
2068
2146
|
output = enter_spool_mode(
|
|
2069
|
-
command_history,
|
|
2070
2147
|
inherit_last,
|
|
2071
2148
|
files=files,
|
|
2072
2149
|
npc=npc,
|
|
2150
|
+
model=model,
|
|
2151
|
+
provider=provider,
|
|
2073
2152
|
rag_similarity_threshold=rag_similarity_threshold,
|
|
2074
2153
|
device=device,
|
|
2075
2154
|
messages=spool_context,
|
|
@@ -2082,7 +2161,6 @@ def execute_slash_command(
|
|
|
2082
2161
|
return {"messages": [], "output": "No previous conversation found."}
|
|
2083
2162
|
|
|
2084
2163
|
output = enter_spool_mode(
|
|
2085
|
-
command_history,
|
|
2086
2164
|
inherit_last,
|
|
2087
2165
|
files=files,
|
|
2088
2166
|
npc=npc,
|
|
@@ -2367,11 +2445,13 @@ def execute_command(
|
|
|
2367
2445
|
valid_npcs = get_db_npcs(db_path)
|
|
2368
2446
|
|
|
2369
2447
|
npc_name = get_npc_from_command(command)
|
|
2448
|
+
|
|
2370
2449
|
if npc_name is None:
|
|
2371
2450
|
npc_name = "sibiji" # Default NPC
|
|
2372
2451
|
npc_path = get_npc_path(npc_name, db_path)
|
|
2373
2452
|
|
|
2374
2453
|
npc = load_npc_from_file(npc_path, db_conn)
|
|
2454
|
+
current_npc = npc
|
|
2375
2455
|
else:
|
|
2376
2456
|
valid_npcs = [current_npc]
|
|
2377
2457
|
npc = current_npc
|
|
@@ -2722,98 +2802,428 @@ def enter_whisper_mode(
     npc: Any = None,
     spool=False,
     continuous=False,
-    stream=
-
-    ""
-
-
-
-
-
-
-
-    """
+    stream=True,
+    tts_model="kokoro",
+    voice="af_heart",  # Default voice,
+) -> Dict[str, Any]:
+    # Initialize state
+    running = True
+    is_recording = False
+    recording_data = []
+    buffer_data = []
+    last_speech_time = 0
 
-
-        model = whisper.load_model("base")
-    except Exception as e:
-        return f"Error: Unable to load Whisper model due to {str(e)}"
+    print("Entering whisper mode. Initializing...")
 
-
-
-
-    else
-
+    # Update the system message to encourage concise responses
+    concise_instruction = "Please provide brief responses of 1-2 sentences unless the user specifically asks for more detailed information. Keep responses clear and concise."
+
+    model = select_model() if npc is None else npc.model or NPCSH_CHAT_MODEL
+    provider = (
+        NPCSH_CHAT_PROVIDER if npc is None else npc.provider or NPCSH_CHAT_PROVIDER
+    )
+    api_url = NPCSH_API_URL if npc is None else npc.api_url or NPCSH_API_URL
+
+    print(f"\nUsing model: {model} with provider: {provider}")
+
+    system_message = get_system_message(npc) if npc else "You are a helpful assistant."
+
+    # Add conciseness instruction to the system message
+    system_message = system_message + " " + concise_instruction
 
     if messages is None:
-        messages = [
+        messages = [{"role": "system", "content": system_message}]
+    elif messages and messages[0]["role"] == "system":
+        # Update the existing system message
+        messages[0]["content"] = messages[0]["content"] + " " + concise_instruction
+    else:
+        messages.insert(0, {"role": "system", "content": system_message})
 
-
-
-
-
+    kokoro_pipeline = None
+    if tts_model == "kokoro":
+        try:
+            from kokoro import KPipeline
+            import soundfile as sf
+
+            kokoro_pipeline = KPipeline(lang_code="a")
+            print("Kokoro TTS model initialized")
+        except ImportError:
+            print("Kokoro not installed, falling back to gTTS")
+            tts_model = "gtts"
+
+    # Initialize PyAudio
+    pyaudio_instance = pyaudio.PyAudio()
+    audio_stream = None  # We'll open and close as needed
+    transcription_queue = queue.Queue()
+
+    # Create and properly use the is_speaking event
+    is_speaking = threading.Event()
+    is_speaking.clear()  # Not speaking initially
+
+    speech_queue = queue.Queue(maxsize=20)
+    speech_thread_active = threading.Event()
+    speech_thread_active.set()
+
+    def speech_playback_thread():
+        nonlocal running, audio_stream
+
+        while running and speech_thread_active.is_set():
+            try:
+                # Get next speech item from queue
+                if not speech_queue.empty():
+                    text_to_speak = speech_queue.get(timeout=0.1)
+
+                    # Only process if there's text to speak
+                    if text_to_speak.strip():
+                        # IMPORTANT: Set is_speaking flag BEFORE starting audio output
+                        is_speaking.set()
+
+                        # Safely close the audio input stream before speaking
+                        current_audio_stream = audio_stream
+                        audio_stream = (
+                            None  # Set to None to prevent capture thread from using it
+                        )
+
+                        if current_audio_stream and current_audio_stream.is_active():
+                            current_audio_stream.stop_stream()
+                            current_audio_stream.close()
+
+                        print(f"Speaking full response...")
+
+                        # Generate and play speech
+                        generate_and_play_speech(text_to_speak)
+
+                        # Delay after speech to prevent echo
+                        time.sleep(0.005 * len(text_to_speak))
+                        print(len(text_to_speak))
+
+                        # Clear the speaking flag to allow listening again
+                        is_speaking.clear()
+                else:
+                    time.sleep(0.5)
+            except Exception as e:
+                print(f"Error in speech thread: {e}")
+                is_speaking.clear()  # Make sure to clear the flag if there's an error
+                time.sleep(0.1)
+
+    def safely_close_audio_stream(stream):
+        """Safely close an audio stream with error handling"""
+        if stream:
+            try:
+                if stream.is_active():
+                    stream.stop_stream()
+                stream.close()
+            except Exception as e:
+                print(f"Error closing audio stream: {e}")
+
+    # Start speech thread
+    speech_thread = threading.Thread(target=speech_playback_thread)
+    speech_thread.daemon = True
+    speech_thread.start()
+
+    def generate_and_play_speech(text):
+        try:
+            # Create a temporary file for audio
+            unique_id = str(time.time()).replace(".", "")
+            temp_dir = tempfile.gettempdir()
+            wav_file = os.path.join(temp_dir, f"temp_{unique_id}.wav")
+
+            # Generate speech based on selected TTS model
+            if tts_model == "kokoro" and kokoro_pipeline:
+                # Use Kokoro for generation
+                generator = kokoro_pipeline(text, voice=voice)
+
+                # Get the audio from the generator
+                for _, _, audio in generator:
+                    # Save audio to WAV file
+                    import soundfile as sf
+
+                    sf.write(wav_file, audio, 24000)
+                    break  # Just use the first chunk for now
+            else:
+                # Fall back to gTTS
+                mp3_file = os.path.join(temp_dir, f"temp_{unique_id}.mp3")
+                tts = gTTS(text=text, lang="en", slow=False)
+                tts.save(mp3_file)
+                convert_mp3_to_wav(mp3_file, wav_file)
+
+            # Play the audio
+            wf = wave.open(wav_file, "rb")
+            p = pyaudio.PyAudio()
+
+            stream = p.open(
+                format=p.get_format_from_width(wf.getsampwidth()),
+                channels=wf.getnchannels(),
+                rate=wf.getframerate(),
+                output=True,
+            )
+
+            data = wf.readframes(4096)
+            while data and running:
+                stream.write(data)
+                data = wf.readframes(4096)
+
+            stream.stop_stream()
+            stream.close()
+            p.terminate()
+
+            # Cleanup temp files
+            try:
+                if os.path.exists(wav_file):
+                    os.remove(wav_file)
+                if tts_model == "gtts" and "mp3_file" in locals():
+                    if os.path.exists(mp3_file):
+                        os.remove(mp3_file)
+            except Exception as e:
+                print(f"Error removing temp file: {e}")
+
+        except Exception as e:
+            print(f"Error in TTS process: {e}")
+
+    # Modified speak_text function that just queues text
+    def speak_text(text):
+        speech_queue.put(text)
+
+    def process_input(user_input):
+        nonlocal messages
+
+        # Add user message
+        messages.append({"role": "user", "content": user_input})
+
+        # Process with LLM and collect the ENTIRE response first
+        try:
+            full_response = ""
+
+            # Use get_stream for streaming response
+            check = check_llm_command(
+                user_input,
+                npc=npc,
+                messages=messages,
+                model=model,
+                provider=provider,
+                stream=True,
+                whisper=True,
+            )
+
+            # Collect the entire response first
+            for chunk in check:
+                if chunk:
+                    chunk_content = "".join(
+                        choice.delta.content
+                        for choice in chunk.choices
+                        if choice.delta.content is not None
+                    )
+
+                    full_response += chunk_content
+
+                    # Show progress in console
+                    print(chunk_content, end="", flush=True)
+
+            print("\n")  # End the progress display
+
+            # Process and speak the entire response at once
+            if full_response.strip():
+                processed_text = process_text_for_tts(full_response)
+                speak_text(processed_text)
+
+            # Add assistant's response to messages
+            messages.append({"role": "assistant", "content": full_response})
+
+        except Exception as e:
+            print(f"Error in LLM response: {e}")
+            speak_text("I'm sorry, there was an error processing your request.")
+
+    # Function to capture and process audio
+    def capture_audio():
+        nonlocal is_recording, recording_data, buffer_data, last_speech_time, running, is_speaking
+        nonlocal audio_stream, transcription_queue
+
+        # Don't try to record if we're speaking
+        if is_speaking.is_set():
+            return False
+
+        try:
+            # Only create a new audio stream if we don't have one
+            if audio_stream is None and not is_speaking.is_set():
+                audio_stream = pyaudio_instance.open(
+                    format=FORMAT,
+                    channels=CHANNELS,
+                    rate=RATE,
+                    input=True,
+                    frames_per_buffer=CHUNK,
+                )
+
+            # Initialize or reset the recording variables
+            is_recording = False
+            recording_data = []
+            buffer_data = []
+
+            print("\nListening for speech...")
+
+            while (
+                running
+                and audio_stream
+                and audio_stream.is_active()
+                and not is_speaking.is_set()
+            ):
+                try:
+                    data = audio_stream.read(CHUNK, exception_on_overflow=False)
+                    if data:
+                        audio_array = np.frombuffer(data, dtype=np.int16)
+                        audio_float = audio_array.astype(np.float32) / 32768.0
+
+                        tensor = torch.from_numpy(audio_float).to(device)
+                        speech_prob = vad_model(tensor, RATE).item()
+                        current_time = time.time()
+
+                        if speech_prob > 0.5:  # VAD threshold
+                            last_speech_time = current_time
+                            if not is_recording:
+                                is_recording = True
+                                print("\nSpeech detected, listening...")
+                                recording_data.extend(buffer_data)
+                                buffer_data = []
+                            recording_data.append(data)
+                        else:
+                            if is_recording:
+                                if (
+                                    current_time - last_speech_time > 1
+                                ):  # silence duration
+                                    is_recording = False
+                                    print("Speech ended, transcribing...")
+
+                                    # Stop stream before transcribing
+                                    safely_close_audio_stream(audio_stream)
+                                    audio_stream = None
+
+                                    # Transcribe in this thread to avoid race conditions
+                                    transcription = transcribe_recording(recording_data)
+                                    if transcription:
+                                        transcription_queue.put(transcription)
+                                    recording_data = []
+                                    return True  # Got speech
+                            else:
+                                buffer_data.append(data)
+                                if len(buffer_data) > int(
+                                    0.65 * RATE / CHUNK
+                                ):  # buffer duration
+                                    buffer_data.pop(0)
+
+                    # Check frequently if we need to stop capturing
+                    if is_speaking.is_set():
+                        safely_close_audio_stream(audio_stream)
+                        audio_stream = None
+                        return False
+
+                except Exception as e:
+                    print(f"Error processing audio frame: {e}")
+                    time.sleep(0.1)
+
+        except Exception as e:
+            print(f"Error in audio capture: {e}")
+
+        # Close stream if we exit without finding speech
+        safely_close_audio_stream(audio_stream)
+        audio_stream = None
+
+        return False
+
+    def process_text_for_tts(text):
+        # Remove special characters that might cause issues in TTS
+        text = re.sub(r"[*<>{}()\[\]&%#@^_=+~]", "", text)
+        text = text.strip()
+        # Add spaces after periods that are followed by words (for better pronunciation)
+        text = re.sub(r"(\w)\.(\w)\.", r"\1 \2 ", text)
+        text = re.sub(r"([.!?])(\w)", r"\1 \2", text)
+        return text
+
+    # Now that functions are defined, play welcome messages
+    speak_text("Entering whisper mode. Please wait.")
 
     try:
-        silence_threshold = calibrate_silence()
-    except Exception as e:
-        return f"Error: Unable to calibrate silence due to {str(e)}"
 
-
-        "Ready. Speak after seeing 'Listening...'. Say 'exit' or type '/wq' to quit."
-    )
-    speak_text("Whisper mode activated. Ready for your input.")
+        while running:
 
-
-
-
-            wf = wave.open(temp_audio.name, "wb")
-            wf.setnchannels(1)
-            wf.setsampwidth(2)
-            wf.setframerate(16000)
-            wf.writeframes(audio_data)
-            wf.close()
-
-            result = model.transcribe(temp_audio.name)
-            text = result["text"].strip()
-            print(f"You said: {text}")
-            os.unlink(temp_audio.name)
-
-            messages.append({"role": "user", "content": text})  # Add user message
-            if text.lower() in ["exit", "/wq"]:
-                whisper_output.append("Exiting whisper mode.")
-                speak_text("Exiting whisper mode. Goodbye!")
-                break
-            if not spool:
-                llm_response = check_llm_command(
-                    text, npc=npc, messages=messages, stream=stream
-                )  # Use
+            # First check for typed input (non-blocking)
+            import select
+            import sys
 
-
-
-
-
-
-
-                    model=model,
-                    provider=provider,
-                    npc=npc,
+            # Don't spam the console with prompts when speaking
+            if not is_speaking.is_set():
+                print(
+                    "\Speak or type your message (or 'exit' to quit): ",
+                    end="",
+                    flush=True,
                 )
+
+            rlist, _, _ = select.select([sys.stdin], [], [], 0.1)
+            if rlist:
+                user_input = sys.stdin.readline().strip()
+                if user_input.lower() in ("exit", "quit", "goodbye"):
+                    print("\nExiting whisper mode.")
+                    break
+                if user_input:
+                    print(f"\nYou (typed): {user_input}")
+                    process_input(user_input)
+                    continue  # Skip audio capture this cycle
+
+            # Then try to capture some audio (if no typed input)
+            if not is_speaking.is_set():  # Only capture if not currently speaking
+                got_speech = capture_audio()
+
+                # If we got speech, process it
+                if got_speech:
+                    try:
+                        transcription = transcription_queue.get_nowait()
+                        print(f"\nYou (spoke): {transcription}")
+                        process_input(transcription)
+                    except queue.Empty:
+                        pass
             else:
-
-
-                    model=model,
-                    provider=provider,
-                    npc=npc,
-                )
+                # If we're speaking, just wait a bit without spamming the console
+                time.sleep(0.1)
 
-
-            print(
-
-
-
-
+    except KeyboardInterrupt:
+        print("\nInterrupted by user.")
+
+    finally:
+        # Set running to False to signal threads to exit
+        running = False
+        speech_thread_active.clear()
+
+        # Clean up audio resources
+        safely_close_audio_stream(audio_stream)
 
-
+        if pyaudio_instance:
+            pyaudio_instance.terminate()
+
+        print("\nExiting whisper mode.")
+        speak_text("Exiting whisper mode. Goodbye!")
+        time.sleep(1)
+        cleanup_temp_files()
+
+        return {"messages": messages, "output": "Whisper mode session ended."}
+
+
+def get_context_string(messages):
+    context = []
+    for message in messages[-5:]:  # Get last 5 messages for context
+        role = message.get("role", "")
+        content = message.get("content", "")
+        context.append(f"{role.capitalize()}: {content}")
+    return "\n".join(context)
+
+
+def input_with_timeout(prompt, timeout=0.1):
+    """Non-blocking input function with a timeout."""
+    import select
+    import sys
+
+    print(prompt, end="", flush=True)
+    rlist, _, _ = select.select([sys.stdin], [], [], timeout)
+    if rlist:
+        return sys.stdin.readline().strip()
+    return None
 
 
 def enter_notes_mode(npc: Any = None) -> None:
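The rewritten `enter_whisper_mode` replaces the old blocking record/transcribe loop with three cooperating pieces: a VAD-gated capture loop, a daemon playback thread draining `speech_queue`, and a `threading.Event` (`is_speaking`) that makes the session half-duplex, so the microphone never records the assistant's own TTS output. A minimal stdlib-only sketch of that coordination (all names illustrative; real capture and TTS are stubbed out):

```python
import queue
import threading
import time
from typing import Optional

speech_queue: "queue.Queue[str]" = queue.Queue(maxsize=20)
is_speaking = threading.Event()
running = True

def playback_thread() -> None:
    while running:
        try:
            text = speech_queue.get(timeout=0.1)
        except queue.Empty:
            continue
        is_speaking.set()              # raise the flag BEFORE audio starts
        time.sleep(0.01 * len(text))   # stand-in for actual TTS playback
        print(f"[spoke] {text}")
        is_speaking.clear()            # safe to listen again

def capture_once() -> Optional[str]:
    if is_speaking.is_set():           # half-duplex: never record over playback
        return None
    return "hello"                     # stand-in for VAD-gated recording

threading.Thread(target=playback_thread, daemon=True).start()
speech_queue.put("Entering whisper mode.")
time.sleep(0.05)
print(capture_once())  # None -- playback still in progress
time.sleep(0.5)
print(capture_once())  # 'hello' -- flag cleared once playback finished
running = False
```

The npcsh version additionally closes the PyAudio input stream before playback and reopens it afterwards, which avoids the device contention the Event alone cannot prevent.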
@@ -3163,6 +3573,7 @@ def enter_spool_mode(
     Dict : The messages and output.
 
     """
+
     command_history = CommandHistory()
     npc_info = f" (NPC: {npc.name})" if npc else ""
     print(f"Entering spool mode{npc_info}. Type '/sq' to exit spool mode.")