npcsh 1.0.13__py3-none-any.whl → 1.0.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- npcsh/_state.py +23 -41
- npcsh/npc.py +124 -98
- npcsh/npcsh.py +124 -77
- npcsh/routes.py +16 -28
- npcsh/yap.py +115 -106
- {npcsh-1.0.13.dist-info → npcsh-1.0.16.dist-info}/METADATA +108 -58
- {npcsh-1.0.13.dist-info → npcsh-1.0.16.dist-info}/RECORD +11 -11
- {npcsh-1.0.13.dist-info → npcsh-1.0.16.dist-info}/WHEEL +0 -0
- {npcsh-1.0.13.dist-info → npcsh-1.0.16.dist-info}/entry_points.txt +0 -0
- {npcsh-1.0.13.dist-info → npcsh-1.0.16.dist-info}/licenses/LICENSE +0 -0
- {npcsh-1.0.13.dist-info → npcsh-1.0.16.dist-info}/top_level.txt +0 -0
npcsh/yap.py
CHANGED
@@ -54,18 +54,20 @@ from npcpy.npc_compiler import (
 from npcpy.memory.command_history import CommandHistory, save_conversation_message,start_new_conversation
 from typing import Dict, Any, List
 def enter_yap_mode(
-
-    model: str ,
-    provider: str ,
-    messages: list = None,
+    messages: list = None,
+    model: str = None,
+    provider: str = None ,
     npc = None,
-    team= None,
+    team = None,
+    stream: bool = False,
+    api_url: str = None,
+    api_key: str=None,
+    conversation_id = None,
     tts_model="kokoro",
     voice="af_heart",
     files: List[str] = None,
     rag_similarity_threshold: float = 0.3,
-
-    conversation_id = None,
+    **kwargs
 ) -> Dict[str, Any]:
     running = True
     is_recording = False
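In 1.0.16 the signature becomes keyword-oriented: `messages` moves to the front, `model` and `provider` gain `None` defaults, new `stream`, `api_url`, `api_key`, and `conversation_id` parameters are accepted, and a trailing `**kwargs` absorbs anything else. A minimal sketch of a call against the new signature (all argument values here are hypothetical, not taken from the package):

```python
# Hypothetical invocation of the 1.0.16 enter_yap_mode signature.
from npcsh.yap import enter_yap_mode

result = enter_yap_mode(
    messages=None,        # seeded with a system message internally
    model="llama3.2",     # example model name (assumption)
    provider="ollama",    # example provider (assumption)
    stream=False,
    files=["notes.txt"],  # optional files for RAG context
)
```

Note that 1.0.13 callers using positional arguments (`enter_yap_mode(model, provider, ...)`) would now bind those values to `messages` and `model`, so this reordering is a breaking change for positional call sites.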
@@ -100,22 +102,20 @@ def enter_yap_mode(
     # Add conciseness instruction to the system message
     system_message = system_message + " " + concise_instruction
 
-    if messages is None:
+    if messages is None or len(messages) == 0:
         messages = [{"role": "system", "content": system_message}]
     elif messages is not None and messages[0]['role'] != 'system':
         messages.insert(0, {"role": "system", "content": system_message})
 
     kokoro_pipeline = None
     if tts_model == "kokoro":
-        … (three removed lines, presumably the old try/import block, not rendered in this diff view)
+        from kokoro import KPipeline
+        import soundfile as sf
+
+        kokoro_pipeline = KPipeline(lang_code="a")
+        print("Kokoro TTS model initialized")
+
 
-            kokoro_pipeline = KPipeline(lang_code="a")
-            print("Kokoro TTS model initialized")
-        except ImportError:
-            print("Kokoro not installed, falling back to gTTS")
-            tts_model = "gtts"
 
     # Initialize PyAudio
     pyaudio_instance = pyaudio.PyAudio()
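1.0.16 now imports `kokoro` unconditionally and drops the old `except ImportError` path, so with the default `tts_model="kokoro"` a machine without kokoro installed raises `ImportError` instead of quietly falling back to gTTS. A caller-side guard could restore the 1.0.13 behavior; this sketch is not part of npcsh:

```python
# Probe for kokoro before entering yap mode, falling back to gTTS the
# way npcsh 1.0.13 did internally. Caller-side workaround, an assumption.
try:
    import kokoro  # noqa: F401  # availability check only
    tts_model = "kokoro"
except ImportError:
    tts_model = "gtts"

# then: enter_yap_mode(..., tts_model=tts_model)
```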
@@ -134,43 +134,45 @@ def enter_yap_mode(
         nonlocal running, audio_stream
 
         while running and speech_thread_active.is_set():
-            try:
-            … (36 removed lines, the previous loop body, not rendered in this diff view)
+            #try:
+            # Get next speech item from queue
+            print('.', end='', flush=True)
+            if not speech_queue.empty():
+                print('\n')
+                text_to_speak = speech_queue.get(timeout=0.1)
+
+                # Only process if there's text to speak
+                if text_to_speak.strip():
+                    # IMPORTANT: Set is_speaking flag BEFORE starting audio output
+                    is_speaking.set()
+
+                    # Safely close the audio input stream before speaking
+                    current_audio_stream = audio_stream
+                    audio_stream = (
+                        None  # Set to None to prevent capture thread from using it
+                    )
+
+                    if current_audio_stream and current_audio_stream.is_active():
+                        current_audio_stream.stop_stream()
+                        current_audio_stream.close()
+
+                    print(f"Speaking full response...")
+                    print(text_to_speak)
+                    # Generate and play speech
+                    generate_and_play_speech(text_to_speak)
+
+                    # Delay after speech to prevent echo
+                    time.sleep(0.005 * len(text_to_speak))
+                    print(len(text_to_speak))
+
+                    # Clear the speaking flag to allow listening again
+                    is_speaking.clear()
+            else:
+                time.sleep(0.5)
+            #except Exception as e:
+            #    print(f"Error in speech thread: {e}")
+            #    is_speaking.clear()  # Make sure to clear the flag if there's an error
+            #    time.sleep(0.1)
 
     def safely_close_audio_stream(stream):
         """Safely close an audio stream with error handling"""
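The rewritten speech thread is a polling consumer: it takes finished responses off `speech_queue`, sets `is_speaking` before tearing down the microphone stream (so the capture thread cannot read while TTS plays), speaks, sleeps about 5 ms per character to let the echo die down, then clears the flag. With the `try`/`except` now commented out, any exception here would kill the thread. A self-contained sketch of the handshake the loop relies on (`speech_queue`, `is_speaking`, and `speech_thread_active` appear in the diff; the scaffolding around them is assumed):

```python
import queue
import threading
import time

speech_queue: "queue.Queue[str]" = queue.Queue()  # response thread -> speech thread
is_speaking = threading.Event()                   # pauses capture while TTS plays
speech_thread_active = threading.Event()
speech_thread_active.set()

def speech_worker() -> None:
    # Consumer side, mirroring the diffed loop's structure.
    while speech_thread_active.is_set():
        if not speech_queue.empty():
            text = speech_queue.get(timeout=0.1)
            is_speaking.set()              # capture must not read now
            time.sleep(0.005 * len(text))  # stand-in for actual playback
            is_speaking.clear()            # listening may resume
        else:
            time.sleep(0.5)

t = threading.Thread(target=speech_worker, daemon=True)
t.start()
speech_queue.put("Here is the assistant's reply.")  # producer hand-off
time.sleep(1)
speech_thread_active.clear()
```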
@@ -315,10 +317,9 @@ def enter_yap_mode(
             frames_per_buffer=CHUNK,
         )
 
-        #
-
-
-        buffer_data = []
+        # Add timeout counter
+        timeout_counter = 0
+        max_timeout = 100  # About 10 seconds at 0.1s intervals
 
        print("\nListening for speech...")
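The new counters give the listener a silence budget: each empty read costs one tick plus a 0.1 s sleep, so `max_timeout = 100` bounds the wait at roughly 100 × 0.1 s = 10 s. A runnable sketch of that bounded-wait pattern (`read_chunk` is a hypothetical stand-in for `audio_stream.read`):

```python
import time

def read_chunk() -> bytes:
    """Hypothetical stand-in for audio_stream.read(); always silent here."""
    return b""

timeout_counter, max_timeout = 0, 100  # 100 ticks * 0.1 s ~= 10 s budget
while timeout_counter < max_timeout:
    data = read_chunk()
    if not data:
        timeout_counter += 1  # empty read: spend one tick
        time.sleep(0.1)
        continue
    timeout_counter = 0       # real audio arrived: reset the budget
print("gave up after ~10 s of silence")
```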
@@ -327,49 +328,63 @@ def enter_yap_mode(
             and audio_stream
             and audio_stream.is_active()
             and not is_speaking.is_set()
+            and timeout_counter < max_timeout
         ):
             try:
+                # Add non-blocking read with timeout
                 data = audio_stream.read(CHUNK, exception_on_overflow=False)
-                … (16 removed lines, the previous read/VAD handling, not rendered in this diff view)
+
+                if not data:
+                    timeout_counter += 1
+                    time.sleep(0.1)
+                    continue
+
+                # Reset timeout on successful read
+                timeout_counter = 0
+
+                audio_array = np.frombuffer(data, dtype=np.int16)
+                if len(audio_array) == 0:
+                    continue
+
+                audio_float = audio_array.astype(np.float32) / 32768.0
+                tensor = torch.from_numpy(audio_float).to(device)
+
+                # Add timeout to VAD processing
+                speech_prob = vad_model(tensor, RATE).item()
+                current_time = time.time()
+
+                if speech_prob > 0.5:  # VAD threshold
+                    last_speech_time = current_time
+                    if not is_recording:
+                        is_recording = True
+                        print("\nSpeech detected, listening...")
+                        recording_data.extend(buffer_data)
+                        buffer_data = []
+                    recording_data.append(data)
+                else:
+                    if is_recording:
+                        if (
+                            current_time - last_speech_time > 1
+                        ):  # silence duration
+                            is_recording = False
+                            print("Speech ended, transcribing...")
+
+                            # Stop stream before transcribing
+                            safely_close_audio_stream(audio_stream)
+                            audio_stream = None
+
+                            # Transcribe in this thread to avoid race conditions
+                            transcription = transcribe_recording(recording_data)
+                            if transcription:
+                                transcription_queue.put(transcription)
+                            recording_data = []
+                            return True  # Got speech
                 else:
-                    … (five removed lines, the previous silence-detection branch, not rendered in this diff view)
-                            print("Speech ended, transcribing...")
-
-                            # Stop stream before transcribing
-                            safely_close_audio_stream(audio_stream)
-                            audio_stream = None
-
-                            # Transcribe in this thread to avoid race conditions
-                            transcription = transcribe_recording(recording_data)
-                            if transcription:
-                                transcription_queue.put(transcription)
-                            recording_data = []
-                            return True  # Got speech
-                else:
-                    buffer_data.append(data)
-                    if len(buffer_data) > int(
-                        0.65 * RATE / CHUNK
-                    ):  # buffer duration
-                        buffer_data.pop(0)
+                    buffer_data.append(data)
+                    if len(buffer_data) > int(
+                        0.65 * RATE / CHUNK
+                    ):  # buffer duration
+                        buffer_data.pop(0)
 
             # Check frequently if we need to stop capturing
             if is_speaking.is_set():
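The capture loop normalizes each int16 chunk to float32 in [-1, 1] and asks the VAD model for a speech probability: 0.5 is the trigger threshold, 1 s of silence ends the utterance, and about 0.65 s of pre-speech audio is kept in a rolling buffer so the start of the first word is not clipped. A sketch of the per-chunk math, assuming a Silero-style VAD callable (the diff does not show how `vad_model`, `RATE`, or `CHUNK` are initialized):

```python
import numpy as np

RATE, CHUNK = 16000, 512  # assumed sample rate and chunk size
data = (np.random.randn(CHUNK) * 1000).astype(np.int16).tobytes()  # fake mic chunk

audio_array = np.frombuffer(data, dtype=np.int16)
audio_float = audio_array.astype(np.float32) / 32768.0  # int16 -> [-1.0, 1.0)

# In the real loop: speech_prob = vad_model(torch.from_numpy(audio_float), RATE).item()
speech_prob = 0.12  # placeholder standing in for the model output

is_speech = speech_prob > 0.5             # same threshold as the diff
buffer_chunks = int(0.65 * RATE / CHUNK)  # ~0.65 s rolling pre-speech buffer
print(is_speech, buffer_chunks)
```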
@@ -427,19 +442,14 @@ def enter_yap_mode(
 
 
     while running:
-
-        # First check for typed input (non-blocking)
         import select
         import sys
-
-        # Don't spam the console with prompts when speaking
         if not is_speaking.is_set():
             print(
                 "🎤🎤🎤🎤\n Speak or type your message (or 'exit' to quit): ",
                 end="",
                 flush=True,
             )
-
         rlist, _, _ = select.select([sys.stdin], [], [], 0.1)
         if rlist:
             user_input = sys.stdin.readline().strip()
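The prompt loop multiplexes typed input and microphone capture by polling stdin with a 0.1 s `select` timeout, so waiting for keystrokes never blocks the audio path. A minimal standalone sketch of that pattern (POSIX-only: `select` on `sys.stdin` does not work for the interactive console on Windows):

```python
import select
import sys

# Non-blocking stdin poll, the same pattern the loop uses.
print("Type something within 2 seconds: ", end="", flush=True)
rlist, _, _ = select.select([sys.stdin], [], [], 2.0)
if rlist:
    line = sys.stdin.readline().strip()
    print(f"got: {line}")
else:
    print("\nno input; carrying on")
```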
@@ -448,7 +458,7 @@ def enter_yap_mode(
                 break
             if user_input:
                 print(f"\nYou (typed): {user_input}")
-
+
                 if loaded_content:
                     context_content = ""
                     for filename, content in loaded_content.items():
@@ -494,9 +504,8 @@ def enter_yap_mode(
 
 
             continue  # Skip audio capture this cycle
-
-        # Then try to capture some audio (if no typed input)
         if not is_speaking.is_set():  # Only capture if not currently speaking
+            print('capturing audio')
             got_speech = capture_audio()
 
         # If we got speech, process it
@@ -560,9 +569,9 @@ def main():
     provider = sibiji.provider
     # Enter spool mode
     enter_yap_mode(
-        model,
-        provider,
         messages=None,
+        model= model,
+        provider = provider,
         npc=sibiji,
         team = team,
         files=args.files,