npcsh 0.3.30__py3-none-any.whl → 0.3.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. npcsh/audio.py +540 -181
  2. npcsh/audio_gen.py +1 -0
  3. npcsh/cli.py +37 -19
  4. npcsh/conversation.py +14 -251
  5. npcsh/dataframes.py +13 -5
  6. npcsh/helpers.py +5 -0
  7. npcsh/image.py +2 -4
  8. npcsh/image_gen.py +38 -38
  9. npcsh/knowledge_graph.py +4 -4
  10. npcsh/llm_funcs.py +517 -349
  11. npcsh/npc_compiler.py +44 -23
  12. npcsh/npc_sysenv.py +5 -0
  13. npcsh/npc_team/npcsh.ctx +8 -2
  14. npcsh/npc_team/tools/generic_search.tool +9 -1
  15. npcsh/plonk.py +2 -2
  16. npcsh/response.py +131 -482
  17. npcsh/search.py +20 -9
  18. npcsh/serve.py +210 -203
  19. npcsh/shell.py +78 -80
  20. npcsh/shell_helpers.py +513 -102
  21. npcsh/stream.py +87 -554
  22. npcsh/video.py +5 -2
  23. npcsh/video_gen.py +69 -0
  24. {npcsh-0.3.30.data → npcsh-0.3.32.data}/data/npcsh/npc_team/generic_search.tool +9 -1
  25. {npcsh-0.3.30.data → npcsh-0.3.32.data}/data/npcsh/npc_team/npcsh.ctx +8 -2
  26. npcsh-0.3.32.dist-info/METADATA +779 -0
  27. npcsh-0.3.32.dist-info/RECORD +78 -0
  28. npcsh-0.3.30.dist-info/METADATA +0 -1862
  29. npcsh-0.3.30.dist-info/RECORD +0 -76
  30. {npcsh-0.3.30.data → npcsh-0.3.32.data}/data/npcsh/npc_team/bash_executer.tool +0 -0
  31. {npcsh-0.3.30.data → npcsh-0.3.32.data}/data/npcsh/npc_team/calculator.tool +0 -0
  32. {npcsh-0.3.30.data → npcsh-0.3.32.data}/data/npcsh/npc_team/celona.npc +0 -0
  33. {npcsh-0.3.30.data → npcsh-0.3.32.data}/data/npcsh/npc_team/code_executor.tool +0 -0
  34. {npcsh-0.3.30.data → npcsh-0.3.32.data}/data/npcsh/npc_team/corca.npc +0 -0
  35. {npcsh-0.3.30.data → npcsh-0.3.32.data}/data/npcsh/npc_team/eriane.npc +0 -0
  36. {npcsh-0.3.30.data → npcsh-0.3.32.data}/data/npcsh/npc_team/foreman.npc +0 -0
  37. {npcsh-0.3.30.data → npcsh-0.3.32.data}/data/npcsh/npc_team/image_generation.tool +0 -0
  38. {npcsh-0.3.30.data → npcsh-0.3.32.data}/data/npcsh/npc_team/lineru.npc +0 -0
  39. {npcsh-0.3.30.data → npcsh-0.3.32.data}/data/npcsh/npc_team/local_search.tool +0 -0
  40. {npcsh-0.3.30.data → npcsh-0.3.32.data}/data/npcsh/npc_team/maurawa.npc +0 -0
  41. {npcsh-0.3.30.data → npcsh-0.3.32.data}/data/npcsh/npc_team/npcsh_executor.tool +0 -0
  42. {npcsh-0.3.30.data → npcsh-0.3.32.data}/data/npcsh/npc_team/raone.npc +0 -0
  43. {npcsh-0.3.30.data → npcsh-0.3.32.data}/data/npcsh/npc_team/screen_cap.tool +0 -0
  44. {npcsh-0.3.30.data → npcsh-0.3.32.data}/data/npcsh/npc_team/sibiji.npc +0 -0
  45. {npcsh-0.3.30.data → npcsh-0.3.32.data}/data/npcsh/npc_team/slean.npc +0 -0
  46. {npcsh-0.3.30.data → npcsh-0.3.32.data}/data/npcsh/npc_team/sql_executor.tool +0 -0
  47. {npcsh-0.3.30.data → npcsh-0.3.32.data}/data/npcsh/npc_team/test_pipeline.py +0 -0
  48. {npcsh-0.3.30.data → npcsh-0.3.32.data}/data/npcsh/npc_team/turnic.npc +0 -0
  49. {npcsh-0.3.30.data → npcsh-0.3.32.data}/data/npcsh/npc_team/welxor.npc +0 -0
  50. {npcsh-0.3.30.dist-info → npcsh-0.3.32.dist-info}/WHEEL +0 -0
  51. {npcsh-0.3.30.dist-info → npcsh-0.3.32.dist-info}/entry_points.txt +0 -0
  52. {npcsh-0.3.30.dist-info → npcsh-0.3.32.dist-info}/licenses/LICENSE +0 -0
  53. {npcsh-0.3.30.dist-info → npcsh-0.3.32.dist-info}/top_level.txt +0 -0
npcsh/shell_helpers.py CHANGED
@@ -1,6 +1,8 @@
 import os
 import pandas as pd
 
+import threading
+
 from typing import Dict, Any, List, Optional, Union
 import numpy as np
 import readline
@@ -25,12 +27,46 @@ import signal
 import platform
 import time
 
+import tempfile
+
+
+# Global variables
+running = True
+is_recording = False
+recording_data = []
+buffer_data = []
+last_speech_time = 0
+
 
 try:
     import whisper
-except:
+    from faster_whisper import WhisperModel
+    from gtts import gTTS
+    import torch
+    import pyaudio
+    import wave
+    import queue
+
+    from npcsh.audio import (
+        cleanup_temp_files,
+        FORMAT,
+        CHANNELS,
+        RATE,
+        device,
+        vad_model,
+        CHUNK,
+        whisper_model,
+        transcribe_recording,
+        convert_mp3_to_wav,
+    )
+
+
+except Exception as e:
     print(
-        "Could not load the whisper package. If you want to use tts/stt features, please run `pip install npcsh[audio]` and follow the instructions in the npcsh github readme to ensure your OS can handle the audio dependencies."
+        "Exception: "
+        + str(e)
+        + "\n"
+        + "Could not load the whisper package. If you want to use tts/stt features, please run `pip install npcsh[audio]` and follow the instructions in the npcsh github readme to ensure your OS can handle the audio dependencies."
     )
 try:
     from sentence_transformers import SentenceTransformer
@@ -40,8 +76,15 @@ except:
         "Could not load the sentence-transformers package. If you want to use it or other local AI features, please run `pip install npcsh[local]` ."
     )
 
-from .load_data import load_pdf, load_csv, load_json, load_excel, load_txt, load_image
-from .npc_sysenv import (
+from npcsh.load_data import (
+    load_pdf,
+    load_csv,
+    load_json,
+    load_excel,
+    load_txt,
+    load_image,
+)
+from npcsh.npc_sysenv import (
     get_model_and_provider,
     get_available_models,
     get_system_message,
@@ -53,16 +96,18 @@ from .npc_sysenv import (
     NPCSH_VISION_PROVIDER,
     NPCSH_IMAGE_GEN_MODEL,
     NPCSH_IMAGE_GEN_PROVIDER,
+    NPCSH_VIDEO_GEN_MODEL,
+    NPCSH_VIDEO_GEN_PROVIDER,
 )
-from .command_history import (
+from npcsh.command_history import (
     CommandHistory,
     save_attachment_to_message,
     save_conversation_message,
     start_new_conversation,
 )
-from .embeddings import search_similar_texts, chroma_client
+from npcsh.embeddings import search_similar_texts, chroma_client
 
-from .llm_funcs import (
+from npcsh.llm_funcs import (
     execute_llm_command,
     execute_llm_question,
     get_stream,
@@ -70,13 +115,14 @@ from .llm_funcs import (
     get_llm_response,
     check_llm_command,
     generate_image,
+    generate_video,
     get_embeddings,
     get_stream,
 )
-from .plonk import plonk, action_space
-from .helpers import get_db_npcs, get_npc_path
+from npcsh.plonk import plonk, action_space
+from npcsh.helpers import get_db_npcs, get_npc_path
 
-from .npc_compiler import (
+from npcsh.npc_compiler import (
     NPCCompiler,
     NPC,
     load_npc_from_file,
@@ -86,10 +132,10 @@ from .npc_compiler import (
 )
 
 
-from .search import rag_search, search_web
-from .image import capture_screenshot, analyze_image
+from npcsh.search import rag_search, search_web
+from npcsh.image import capture_screenshot, analyze_image
 
-from .audio import calibrate_silence, record_audio, speak_text
+# from npcsh.audio import calibrate_silence, record_audio, speak_text
 from rich.console import Console
 from rich.markdown import Markdown
 from rich.syntax import Syntax
@@ -566,11 +612,11 @@ def setup_readline() -> str:
 
     readline.set_history_length(1000)
     readline.parse_and_bind("set enable-bracketed-paste on")  # Enable paste mode
-    readline.parse_and_bind('"\e[A": history-search-backward')
-    readline.parse_and_bind('"\e[B": history-search-forward')
-    readline.parse_and_bind('"\C-r": reverse-search-history')
-    readline.parse_and_bind("\C-e: end-of-line")
-    readline.parse_and_bind("\C-a: beginning-of-line")
+    readline.parse_and_bind(r'"\e[A": history-search-backward')
+    readline.parse_and_bind(r'"\e[B": history-search-forward')
+    readline.parse_and_bind(r'"\C-r": reverse-search-history')
+    readline.parse_and_bind(r'\C-e: end-of-line')
+    readline.parse_and_bind(r'\C-a: beginning-of-line')
 
     return history_file
 
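A note on the binding change above: `\e` and `\C` are not recognized Python escape sequences, so the old plain string literals only worked because CPython passes unknown escapes through while warning (a DeprecationWarning on Python 3.6-3.11, promoted to SyntaxWarning on 3.12+). The `r` prefix hands the backslashes to readline verbatim and silences the warning. A minimal standalone check of that behavior (the `binding` variable is illustrative, not npcsh code):

import warnings

# Plain literal: "\e" is not a valid escape, so compiling it emits a warning
# (DeprecationWarning on Python 3.6-3.11, SyntaxWarning on 3.12+).
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    compile("binding = '\\e[A'", "<demo>", "exec")
print([str(w.message) for w in caught])  # e.g. ["invalid escape sequence '\\e'"]

# Raw string: the backslash is a literal character, so the compile is clean.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    compile("binding = r'\\e[A'", "<demo>", "exec")
print(caught)  # []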
@@ -1631,6 +1677,9 @@ def ots(
         output = analyze_image(
             user_prompt, file_path, filename, npc=npc, model=model, provider=provider
         )
+        messages = [
+            {"role": "user", "content": user_prompt},
+        ]
 
     else:
         output = capture_screenshot(npc=npc)
@@ -1651,12 +1700,15 @@ def ots(
         # messages = output["messages"]
 
         output = output["response"]
-
+        messages = [
+            {"role": "user", "content": user_prompt},
+        ]
     if output:
         if isinstance(output, dict) and "filename" in output:
             message = f"Screenshot captured: {output['filename']}\nFull path: {output['file_path']}\nLLM-ready data available."
         else:  # This handles both LLM responses and error messages (both strings)
             message = output
+        messages.append({"role": "assistant", "content": message})
         return {"messages": messages, "output": message}  # Return the message
     else:  # Handle the case where capture_screenshot returns None
         print("Screenshot capture failed.")
@@ -1782,8 +1834,8 @@ def execute_slash_command(
     log_action("Command Executed", command)
 
     command_parts = command.split()
-    command_name = command_parts[0]
-    args = command_parts[1:]
+    command_name = command_parts[0] if len(command_parts) >= 1 else None
+    args = command_parts[1:] if len(command_parts) >= 1 else []
 
     current_npc = npc
     if command_name in valid_npcs:
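The guard introduced above covers the empty-command case: `str.split()` with no separator returns an empty list for a blank or whitespace-only string, so the old unconditional `command_parts[0]` raised `IndexError`. A small sketch of the failure mode (illustrative values only):

command = "   "        # e.g. a user submits only whitespace
command_parts = command.split()
print(command_parts)   # [] -- split() with no separator drops all whitespace

# Old form: command_parts[0] raises IndexError on an empty list.
# Patched form degrades gracefully instead:
command_name = command_parts[0] if len(command_parts) >= 1 else None
args = command_parts[1:] if len(command_parts) >= 1 else []
print(command_name, args)  # None []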
@@ -1966,6 +2018,7 @@ def execute_slash_command(
             command_parts, model=model, provider=provider, npc=npc, api_url=api_url
         )
     elif command_name == "help":  # New help command
+        print(get_help())
         return {
             "messages": messages,
             "output": get_help(),
@@ -1997,6 +2050,17 @@ def execute_slash_command(
         output = execute_rag_command(command, messages=messages)
         messages = output["messages"]
         output = output["output"]
+    elif command_name == "roll":
+
+        output = generate_video(
+            command,
+            model=NPCSH_VIDEO_GEN_MODEL,
+            provider=NPCSH_VIDEO_GEN_PROVIDER,
+            npc=npc,
+            messages=messages,
+        )
+        messages = output["messages"]
+        output = output["output"]
 
     elif command_name == "set":
         parts = command.split()
@@ -2042,6 +2106,18 @@ def execute_slash_command(
                 device = part.split("=")[1]
             if part.startswith("rag_similarity_threshold="):
                 rag_similarity_threshold = float(part.split("=")[1])
+            if part.startswith("model="):
+                model = part.split("=")[1]
+
+            if part.startswith("provider="):
+                provider = part.split("=")[1]
+            if part.startswith("api_url="):
+                api_url = part.split("=")[1]
+            if part.startswith("api_key="):
+                api_key = part.split("=")[1]
+
+        # load the npc properly
+
         match = re.search(r"files=\s*\[(.*?)\]", command)
         files = []
         if match:
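One caveat with the `part.split("=")[1]` pattern used for these options: it keeps only the segment between the first and second `=`, so values that themselves contain `=` (base64-padded API keys, URLs with query strings) are silently truncated; `split("=", 1)` would preserve them. A quick illustration with a hypothetical value:

part = "api_key=abc=="        # hypothetical key with trailing "==" padding
print(part.split("=")[1])     # "abc"   -- everything after the second "=" is lost
print(part.split("=", 1)[1])  # "abc==" -- maxsplit=1 keeps the full value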
@@ -2055,21 +2131,24 @@ def execute_slash_command(
             files = None
 
         if len(command_parts) >= 2 and command_parts[1] == "reattach":
+            command_history = CommandHistory()
             last_conversation = command_history.get_last_conversation_by_path(
                 os.getcwd()
             )
             print(last_conversation)
             if last_conversation:
                 spool_context = [
-                    {"role": part[2], "content": part[3]} for part in last_conversation
+                    {"role": part["role"], "content": part["content"]}
+                    for part in last_conversation
                 ]
 
                 print(f"Reattached to previous conversation:\n\n")
                 output = enter_spool_mode(
-                    command_history,
                     inherit_last,
                     files=files,
                     npc=npc,
+                    model=model,
+                    provider=provider,
                     rag_similarity_threshold=rag_similarity_threshold,
                     device=device,
                     messages=spool_context,
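The reattach fix above replaces positional row indexing (`part[2]`, `part[3]`) with key access (`part["role"]`, `part["content"]`), which stays correct even if the history table's column order changes. A self-contained sketch of the same idea using `sqlite3.Row`; the table and column names here are hypothetical stand-ins, not npcsh's actual schema:

import sqlite3

conn = sqlite3.connect(":memory:")
conn.row_factory = sqlite3.Row  # rows support both row[2] and row["role"]
conn.execute("CREATE TABLE messages (id INTEGER, ts TEXT, role TEXT, content TEXT)")
conn.execute("INSERT INTO messages VALUES (1, '2025-01-01', 'user', 'hello')")

row = conn.execute("SELECT * FROM messages").fetchone()
# Key access survives schema reordering; row[2]/row[3] would silently break.
spool_context = [{"role": row["role"], "content": row["content"]}]
print(spool_context)  # [{'role': 'user', 'content': 'hello'}]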
@@ -2082,7 +2161,6 @@ def execute_slash_command(
                 return {"messages": [], "output": "No previous conversation found."}
 
         output = enter_spool_mode(
-            command_history,
             inherit_last,
             files=files,
             npc=npc,
@@ -2367,11 +2445,13 @@ def execute_command(
         valid_npcs = get_db_npcs(db_path)
 
         npc_name = get_npc_from_command(command)
+
         if npc_name is None:
             npc_name = "sibiji"  # Default NPC
         npc_path = get_npc_path(npc_name, db_path)
 
         npc = load_npc_from_file(npc_path, db_conn)
+        current_npc = npc
     else:
         valid_npcs = [current_npc]
         npc = current_npc
@@ -2722,98 +2802,428 @@ def enter_whisper_mode(
     npc: Any = None,
     spool=False,
     continuous=False,
-    stream=False,
-) -> str:
-    """
-    Function Description:
-        This function is used to enter the whisper mode.
-    Args:
-    Keyword Args:
-        npc : Any : The NPC object.
-    Returns:
-        str : The output of the whisper mode.
-    """
+    stream=True,
+    tts_model="kokoro",
+    voice="af_heart",  # Default voice,
+) -> Dict[str, Any]:
+    # Initialize state
+    running = True
+    is_recording = False
+    recording_data = []
+    buffer_data = []
+    last_speech_time = 0
 
-    try:
-        model = whisper.load_model("base")
-    except Exception as e:
-        return f"Error: Unable to load Whisper model due to {str(e)}"
+    print("Entering whisper mode. Initializing...")
 
-    whisper_output = []
-    if npc:
-        npc_info = f" (NPC: {npc.name})"
-    else:
-        npc_info = ""
+    # Update the system message to encourage concise responses
+    concise_instruction = "Please provide brief responses of 1-2 sentences unless the user specifically asks for more detailed information. Keep responses clear and concise."
+
+    model = select_model() if npc is None else npc.model or NPCSH_CHAT_MODEL
+    provider = (
+        NPCSH_CHAT_PROVIDER if npc is None else npc.provider or NPCSH_CHAT_PROVIDER
+    )
+    api_url = NPCSH_API_URL if npc is None else npc.api_url or NPCSH_API_URL
+
+    print(f"\nUsing model: {model} with provider: {provider}")
+
+    system_message = get_system_message(npc) if npc else "You are a helpful assistant."
+
+    # Add conciseness instruction to the system message
+    system_message = system_message + " " + concise_instruction
 
     if messages is None:
-        messages = []  # Initialize messages list if not provided
+        messages = [{"role": "system", "content": system_message}]
+    elif messages and messages[0]["role"] == "system":
+        # Update the existing system message
+        messages[0]["content"] = messages[0]["content"] + " " + concise_instruction
+    else:
+        messages.insert(0, {"role": "system", "content": system_message})
 
-    # Begin whisper mode functionality
-    whisper_output.append(
-        f"Entering whisper mode{npc_info}. Calibrating silence level..."
-    )
+    kokoro_pipeline = None
+    if tts_model == "kokoro":
+        try:
+            from kokoro import KPipeline
+            import soundfile as sf
+
+            kokoro_pipeline = KPipeline(lang_code="a")
+            print("Kokoro TTS model initialized")
+        except ImportError:
+            print("Kokoro not installed, falling back to gTTS")
+            tts_model = "gtts"
+
+    # Initialize PyAudio
+    pyaudio_instance = pyaudio.PyAudio()
+    audio_stream = None  # We'll open and close as needed
+    transcription_queue = queue.Queue()
+
+    # Create and properly use the is_speaking event
+    is_speaking = threading.Event()
+    is_speaking.clear()  # Not speaking initially
+
+    speech_queue = queue.Queue(maxsize=20)
+    speech_thread_active = threading.Event()
+    speech_thread_active.set()
+
+    def speech_playback_thread():
+        nonlocal running, audio_stream
+
+        while running and speech_thread_active.is_set():
+            try:
+                # Get next speech item from queue
+                if not speech_queue.empty():
+                    text_to_speak = speech_queue.get(timeout=0.1)
+
+                    # Only process if there's text to speak
+                    if text_to_speak.strip():
+                        # IMPORTANT: Set is_speaking flag BEFORE starting audio output
+                        is_speaking.set()
+
+                        # Safely close the audio input stream before speaking
+                        current_audio_stream = audio_stream
+                        audio_stream = (
+                            None  # Set to None to prevent capture thread from using it
+                        )
+
+                        if current_audio_stream and current_audio_stream.is_active():
+                            current_audio_stream.stop_stream()
+                            current_audio_stream.close()
+
+                        print(f"Speaking full response...")
+
+                        # Generate and play speech
+                        generate_and_play_speech(text_to_speak)
+
+                        # Delay after speech to prevent echo
+                        time.sleep(0.005 * len(text_to_speak))
+                        print(len(text_to_speak))
+
+                        # Clear the speaking flag to allow listening again
+                        is_speaking.clear()
+                else:
+                    time.sleep(0.5)
+            except Exception as e:
+                print(f"Error in speech thread: {e}")
+                is_speaking.clear()  # Make sure to clear the flag if there's an error
+                time.sleep(0.1)
+
+    def safely_close_audio_stream(stream):
+        """Safely close an audio stream with error handling"""
+        if stream:
+            try:
+                if stream.is_active():
+                    stream.stop_stream()
+                stream.close()
+            except Exception as e:
+                print(f"Error closing audio stream: {e}")
+
+    # Start speech thread
+    speech_thread = threading.Thread(target=speech_playback_thread)
+    speech_thread.daemon = True
+    speech_thread.start()
+
+    def generate_and_play_speech(text):
+        try:
+            # Create a temporary file for audio
+            unique_id = str(time.time()).replace(".", "")
+            temp_dir = tempfile.gettempdir()
+            wav_file = os.path.join(temp_dir, f"temp_{unique_id}.wav")
+
+            # Generate speech based on selected TTS model
+            if tts_model == "kokoro" and kokoro_pipeline:
+                # Use Kokoro for generation
+                generator = kokoro_pipeline(text, voice=voice)
+
+                # Get the audio from the generator
+                for _, _, audio in generator:
+                    # Save audio to WAV file
+                    import soundfile as sf
+
+                    sf.write(wav_file, audio, 24000)
+                    break  # Just use the first chunk for now
+            else:
+                # Fall back to gTTS
+                mp3_file = os.path.join(temp_dir, f"temp_{unique_id}.mp3")
+                tts = gTTS(text=text, lang="en", slow=False)
+                tts.save(mp3_file)
+                convert_mp3_to_wav(mp3_file, wav_file)
+
+            # Play the audio
+            wf = wave.open(wav_file, "rb")
+            p = pyaudio.PyAudio()
+
+            stream = p.open(
+                format=p.get_format_from_width(wf.getsampwidth()),
+                channels=wf.getnchannels(),
+                rate=wf.getframerate(),
+                output=True,
+            )
+
+            data = wf.readframes(4096)
+            while data and running:
+                stream.write(data)
+                data = wf.readframes(4096)
+
+            stream.stop_stream()
+            stream.close()
+            p.terminate()
+
+            # Cleanup temp files
+            try:
+                if os.path.exists(wav_file):
+                    os.remove(wav_file)
+                if tts_model == "gtts" and "mp3_file" in locals():
+                    if os.path.exists(mp3_file):
+                        os.remove(mp3_file)
+            except Exception as e:
+                print(f"Error removing temp file: {e}")
+
+        except Exception as e:
+            print(f"Error in TTS process: {e}")
+
+    # Modified speak_text function that just queues text
+    def speak_text(text):
+        speech_queue.put(text)
+
+    def process_input(user_input):
+        nonlocal messages
+
+        # Add user message
+        messages.append({"role": "user", "content": user_input})
+
+        # Process with LLM and collect the ENTIRE response first
+        try:
+            full_response = ""
+
+            # Use get_stream for streaming response
+            check = check_llm_command(
+                user_input,
+                npc=npc,
+                messages=messages,
+                model=model,
+                provider=provider,
+                stream=True,
+                whisper=True,
+            )
+
+            # Collect the entire response first
+            for chunk in check:
+                if chunk:
+                    chunk_content = "".join(
+                        choice.delta.content
+                        for choice in chunk.choices
+                        if choice.delta.content is not None
+                    )
+
+                    full_response += chunk_content
+
+                    # Show progress in console
+                    print(chunk_content, end="", flush=True)
+
+            print("\n")  # End the progress display
+
+            # Process and speak the entire response at once
+            if full_response.strip():
+                processed_text = process_text_for_tts(full_response)
+                speak_text(processed_text)
+
+            # Add assistant's response to messages
+            messages.append({"role": "assistant", "content": full_response})
+
+        except Exception as e:
+            print(f"Error in LLM response: {e}")
+            speak_text("I'm sorry, there was an error processing your request.")
+
+    # Function to capture and process audio
+    def capture_audio():
+        nonlocal is_recording, recording_data, buffer_data, last_speech_time, running, is_speaking
+        nonlocal audio_stream, transcription_queue
+
+        # Don't try to record if we're speaking
+        if is_speaking.is_set():
+            return False
+
+        try:
+            # Only create a new audio stream if we don't have one
+            if audio_stream is None and not is_speaking.is_set():
+                audio_stream = pyaudio_instance.open(
+                    format=FORMAT,
+                    channels=CHANNELS,
+                    rate=RATE,
+                    input=True,
+                    frames_per_buffer=CHUNK,
+                )
+
+            # Initialize or reset the recording variables
+            is_recording = False
+            recording_data = []
+            buffer_data = []
+
+            print("\nListening for speech...")
+
+            while (
+                running
+                and audio_stream
+                and audio_stream.is_active()
+                and not is_speaking.is_set()
+            ):
+                try:
+                    data = audio_stream.read(CHUNK, exception_on_overflow=False)
+                    if data:
+                        audio_array = np.frombuffer(data, dtype=np.int16)
+                        audio_float = audio_array.astype(np.float32) / 32768.0
+
+                        tensor = torch.from_numpy(audio_float).to(device)
+                        speech_prob = vad_model(tensor, RATE).item()
+                        current_time = time.time()
+
+                        if speech_prob > 0.5:  # VAD threshold
+                            last_speech_time = current_time
+                            if not is_recording:
+                                is_recording = True
+                                print("\nSpeech detected, listening...")
+                                recording_data.extend(buffer_data)
+                                buffer_data = []
+                            recording_data.append(data)
+                        else:
+                            if is_recording:
+                                if (
+                                    current_time - last_speech_time > 1
+                                ):  # silence duration
+                                    is_recording = False
+                                    print("Speech ended, transcribing...")
+
+                                    # Stop stream before transcribing
+                                    safely_close_audio_stream(audio_stream)
+                                    audio_stream = None
+
+                                    # Transcribe in this thread to avoid race conditions
+                                    transcription = transcribe_recording(recording_data)
+                                    if transcription:
+                                        transcription_queue.put(transcription)
+                                    recording_data = []
+                                    return True  # Got speech
+                            else:
+                                buffer_data.append(data)
+                                if len(buffer_data) > int(
+                                    0.65 * RATE / CHUNK
+                                ):  # buffer duration
+                                    buffer_data.pop(0)
+
+                    # Check frequently if we need to stop capturing
+                    if is_speaking.is_set():
+                        safely_close_audio_stream(audio_stream)
+                        audio_stream = None
+                        return False
+
+                except Exception as e:
+                    print(f"Error processing audio frame: {e}")
+                    time.sleep(0.1)
+
+        except Exception as e:
+            print(f"Error in audio capture: {e}")
+
+        # Close stream if we exit without finding speech
+        safely_close_audio_stream(audio_stream)
+        audio_stream = None
+
+        return False
+
+    def process_text_for_tts(text):
+        # Remove special characters that might cause issues in TTS
+        text = re.sub(r"[*<>{}()\[\]&%#@^_=+~]", "", text)
+        text = text.strip()
+        # Add spaces after periods that are followed by words (for better pronunciation)
+        text = re.sub(r"(\w)\.(\w)\.", r"\1 \2 ", text)
+        text = re.sub(r"([.!?])(\w)", r"\1 \2", text)
+        return text
+
+    # Now that functions are defined, play welcome messages
+    speak_text("Entering whisper mode. Please wait.")
 
     try:
-        silence_threshold = calibrate_silence()
-    except Exception as e:
-        return f"Error: Unable to calibrate silence due to {str(e)}"
 
-    whisper_output.append(
-        "Ready. Speak after seeing 'Listening...'. Say 'exit' or type '/wq' to quit."
-    )
-    speak_text("Whisper mode activated. Ready for your input.")
+        while running:
 
-    while True:
-        audio_data = record_audio(silence_threshold=silence_threshold)
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
-            wf = wave.open(temp_audio.name, "wb")
-            wf.setnchannels(1)
-            wf.setsampwidth(2)
-            wf.setframerate(16000)
-            wf.writeframes(audio_data)
-            wf.close()
-
-            result = model.transcribe(temp_audio.name)
-            text = result["text"].strip()
-            print(f"You said: {text}")
-        os.unlink(temp_audio.name)
-
-        messages.append({"role": "user", "content": text})  # Add user message
-        if text.lower() in ["exit", "/wq"]:
-            whisper_output.append("Exiting whisper mode.")
-            speak_text("Exiting whisper mode. Goodbye!")
-            break
-        if not spool:
-            llm_response = check_llm_command(
-                text, npc=npc, messages=messages, stream=stream
-            )  # Use
+            # First check for typed input (non-blocking)
+            import select
+            import sys
 
-            messages = llm_response["messages"]
-            output = llm_response["output"]
-        else:
-            if stream:
-                messages = get_stream(
-                    messages,
-                    model=model,
-                    provider=provider,
-                    npc=npc,
+            # Don't spam the console with prompts when speaking
+            if not is_speaking.is_set():
+                print(
+                    "\Speak or type your message (or 'exit' to quit): ",
+                    end="",
+                    flush=True,
                 )
+
+            rlist, _, _ = select.select([sys.stdin], [], [], 0.1)
+            if rlist:
+                user_input = sys.stdin.readline().strip()
+                if user_input.lower() in ("exit", "quit", "goodbye"):
+                    print("\nExiting whisper mode.")
+                    break
+                if user_input:
+                    print(f"\nYou (typed): {user_input}")
+                    process_input(user_input)
+                continue  # Skip audio capture this cycle
+
+            # Then try to capture some audio (if no typed input)
+            if not is_speaking.is_set():  # Only capture if not currently speaking
+                got_speech = capture_audio()
+
+                # If we got speech, process it
+                if got_speech:
+                    try:
+                        transcription = transcription_queue.get_nowait()
+                        print(f"\nYou (spoke): {transcription}")
+                        process_input(transcription)
+                    except queue.Empty:
+                        pass
             else:
-                messages = get_conversation(
-                    messages,
-                    model=model,
-                    provider=provider,
-                    npc=npc,
-                )
+                # If we're speaking, just wait a bit without spamming the console
+                time.sleep(0.1)
 
-            output = messages[-1]["content"]
-            print(output)
-        if not continuous:
-            inp = input("Press Enter to continue or type '/q' to quit: ")
-            if inp.lower() == "/q":
-                break
+    except KeyboardInterrupt:
+        print("\nInterrupted by user.")
+
+    finally:
+        # Set running to False to signal threads to exit
+        running = False
+        speech_thread_active.clear()
+
+        # Clean up audio resources
+        safely_close_audio_stream(audio_stream)
 
-    return messages
+        if pyaudio_instance:
+            pyaudio_instance.terminate()
+
+        print("\nExiting whisper mode.")
+        speak_text("Exiting whisper mode. Goodbye!")
+        time.sleep(1)
+        cleanup_temp_files()
+
+    return {"messages": messages, "output": "Whisper mode session ended."}
+
+
+def get_context_string(messages):
+    context = []
+    for message in messages[-5:]:  # Get last 5 messages for context
+        role = message.get("role", "")
+        content = message.get("content", "")
+        context.append(f"{role.capitalize()}: {content}")
+    return "\n".join(context)
+
+
+def input_with_timeout(prompt, timeout=0.1):
+    """Non-blocking input function with a timeout."""
+    import select
+    import sys
+
+    print(prompt, end="", flush=True)
+    rlist, _, _ = select.select([sys.stdin], [], [], timeout)
+    if rlist:
+        return sys.stdin.readline().strip()
+    return None
 
 
 def enter_notes_mode(npc: Any = None) -> None:
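The rewritten `enter_whisper_mode` above is essentially half-duplex: a `threading.Event` (`is_speaking`) gates microphone capture while TTS plays, so the assistant does not transcribe its own voice, and queues hand transcriptions and speech text between threads. A minimal sketch of that gating pattern with no audio hardware; the producer and consumer here are stand-ins for `capture_audio` and `speech_playback_thread`, not npcsh code:

import queue
import threading
import time

is_speaking = threading.Event()  # set while "TTS" plays; the mic must stay off
speech_queue = queue.Queue()


def playback_worker():
    while True:
        text = speech_queue.get()
        if text is None:  # shutdown sentinel
            speech_queue.task_done()
            break
        is_speaking.set()  # gate capture before any audio goes out
        time.sleep(0.2)  # stand-in for real audio playback
        print(f"[spoke] {text}")
        is_speaking.clear()  # safe to listen again
        speech_queue.task_done()


threading.Thread(target=playback_worker, daemon=True).start()

for heard in ("hello", "goodbye"):
    if not is_speaking.is_set():  # mirrors capture_audio()'s early-return guard
        print(f"[heard] {heard}")
        speech_queue.put(f"you said {heard}")
    speech_queue.join()  # wait out playback before the next listen

speech_queue.put(None)
speech_queue.join()

Here `speech_queue.join()` plays the role of the real loop's `time.sleep(0.1)` polling: the listener stays quiet until playback has fully drained, which is what prevents the echo-transcription problem the `is_speaking` flag exists to solve.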
@@ -3163,6 +3573,7 @@ def enter_spool_mode(
         Dict : The messages and output.
 
     """
+
     command_history = CommandHistory()
     npc_info = f" (NPC: {npc.name})" if npc else ""
     print(f"Entering spool mode{npc_info}. Type '/sq' to exit spool mode.")