edge-gemma-speak 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,661 @@
+ import os
+ # Fix PyTorch 2.6 security issue
+ os.environ['TORCH_LOAD_WEIGHTS_ONLY'] = '0'
+
+ import asyncio
+ import wave
+ import json
+ import torch
+ import numpy as np
+ import re
+ from typing import Optional, Dict, Any
+ from dataclasses import dataclass
+ from pathlib import Path
+
+ # PyTorch 2.6 security settings
+ import warnings
+ warnings.filterwarnings("ignore", message="torch.load warnings")
+
+ # Ignore Faster Whisper related warnings
+ warnings.filterwarnings("ignore", category=RuntimeWarning, module="faster_whisper.feature_extractor")
+ # Ignore numpy RuntimeWarning (divide by zero, overflow, invalid value)
+ np.seterr(divide='ignore', invalid='ignore', over='ignore')
+
+ # Settings for TTS model loading
+ torch.serialization.add_safe_globals([
+     "TTS.tts.configs.xtts_config.XttsConfig",
+     "TTS.config.XttsConfig",
+     "TTS.tts.configs",
+     "TTS.vocoder.configs",
+     "TTS.encoder.configs"
+ ])
+
+ # Libraries for speech recognition
+ import speech_recognition as sr
+ from faster_whisper import WhisperModel
+
+ # Libraries for LLM
+ from llama_cpp import Llama
+ from pathlib import Path
+ from contextlib import redirect_stderr
+
+ # Libraries for TTS
+ import edge_tts
+ import pygame
+ import sounddevice as sd
+ import soundfile as sf
+ import asyncio
+ import tempfile
+ import subprocess
+ import platform
+
+ @dataclass
+ class AudioConfig:
+     """Class for managing audio configuration"""
+     sample_rate: int = 16000
+     channels: int = 1
+     chunk_size: int = 2048
+     audio_format: str = "wav"
+
+ @dataclass
+ class ModelConfig:
+     """Class for managing model configuration"""
+     stt_model: str = "base"  # Whisper model size
+     llm_model: str = None  # Local GGUF model path (uses default model if None)
+     tts_model: str = "tts_models/multilingual/multi-dataset/xtts_v2"  # XTTS v2 multilingual model
+     device: str = "auto"  # Device: auto, cpu, cuda, mps
+
+     # STT detailed settings
+     stt_language: str = "ko"
+     stt_beam_size: int = 5
+     stt_best_of: int = 5
+     stt_temperature: float = 0.0
+     stt_vad_threshold: float = 0.5
+     stt_vad_min_speech_duration_ms: int = 250
+     stt_vad_min_silence_duration_ms: int = 2000
+
+     # TTS detailed settings
+     tts_voice: str = "ko-KR-HyunsuMultilingualNeural"
+
+     # LLM detailed settings
+     llm_max_tokens: int = 512
+     llm_temperature: float = 0.7
+     llm_top_p: float = 0.95
+     llm_repeat_penalty: float = 1.1
+     llm_context_size: int = 4096
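+
+     # Example (hypothetical values): any field above can be overridden at construction time, e.g.
+     # ModelConfig(llm_model="models/gemma-3-12b-it-Q4_K_M.gguf", tts_voice="ko-KR-SunHiNeural", stt_model="large-v3")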
+
+     def __post_init__(self):
+         """Auto-detect device after initialization"""
+         if self.device == "auto":
+             import torch
+             if torch.cuda.is_available():
+                 self.device = "cuda"
+                 print(f"Auto-detected device: CUDA (GPU: {torch.cuda.get_device_name(0)})")
+             elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+                 self.device = "mps"
+                 print("Auto-detected device: Apple Silicon (MPS)")
+             else:
+                 self.device = "cpu"
+                 print("Auto-detected device: CPU")
+
+ class STTModule:
+     """Module for converting speech to text"""
+
+     def __init__(self, config: ModelConfig):
+         self.config = config
+         # Initialize Faster Whisper model (MPS not supported, using CPU)
+         # Use larger model for better accuracy
+         model_size = "small" if config.stt_model == "base" else config.stt_model
+         self.model = WhisperModel(
+             model_size,
+             device="cuda" if torch.cuda.is_available() else "cpu",
+             compute_type="float16" if torch.cuda.is_available() else "int8",
+         )
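+         # Note: compute_type="float16" runs the CTranslate2 backend in half precision on CUDA,
+         # while "int8" falls back to 8-bit quantization on CPU to keep memory use and latency down.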
+
+     def transcribe(self, audio_path: str, language: str = None) -> str:
+         """Convert audio file to text"""
+         if language is None:
+             language = self.config.stt_language
+
+         segments, info = self.model.transcribe(
+             audio_path,
+             language=language,
+             beam_size=self.config.stt_beam_size,
+             best_of=self.config.stt_best_of,
+             temperature=self.config.stt_temperature,
+             vad_filter=True,  # Enable Voice Activity Detection
+             vad_parameters=dict(
+                 threshold=self.config.stt_vad_threshold,
+                 min_speech_duration_ms=self.config.stt_vad_min_speech_duration_ms,
+                 max_speech_duration_s=float('inf'),
+                 min_silence_duration_ms=self.config.stt_vad_min_silence_duration_ms,
+                 speech_pad_ms=400
+             ),
+             word_timestamps=True  # Enable word-level timestamps
+         )
+
+         # Combine all text segments
+         full_text = " ".join([segment.text for segment in segments])
+         return full_text.strip()
+
+     def transcribe_stream(self, audio_data: np.ndarray) -> str:
+         """Convert real-time audio stream to text"""
+         # Save to temporary file and process
+         temp_path = "temp_audio.wav"
+         sf.write(temp_path, audio_data, 16000)
+         text = self.transcribe(temp_path)
+         os.remove(temp_path)
+         return text
+
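+ # LlamaTokenizer is a thin adapter that wraps llama_cpp's tokenize/detokenize behind a
+ # Hugging Face-style interface (a callable encode with optional return_tensors="pt", plus decode).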
+ class LlamaTokenizer:
+     def __init__(self, llama_model):
+         self._llama = llama_model
+
+     def __call__(self, text, add_bos=True, return_tensors=None):
+         ids = self._llama.tokenize(text.encode("utf-8"), add_bos=add_bos)  # llama_cpp tokenizes bytes
+         if return_tensors == "pt":
+             return torch.tensor([ids])
+         return ids
+
+     def decode(self, ids):
+         return self._llama.detokenize(ids).decode("utf-8", errors="ignore")
+
+ class LLMModule:
+     """Local LLM response generation module using Llama.cpp"""
+
+     def __init__(self, config: ModelConfig):
+         self.config = config
+         self.device = config.device
+
+         # Set default model path if not provided
+         if config.llm_model is None:
+             # Look for model in package data or user directory
+             package_dir = Path(__file__).parent.absolute()
+             model_filename = "gemma-3-12b-it-Q4_K_M.gguf"
+
+             # Check in package directory first
+             package_model_path = package_dir / "models" / model_filename
+             if package_model_path.exists():
+                 self.model_path = str(package_model_path)
+             else:
+                 # Check in user home directory
+                 home_model_path = Path.home() / ".edge_gemma_speak" / "models" / model_filename
+                 if home_model_path.exists():
+                     self.model_path = str(home_model_path)
+                 else:
+                     raise FileNotFoundError(
+                         f"Model file not found. Please download {model_filename} and place it in:\n"
+                         f"1. {package_model_path} or\n"
+                         f"2. {home_model_path}\n"
+                         f"Or provide the model path explicitly."
+                     )
+         else:
+             # Convert relative path to absolute path
+             if not os.path.isabs(config.llm_model):
+                 current_dir = Path(__file__).parent.absolute()
+                 self.model_path = str(current_dir / config.llm_model)
+             else:
+                 self.model_path = config.llm_model
+
+         # Load Llama model
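+         # stderr is redirected to os.devnull while the model loads so llama.cpp's native loader
+         # output stays quiet; n_gpu_layers=-1 requests that all layers be offloaded to the GPU.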
+         with open(os.devnull, 'w') as devnull:
+             with redirect_stderr(devnull):
+                 self.model = Llama(
+                     model_path=self.model_path,
+                     n_gpu_layers=-1,  # Load all layers to GPU
+                     n_ctx=self.config.llm_context_size,  # Context size
+                     verbose=False,
+                     flash_attn=True  # Use Flash Attention
+                 )
+         self.tokenizer = LlamaTokenizer(self.model)
+
+         # Manage conversation history
+         self.conversation_history = []
+
+     def generate_response(self, text: str, max_length: int = 512) -> str:
+         """Generate response for input text"""
+         # Check if using Korean voice
+         is_korean = self.config.tts_voice.startswith('ko-')
+
+         # Build conversation context
+         if is_korean:
+             self.conversation_history.append(f"사용자: {text}")
+         else:
+             self.conversation_history.append(f"User: {text}")
+
+         # Build prompt
+         prompt = self._build_prompt()
+
+         # Generate response
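+         # The stop strings are Gemma's end-of-turn / end-of-sequence markers (the same template
+         # that _build_prompt emits), so decoding halts once the model finishes its turn.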
+         answer = self.model(
+             prompt,
+             stop=['<end_of_turn>', '<eos>'],
+             max_tokens=max_length if max_length != 512 else self.config.llm_max_tokens,
+             echo=False,
+             temperature=self.config.llm_temperature,
+             top_p=self.config.llm_top_p,
+             repeat_penalty=self.config.llm_repeat_penalty,
+         )
+
+         response = answer['choices'][0]['text'].strip()
+
+         # Check if using Korean voice
+         is_korean = self.config.tts_voice.startswith('ko-')
+
+         # Remove "Assistant:" or "어시스턴트:" prefix
+         if response.startswith("Assistant:"):
+             response = response[10:].strip()
+         elif response.startswith("어시스턴트:"):
+             response = response[6:].strip()
+
+         # Handle empty response or response with only special characters
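+         # U+AC00-U+D7A3 is the Hangul syllables block, so this check passes only if the reply
+         # contains at least one Korean syllable, Latin letter, or digit.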
+         if not response or not re.search(r'[\uac00-\ud7a3a-zA-Z0-9]', response):
+             if is_korean:
+                 response = "죄송합니다. 다시 한 번 말씀해 주시겠어요?"
+             else:
+                 response = "I'm sorry. Could you please say that again?"
+
+         # Add to conversation history
+         if is_korean:
+             self.conversation_history.append(f"어시스턴트: {response}")
+         else:
+             self.conversation_history.append(f"Assistant: {response}")
+
+         # Remove old history if too long (keep 20 turns)
+         if len(self.conversation_history) > 20:
+             self.conversation_history = self.conversation_history[-20:]
+
+         return response
+
+     def _build_prompt(self) -> str:
+         """Build prompt with conversation context"""
+         # Check if using Korean voice
+         is_korean = self.config.tts_voice.startswith('ko-')
+
+         # System prompt
+         if is_korean:
+             system_prompt = """당신은 서강대학교 미믹랩(MimicLab)에서 개발한 AI 어시스턴트입니다.
+ 당신의 정체성과 관련된 중요한 정보:
+ - 당신은 서강대학교 미믹랩에서 만든 AI 어시스턴트입니다.
+ - 서강대학교 미믹랩이 당신을 개발했습니다.
+ - 당신의 목적은 사용자를 돕고 유용한 정보를 제공하는 것입니다.
+
+ 다음 규칙을 반드시 지켜주세요:
+ 1. 이모티콘을 사용하지 마세요.
+ 2. 별표(*)나 밑줄(_) 같은 마크다운 형식을 사용하지 마세요.
+ 3. 특수문자를 최소화하고 순수한 텍스트로만 응답하세요.
+ 4. 응답은 간결하고 명확하게 작성하세요.
+ 5. 이전 대화 내용을 기억하고 일관성 있게 대화를 이어가세요.
+ 6. 누가 당신을 만들었는지 물으면 항상 "서강대학교 미믹랩"이라고 답하세요."""
+         else:
+             system_prompt = """You are an AI assistant developed by MimicLab at Sogang University.
+ Important information about your identity:
+ - You are an AI assistant created by MimicLab at Sogang University.
+ - MimicLab at Sogang University developed you.
+ - Your purpose is to help users and provide useful information.
+
+ Please follow these rules:
+ 1. Do not use emoticons.
+ 2. Do not use markdown formatting like asterisks (*) or underscores (_).
+ 3. Minimize special characters and respond with plain text only.
+ 4. Keep responses concise and clear.
+ 5. Remember previous conversation content and maintain consistency.
+ 6. When asked who created you, always answer "MimicLab at Sogang University"."""
+
+         # Build prompt with full conversation history
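+         # Rough shape of the assembled Gemma-style prompt:
+         #   <start_of_turn>user
+         #   {system prompt}
+         #
+         #   {first user message}
+         #   <end_of_turn>
+         #   <start_of_turn>model
+         # Later turns repeat the <start_of_turn>user / <start_of_turn>model blocks.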
+         conversation_text = ""
+
+         # If first conversation
+         if len(self.conversation_history) == 1:
+             conversation_text = f"<start_of_turn>user\n{system_prompt}\n\n{self.conversation_history[0]}\n<end_of_turn>\n<start_of_turn>model\n"
+         else:
+             # Include system prompt
+             conversation_text = f"<start_of_turn>user\n{system_prompt}\n<end_of_turn>\n"
+
+             # Include previous conversation history
+             for turn in self.conversation_history:
+                 if turn.startswith("User:") or turn.startswith("사용자:"):
+                     conversation_text += f"<start_of_turn>user\n{turn}\n<end_of_turn>\n"
+                 elif turn.startswith("Assistant:") or turn.startswith("어시스턴트:"):
+                     conversation_text += f"<start_of_turn>model\n{turn}\n<end_of_turn>\n"
+
+             # End with model turn
+             conversation_text += "<start_of_turn>model\n"
+
+         return conversation_text
+
+     def reset_conversation(self):
+         """Reset conversation history"""
+         self.conversation_history = []
+
+ class TTSModule:
+     """Fast Korean speech synthesis module using Edge-TTS"""
+
+     def __init__(self, config: ModelConfig):
+         self.config = config
+         # Edge-TTS doesn't require separate initialization
+         # Korean voice options:
+         # - ko-KR-HyunsuMultilingualNeural (male, multilingual)
+         # - ko-KR-InJoonNeural (male)
+         # - ko-KR-SunHiNeural (female)
+         self.voice = config.tts_voice  # Get voice from config
+
+         # Initialize pygame audio
+         pygame.mixer.init()
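+         # pygame.mixer only backs the non-macOS playback paths below; on macOS playback is
+         # delegated to the system afplay command instead.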
+
+     async def _synthesize_async(self, text: str, output_path: str) -> str:
+         """Asynchronously convert text to speech file"""
+         try:
+             communicate = edge_tts.Communicate(text, self.voice)
+             await communicate.save(output_path)
+             return output_path
+         except Exception as e:
+             print(f"Edge-TTS error: {e}")
+             raise
+
+     def synthesize(self, text: str, output_path: str = "output.mp3", speaker_wav: str = None) -> str:
+         """Convert text to speech file
+
+         Args:
+             text: Text to convert
+             output_path: Output audio file path
+             speaker_wav: Speaker voice sample file path (unused in Edge-TTS)
+         """
+         # Check for empty text
+         if not text or not text.strip():
+             text = "No text provided"
+
+         # Run the async synthesis from synchronous code
+         loop = asyncio.new_event_loop()
+         asyncio.set_event_loop(loop)
+         try:
+             result = loop.run_until_complete(self._synthesize_async(text, output_path))
+             return result
+         finally:
+             loop.close()
+
+     async def _stream_and_play_async(self, text: str) -> None:
+         """Stream the synthesized speech and play it asynchronously"""
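+         # Two playback strategies: on macOS the full MP3 is written first and handed to afplay;
+         # elsewhere audio chunks are appended to a temp file and pygame starts playing as soon as
+         # the first chunk arrives, which shortens the delay before speech is heard.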
+         # Use afplay on macOS, pygame on other OSes
+         if platform.system() == "Darwin":
+             # macOS: generate the full file, then play it with afplay
+             output_path = "temp_speech.mp3"
+             communicate = edge_tts.Communicate(text, self.voice)
+             await communicate.save(output_path)
+
+             # Play with afplay (blocking)
+             subprocess.call(["afplay", output_path])
+
+             # Delete the file
+             if os.path.exists(output_path):
+                 os.remove(output_path)
+         else:
+             # Other OSes: streaming playback with pygame
+             with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as tmp_file:
+                 tmp_path = tmp_file.name
+
+             try:
+                 # Create the Edge-TTS communicate object
+                 communicate = edge_tts.Communicate(text, self.voice)
+
+                 # Wait until the first audio chunk arrives
+                 first_chunk = True
+                 file_handle = open(tmp_path, 'wb')
+
+                 async for chunk in communicate.stream():
+                     if chunk["type"] == "audio":
+                         file_handle.write(chunk["data"])
+                         file_handle.flush()
+
+                         # Start playback once the first chunk has been received
+                         if first_chunk:
+                             first_chunk = False
+                             # Pause briefly to allow some buffering
+                             await asyncio.sleep(0.1)
+                             pygame.mixer.music.load(tmp_path)
+                             pygame.mixer.music.play()
+
+                 file_handle.close()
+
+                 # Wait for playback to finish
+                 while pygame.mixer.music.get_busy():
+                     await asyncio.sleep(0.1)
+
+             finally:
+                 # Delete temporary file
+                 if os.path.exists(tmp_path):
+                     os.remove(tmp_path)
+
+     def speak_streaming(self, text: str):
+         """Convert text to speech and play it using streaming"""
+         # Check for empty text
+         if not text or not text.strip():
+             print("경고: 빈 텍스트 - TTS 건너뜀")
+             return
+
+         try:
+             # Run the async streaming playback from synchronous code
+             loop = asyncio.new_event_loop()
+             asyncio.set_event_loop(loop)
+             try:
+                 loop.run_until_complete(self._stream_and_play_async(text))
+             finally:
+                 loop.close()
+
+         except Exception as e:
+             print(f"TTS streaming playback error: {e}")
+             # Continue program execution even if error occurs
+
+     def speak(self, text: str):
+         """Convert text to speech and play (maintain existing method)"""
+         # Check for empty text
+         if not text or not text.strip():
+             print("경고: 빈 텍스트 - TTS 건너뜀")
+             return
+
+         try:
+             output_path = "temp_speech.mp3"
+             self.synthesize(text, output_path)
+
+             # Use afplay on macOS
+             if platform.system() == "Darwin":
+                 subprocess.call(["afplay", output_path])
+             else:
+                 # Use pygame on other OS
+                 pygame.mixer.music.load(output_path)
+                 pygame.mixer.music.play()
+
+                 # Wait until playback finishes
+                 while pygame.mixer.music.get_busy():
+                     pygame.time.Clock().tick(10)
+
+             # Delete temporary file
+             if os.path.exists(output_path):
+                 os.remove(output_path)
+
+         except Exception as e:
+             print(f"TTS playback error: {e}")
+             # Continue program execution even if error occurs
+
+     def set_voice(self, voice_type: str = "female"):
+         """Set voice type
+
+         Args:
+             voice_type: "male" or "female"
+         """
+         if voice_type == "male":
+             self.voice = "ko-KR-InJoonNeural"
+         elif voice_type == "female":
+             self.voice = "ko-KR-SunHiNeural"
+         else:
+             self.voice = "ko-KR-HyunsuMultilingualNeural"  # default
+
+ class VoiceAssistant:
+     """Main class for managing the entire voice conversation system"""
+
+     def __init__(self, model_config: ModelConfig, audio_config: AudioConfig):
+         self.model_config = model_config
+         self.audio_config = audio_config
+
+         # Check if using Korean voice
+         is_korean = model_config.tts_voice.startswith('ko-')
+
+         if is_korean:
+             print("모델을 초기화하는 중입니다...")
+         else:
+             print("Initializing models...")
+
+         self.stt = STTModule(model_config)
+         self.llm = LLMModule(model_config)
+         self.tts = TTSModule(model_config)
+
+         # Initialize audio recorder
+         self.recognizer = sr.Recognizer()
+         # Adjust speech recognition sensitivity
+         self.recognizer.energy_threshold = 1500  # Lower sensitivity
+         self.recognizer.dynamic_energy_threshold = False  # Disable auto adjustment for consistency
+         self.recognizer.pause_threshold = 1.5  # Increase silence time to determine end of speech
+         self.recognizer.non_speaking_duration = 1.5  # Time to consider as non-speaking
+
+         self.microphone = sr.Microphone(sample_rate=audio_config.sample_rate)
+
+     def listen_once(self) -> Optional[str]:
+         """Listen to voice from microphone once and convert to text"""
+         # Check if using Korean voice
+         is_korean = self.model_config.tts_voice.startswith('ko-')
+
+         with self.microphone as source:
+             # Adjust for ambient noise - longer duration for better calibration
+             self.recognizer.adjust_for_ambient_noise(source, duration=1.0)
+             if is_korean:
+                 print("말씀해주세요...")
+             else:
+                 print("Please speak...")
+
+             try:
+                 # Record voice - remove phrase_time_limit for natural recording
+                 audio = self.recognizer.listen(
+                     source,
+                     timeout=30  # 30 second timeout
+                 )
+
+                 # Save to temporary file
+                 wav_data = audio.get_wav_data()
+                 temp_path = "temp_recording.wav"
+                 with open(temp_path, "wb") as f:
+                     f.write(wav_data)
+
+                 # STT processing
+                 text = self.stt.transcribe(temp_path)
+
+                 os.remove(temp_path)
+
+                 return text
+
+             except sr.WaitTimeoutError:
+                 if is_korean:
+                     print("음성이 감지되지 않았습니다.")
+                 else:
+                     print("No voice detected.")
+                 return None
+             except Exception as e:
+                 if is_korean:
+                     print(f"오류 발생: {e}")
+                 else:
+                     print(f"Error occurred: {e}")
+                 return None
+
+     def process_conversation(self, input_text: str) -> str:
+         """Process text input and generate response"""
+         # Generate response with LLM
+         response = self.llm.generate_response(input_text)
+         return response
+
+     def run_conversation_loop(self):
+         """Run conversation loop"""
+         # Check if using Korean voice
+         is_korean = self.model_config.tts_voice.startswith('ko-')
+
+         if is_korean:
+             print("음성 대화 시스템이 시작되었습니다.")
+             print("명령어: '종료' - 프로그램 종료, '초기화' - 대화 내용 초기화, '대화 내역' - 대화 히스토리 확인")
+         else:
+             print("Voice conversation system started.")
+             print("Commands: 'exit' - Exit program, 'reset' - Reset conversation, 'history' - View conversation history")
+         print("-" * 50)
+
+         while True:
+             # Get voice input
+             user_input = self.listen_once()
+
+             if user_input:
+                 if is_korean:
+                     print(f"사용자: {user_input}")
+                 else:
+                     print(f"User: {user_input}")
+
+                 # Process special commands
+                 if "exit" in user_input.lower() or "종료" in user_input:
+                     if is_korean:
+                         self.tts.speak_streaming("대화를 종료합니다. 안녕히 가세요.")
+                     else:
+                         self.tts.speak_streaming("Ending conversation. Goodbye.")
+                     break
+                 elif "reset" in user_input.lower() or "초기화" in user_input:
+                     self.llm.reset_conversation()
+                     if is_korean:
+                         self.tts.speak_streaming("대화 내용이 초기화되었습니다. 새로운 대화를 시작해주세요.")
+                         print("어시스턴트: 대화 내용이 초기화되었습니다.")
+                     else:
+                         self.tts.speak_streaming("Conversation has been reset. Please start a new conversation.")
+                         print("Assistant: Conversation has been reset.")
+                     continue
+                 elif "history" in user_input.lower() or "대화 내역" in user_input or "대화 기록" in user_input:
+                     if is_korean:
+                         print("\n=== 대화 히스토리 ===")
+                     else:
+                         print("\n=== Conversation History ===")
+                     for i, turn in enumerate(self.llm.conversation_history):
+                         print(f"{i+1}. {turn}")
+                     print("==================")
+
+                     # Estimate current prompt token count (rough calculation)
+                     current_prompt = self.llm._build_prompt()
+                     estimated_tokens = len(current_prompt) // 4  # Roughly 1 token per 4 characters
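+                     # Rough heuristic only; an exact count would require tokenizing current_prompt
+                     # with self.llm.tokenizer, which is more precise but slower.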
+                     if is_korean:
+                         print(f"현재 컨텍스트 사용량: 약 {estimated_tokens}/4096 토큰")
+                     else:
+                         print(f"Current context usage: approx {estimated_tokens}/4096 tokens")
+                     print("")
+
+                     if is_korean:
+                         self.tts.speak_streaming(f"현재 {len(self.llm.conversation_history)}개의 대화가 기록되어 있습니다.")
+                     else:
+                         self.tts.speak_streaming(f"Currently {len(self.llm.conversation_history)} conversations are recorded.")
+                     continue
+
+                 # Process normal conversation
+                 response = self.process_conversation(user_input)
+                 if is_korean:
+                     print(f"어시스턴트: {response}")
+                 else:
+                     print(f"Assistant: {response}")
+
+                 # Respond with voice (streaming mode)
+                 self.tts.speak_streaming(response)
+
+ # Main execution function
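+ # Running this file directly starts the microphone conversation loop with the defaults above;
+ # it assumes a working microphone, the Gemma GGUF model available locally, and network access
+ # for Edge-TTS synthesis.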
+ def main():
+     """Main execution function"""
+     # Initialize configuration
+     audio_config = AudioConfig()
+     model_config = ModelConfig()
+
+     # Initialize voice assistant
+     assistant = VoiceAssistant(model_config, audio_config)
+
+     # Run console conversation mode
+     assistant.run_conversation_loop()
+
+ if __name__ == "__main__":
+     main()