anyrobo 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
anyrobo-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Viet-Anh Nguyen
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
anyrobo-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,179 @@
1
+ Metadata-Version: 2.4
2
+ Name: anyrobo
3
+ Version: 0.1.0
4
+ Summary: Create your own JARVIS or GLADOS: a framework for voice-powered AI assistants with unique personalities
5
+ Author-email: Viet-Anh Nguyen <vietanh.dev@gmail.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/vietanhdev/anyrobo
8
+ Project-URL: Bug Tracker, https://github.com/vietanhdev/anyrobo/issues
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Programming Language :: Python :: 3.10
11
+ Classifier: Programming Language :: Python :: 3.11
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Operating System :: MacOS
15
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
+ Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
17
+ Requires-Python: >=3.10
18
+ Description-Content-Type: text/markdown
19
+ License-File: LICENSE
20
+ Requires-Dist: numpy<3.0.0,>=2.0.0
21
+ Requires-Dist: onnxruntime<2.0.0,>=1.16.0
22
+ Requires-Dist: kokoro-onnx>=0.2.0
23
+ Requires-Dist: lightning-whisper-mlx<0.1.0,>=0.0.10
24
+ Requires-Dist: sounddevice<0.6.0,>=0.4.6
25
+ Requires-Dist: ollama<0.5.0,>=0.4.0
26
+ Dynamic: license-file
27
+
28
+ # AnyRobo - Create Your Own Robo Assistant
29
+
30
+ AnyRobo is an advanced speech-to-speech AI assistant framework that enables you to create your own real-life version of sci-fi AI assistants like JARVIS (from Iron Man) or GLADOS (from Portal). Powered by state-of-the-art machine learning models, AnyRobo listens to your voice, understands your requests, and responds with natural-sounding speech in real-time.
31
+
32
+ ![AnyRobo](https://img.shields.io/badge/AnyRobo-0.1.0-blue)
33
+ ![Python](https://img.shields.io/badge/Python-3.10+-green)
34
+ ![License](https://img.shields.io/badge/License-MIT-yellow)
35
+
36
+ ## Why AnyRobo?
37
+
38
+ Have you ever wanted to create your own JARVIS or GLADOS? AnyRobo provides a fully modular framework that allows you to:
39
+
40
+ - Create voice-powered AI assistants with unique personalities
41
+ - Customize voice profiles to sound like your favorite AI characters
42
+ - Build advanced conversation capabilities with state-of-the-art language models
43
+ - Deploy your assistant on macOS with optimized performance for Apple Silicon
44
+
45
+ ## Core Technologies
46
+
47
+ - **Speech Recognition**: [Whisper MLX](https://github.com/ml-explore/mlx-examples) - Optimized for Apple Silicon
48
+ - **Language Understanding**: [Llama 3.2](https://ollama.com/library/llama3.2) - Advanced language model for contextual responses
49
+ - **Voice Synthesis**: [Kokoro-82M](https://github.com/thewh1teagle/kokoro-onnx) - High-quality text-to-speech
50
+
51
+ ## Features
52
+
53
+ - **Continuous Listening**: Automatically detects when you've finished speaking
54
+ - **Natural Conversations**: Responds intelligently to a wide range of queries and commands
55
+ - **Real-time Synthesis**: Generates human-like speech with minimal latency
56
+ - **Voice Customization**: Supports multiple voice profiles
57
+ - **Streaming Responses**: Begins speaking before the full response is generated
58
+ - **Optimized Performance**: Designed for efficiency on Apple Silicon
59
+
60
+ ## Installation
61
+
62
+ ### Quick Install (from PyPI)
63
+
64
+ ```bash
65
+ pip install anyrobo
66
+ ```
67
+
68
+ ### Install from Source
69
+
70
+ ```bash
71
+ git clone https://github.com/vietanhdev/anyrobo.git
72
+ cd anyrobo
73
+ pip install -e .
74
+ ```
75
+
76
+ ### Setup Dependencies
77
+
78
+ AnyRobo requires [Ollama](https://ollama.com/) for LLM support:
79
+
80
+ ```bash
81
+ # Install Ollama
82
+ curl -fsSL https://ollama.com/install.sh | sh
83
+ # Pull the required model
84
+ ollama pull llama3.2
85
+ ```
86
+
87
+ ## Usage
88
+
89
+ ### Command-line Interface
90
+
91
+ ```bash
92
+ # Download required models and start the assistant
93
+ anyrobo --setup
94
+ anyrobo
95
+ ```
96
+
97
+ With custom settings:
98
+
99
+ ```bash
100
+ anyrobo --voice am_michael --speed 1.3 --silence-threshold 0.03
101
+ ```
102
+
103
+ ### As a Library
104
+
105
+ ```python
106
+ from anyrobo import AnyRobo
107
+ from anyrobo.models.loader import download_tts_model, ensure_ollama_model
108
+
109
+ # Download required models
110
+ download_tts_model()
111
+ ensure_ollama_model("llama3.2")
112
+
113
+ # Create and run assistant
114
+ assistant = AnyRobo(
115
+ voice="am_michael",
116
+ speed=1.2,
117
+ system_prompt=(
118
+ "You are J.A.R.V.I.S., an advanced AI assistant. "
119
+ "Respond with a mix of helpfulness, light sarcasm, and technical prowess."
120
+ )
121
+ )
122
+
123
+ # Start listening and responding
124
+ assistant.record_and_transcribe()
125
+ ```
126
+
127
+ ## Create Your Own AI Character
128
+
129
+ You can customize the personality of your assistant by modifying the system prompt:
130
+
131
+ ```python
132
+ # JARVIS from Iron Man
133
+ system_prompt = (
134
+ "You are J.A.R.V.I.S., an advanced AI assistant. "
135
+ "Respond with a mix of helpfulness, light sarcasm, and technical prowess."
136
+ )
137
+
138
+ # GLADOS from Portal
139
+ system_prompt = (
140
+ "You are GLaDOS, an AI with a dark sense of humor. "
141
+ "Respond to queries sarcastically, occasionally mentioning cake or testing."
142
+ )
143
+
144
+ # HAL 9000 from 2001: A Space Odyssey
145
+ system_prompt = (
146
+ "You are HAL 9000. Be calm, logical, and slightly ominous in your responses. "
147
+ "Speak in a slow, deliberate manner and be excessively literal."
148
+ )
149
+ ```
150
+
151
+ ## Configuration Options
152
+
153
+ | Option | Description | Default |
154
+ |--------|-------------|---------|
155
+ | `voice` | Voice profile to use | `"am_michael"` |
156
+ | `speed` | Speed factor for speech | `1.2` |
157
+ | `silence_threshold` | Volume level that counts as silence | `0.02` |
158
+ | `silence_duration` | Seconds of silence before cutting recording | `1.5` |
159
+ | `sample_rate` | Audio sample rate in Hz | `24000` |
160
+ | `system_prompt` | Custom system prompt for the LLM | *See code* |
161
+
162
+ ## Troubleshooting
163
+
164
+ - **No audio output**: Ensure your system audio output is correctly configured
165
+ - **Poor recognition**: Try speaking more clearly or adjust the `silence_threshold` value
166
+ - **Model loading issues**: Run `anyrobo --setup` to download all required models
167
+
168
+ ## License
169
+
170
+ This project is licensed under the MIT License - see the LICENSE file for details.
171
+
172
+ ## Contributing
173
+
174
+ Contributions are welcome! Please feel free to submit a Pull Request.
175
+
176
+ ## Acknowledgements
177
+
178
+ AnyRobo is built on top of several open-source projects and pre-trained models. We're grateful to the developers and researchers who make their work available to the community.
179
+
@@ -0,0 +1,152 @@
1
+ # AnyRobo - Create Your Own Robo Assistant
2
+
3
+ AnyRobo is an advanced speech-to-speech AI assistant framework that enables you to create your own real-life version of sci-fi AI assistants like JARVIS (from Iron Man) or GLADOS (from Portal). Powered by state-of-the-art machine learning models, AnyRobo listens to your voice, understands your requests, and responds with natural-sounding speech in real-time.
4
+
5
+ ![AnyRobo](https://img.shields.io/badge/AnyRobo-0.1.0-blue)
6
+ ![Python](https://img.shields.io/badge/Python-3.10+-green)
7
+ ![License](https://img.shields.io/badge/License-MIT-yellow)
8
+
9
+ ## Why AnyRobo?
10
+
11
+ Have you ever wanted to create your own JARVIS or GLADOS? AnyRobo provides a fully modular framework that allows you to:
12
+
13
+ - Create voice-powered AI assistants with unique personalities
14
+ - Customize voice profiles to sound like your favorite AI characters
15
+ - Build advanced conversation capabilities with state-of-the-art language models
16
+ - Deploy your assistant on macOS with optimized performance for Apple Silicon
17
+
18
+ ## Core Technologies
19
+
20
+ - **Speech Recognition**: [Whisper MLX](https://github.com/ml-explore/mlx-examples) - Optimized for Apple Silicon
21
+ - **Language Understanding**: [Llama 3.2](https://ollama.com/library/llama3.2) - Advanced language model for contextual responses
22
+ - **Voice Synthesis**: [Kokoro-82M](https://github.com/thewh1teagle/kokoro-onnx) - High-quality text-to-speech
23
+
24
+ ## Features
25
+
26
+ - **Continuous Listening**: Automatically detects when you've finished speaking
27
+ - **Natural Conversations**: Responds intelligently to a wide range of queries and commands
28
+ - **Real-time Synthesis**: Generates human-like speech with minimal latency
29
+ - **Voice Customization**: Supports multiple voice profiles
30
+ - **Streaming Responses**: Begins speaking before the full response is generated
31
+ - **Optimized Performance**: Designed for efficiency on Apple Silicon
32
+
33
+ ## Installation
34
+
35
+ ### Quick Install (from PyPI)
36
+
37
+ ```bash
38
+ pip install anyrobo
39
+ ```
40
+
41
+ ### Install from Source
42
+
43
+ ```bash
44
+ git clone https://github.com/vietanhdev/anyrobo.git
45
+ cd anyrobo
46
+ pip install -e .
47
+ ```
48
+
49
+ ### Setup Dependencies
50
+
51
+ AnyRobo requires [Ollama](https://ollama.com/) for LLM support:
52
+
53
+ ```bash
54
+ # Install Ollama
55
+ curl -fsSL https://ollama.com/install.sh | sh
56
+ # Pull the required model
57
+ ollama pull llama3.2
58
+ ```
59
+
60
+ ## Usage
61
+
62
+ ### Command-line Interface
63
+
64
+ ```bash
65
+ # Download required models and start the assistant
66
+ anyrobo --setup
67
+ anyrobo
68
+ ```
69
+
70
+ With custom settings:
71
+
72
+ ```bash
73
+ anyrobo --voice am_michael --speed 1.3 --silence-threshold 0.03
74
+ ```
75
+
76
+ ### As a Library
77
+
78
+ ```python
79
+ from anyrobo import AnyRobo
80
+ from anyrobo.models.loader import download_tts_model, ensure_ollama_model
81
+
82
+ # Download required models
83
+ download_tts_model()
84
+ ensure_ollama_model("llama3.2")
85
+
86
+ # Create and run assistant
87
+ assistant = AnyRobo(
88
+ voice="am_michael",
89
+ speed=1.2,
90
+ system_prompt=(
91
+ "You are J.A.R.V.I.S., an advanced AI assistant. "
92
+ "Respond with a mix of helpfulness, light sarcasm, and technical prowess."
93
+ )
94
+ )
95
+
96
+ # Start listening and responding
97
+ assistant.record_and_transcribe()
98
+ ```
99
+
100
+ ## Create Your Own AI Character
101
+
102
+ You can customize the personality of your assistant by modifying the system prompt:
103
+
104
+ ```python
105
+ # JARVIS from Iron Man
106
+ system_prompt = (
107
+ "You are J.A.R.V.I.S., an advanced AI assistant. "
108
+ "Respond with a mix of helpfulness, light sarcasm, and technical prowess."
109
+ )
110
+
111
+ # GLADOS from Portal
112
+ system_prompt = (
113
+ "You are GLaDOS, an AI with a dark sense of humor. "
114
+ "Respond to queries sarcastically, occasionally mentioning cake or testing."
115
+ )
116
+
117
+ # HAL 9000 from 2001: A Space Odyssey
118
+ system_prompt = (
119
+ "You are HAL 9000. Be calm, logical, and slightly ominous in your responses. "
120
+ "Speak in a slow, deliberate manner and be excessively literal."
121
+ )
122
+ ```
123
+
124
+ ## Configuration Options
125
+
126
+ | Option | Description | Default |
127
+ |--------|-------------|---------|
128
+ | `voice` | Voice profile to use | `"am_michael"` |
129
+ | `speed` | Speed factor for speech | `1.2` |
130
+ | `silence_threshold` | Volume level that counts as silence | `0.02` |
131
+ | `silence_duration` | Seconds of silence before cutting recording | `1.5` |
132
+ | `sample_rate` | Audio sample rate in Hz | `24000` |
133
+ | `system_prompt` | Custom system prompt for the LLM | *See code* |
134
+
135
+ ## Troubleshooting
136
+
137
+ - **No audio output**: Ensure your system audio output is correctly configured
138
+ - **Poor recognition**: Try speaking more clearly or adjust the `silence_threshold` value
139
+ - **Model loading issues**: Run `anyrobo --setup` to download all required models
140
+
141
+ ## License
142
+
143
+ This project is licensed under the MIT License - see the LICENSE file for details.
144
+
145
+ ## Contributing
146
+
147
+ Contributions are welcome! Please feel free to submit a Pull Request.
148
+
149
+ ## Acknowledgements
150
+
151
+ AnyRobo is built on top of several open-source projects and pre-trained models. We're grateful to the developers and researchers who make their work available to the community.
152
+
@@ -0,0 +1,7 @@
1
"""AnyRobo - Your Voice-Powered AI Assistant"""

# Package version; keep in sync with pyproject.toml ([project] version).
__version__ = "0.1.0"

from anyrobo.assistant import AnyRobo

# Explicit public API: only the main assistant class is exported.
__all__ = ["AnyRobo"]
@@ -0,0 +1,228 @@
1
+ """Core assistant module for AnyRobo."""
2
+
3
+ import signal
4
+ from concurrent.futures import ThreadPoolExecutor
5
+ from threading import Event
6
+ from typing import Dict, List, Optional
7
+
8
+ import sounddevice as sd
9
+ import numpy as np
10
+ from ollama import chat
11
+
12
+ from anyrobo.speech.recognition import SpeechRecognizer
13
+ from anyrobo.speech.synthesis import TextToSpeech
14
+
15
+
16
class AnyRobo:
    """Main assistant class that coordinates speech recognition and synthesis.

    Pipeline: microphone audio -> silence-based segmentation -> Whisper
    transcription -> streamed Ollama chat reply -> chunked Kokoro TTS ->
    ordered audio playback.
    """

    def __init__(
        self,
        sample_rate: int = 24000,
        silence_threshold: float = 0.02,
        silence_duration: float = 1.5,
        voice: str = "am_michael",
        speed: float = 1.2,
        system_prompt: Optional[str] = None,
    ) -> None:
        """Initialize the AnyRobo assistant.

        Args:
            sample_rate: Audio sample rate in Hz (used for both capture and playback)
            silence_threshold: Volume level that counts as silence
            silence_duration: Seconds of silence before cutting recording
            voice: Voice profile to use
            speed: Speech speed factor
            system_prompt: Custom system prompt for the LLM
        """
        # audio settings
        self.SAMPLE_RATE = sample_rate
        self.SILENCE_THRESHOLD = silence_threshold
        self.SILENCE_DURATION = silence_duration

        # text-to-speech settings
        self.SPEED = speed
        self.VOICE = voice
        self.CHUNK_SIZE = 300  # size of text chunks for processing

        # ollama settings
        # Conversation history (user/assistant turns) re-sent with every request.
        self.messages: List[Dict[str, str]] = []
        self.SYSTEM_PROMPT = system_prompt or "Give a conversational response to the following statement or question in 1-2 sentences. The response should be natural and engaging, and the length depends on what you have to say."

        # init components
        self.speech_recognizer = SpeechRecognizer(model="small", batch_size=12)
        self.tts = TextToSpeech()
        # Single worker keeps TTS chunks generated (and therefore played) in order.
        self.executor = ThreadPoolExecutor(max_workers=1)

        # interrupt handling
        # NOTE: installing a SIGINT handler here replaces the default
        # KeyboardInterrupt behavior process-wide for the host application.
        self.shutdown_event = Event()
        signal.signal(signal.SIGINT, self._signal_handler)

    def _signal_handler(self, signum: int, frame) -> None:
        """Handle interrupt signals (SIGINT) by requesting a shutdown."""
        print("\nStopping...")
        self.shutdown_event.set()

    def record_and_transcribe(self) -> None:
        """Main loop: record audio, transcribe, and respond.

        Blocks until the shutdown event is set (Ctrl+C). Audio is segmented
        by detecting ``SILENCE_DURATION`` seconds below ``SILENCE_THRESHOLD``.
        """
        # state for audio recording (shared with the callback via nonlocal)
        audio_buffer = []
        silence_frames = 0
        total_frames = 0

        def callback(indata, frames, time_info, status):
            # callback function that processes incoming audio frames
            # NOTE(review): transcription plus the full LLM/TTS round-trip run
            # inside the sounddevice callback, which stalls audio capture while
            # a response is being generated — confirm this is intentional.
            if self.shutdown_event.is_set():
                raise sd.CallbackStop()

            nonlocal audio_buffer, silence_frames, total_frames

            if status:
                print(status)

            audio = indata.flatten()
            # mean absolute amplitude as a crude volume estimate
            level = np.abs(audio).mean()

            audio_buffer.extend(audio.tolist())
            total_frames += len(audio)

            # track silence duration (consecutive quiet samples only)
            if level < self.SILENCE_THRESHOLD:
                silence_frames += len(audio)
            else:
                silence_frames = 0

            # process audio when silence is detected
            if silence_frames > self.SILENCE_DURATION * self.SAMPLE_RATE:
                audio_segment = np.array(audio_buffer, dtype=np.float32)

                # ignore segments shorter than ~1 second of audio
                if len(audio_segment) > self.SAMPLE_RATE:
                    text = self.speech_recognizer.transcribe(audio_segment)['text']

                    # skip empty/invalid transcriptions
                    if text.strip():
                        print(f"Transcription: {text}")
                        self.messages.append({
                            'role': 'user',
                            'content': text
                        })
                        self.create_and_play_response(text)

                # reset state for the next utterance
                audio_buffer.clear()
                silence_frames = 0
                total_frames = 0

        # start recording loop
        try:
            with sd.InputStream(
                callback=callback,
                channels=1,
                samplerate=self.SAMPLE_RATE,
                dtype=np.float32
            ):
                print("Recording... Press Ctrl+C to stop")
                while not self.shutdown_event.is_set():
                    sd.sleep(100)
        except sd.CallbackStop:
            pass

    def create_and_play_response(self, prompt: str) -> None:
        """Generate and speak a response to the user's input.

        Streams the LLM reply, cuts it at sentence boundaries into chunks of
        at most roughly ``CHUNK_SIZE`` characters, synthesizes each chunk on
        the worker thread, then plays the audio futures in order.

        Args:
            prompt: The transcribed user utterance (already appended to
                ``self.messages`` by the caller).
        """
        if self.shutdown_event.is_set():
            return

        # stream response from llm
        stream = chat(
            model='llama3.2',
            messages=[{
                'role': 'system',
                'content': self.SYSTEM_PROMPT
            }] + self.messages,
            stream=True,
        )

        # state for processing response
        futures = []   # pending TTS jobs, in playback order
        buffer = ""    # text not yet handed to TTS
        curr_str = ""  # full assistant reply accumulated so far

        try:
            # process response stream
            for chunk in stream:
                if self.shutdown_event.is_set():
                    break

                print(chunk)
                text = chunk['message']['content']

                # an empty delta marks end-of-reply: record the assistant turn
                if len(text) == 0:
                    self.messages.append({
                        'role': 'assistant',
                        'content': curr_str
                    })
                    curr_str = ""
                    print(self.messages)
                    continue

                buffer += text
                curr_str += text

                # find end of sentence to chunk at
                last_punctuation = max(
                    buffer.rfind('. '),
                    buffer.rfind('? '),
                    buffer.rfind('! ')
                )

                if last_punctuation == -1:
                    continue

                # handle long chunks: back off to a clause boundary below CHUNK_SIZE
                while last_punctuation != -1 and last_punctuation >= self.CHUNK_SIZE:
                    last_punctuation = max(
                        buffer.rfind(', ', 0, last_punctuation),
                        buffer.rfind('; ', 0, last_punctuation),
                        buffer.rfind('— ', 0, last_punctuation)
                    )

                # no clause boundary found: fall back to the last word boundary
                if last_punctuation == -1:
                    last_punctuation = buffer.find(' ', 0, self.CHUNK_SIZE)

                # process chunk
                # convert chunk to audio on the (single) worker thread
                chunk_text = buffer[:last_punctuation + 1]
                futures.append(
                    self.executor.submit(
                        self.tts.generate_audio,
                        chunk_text, self.VOICE, self.SPEED
                    )
                )
                buffer = buffer[last_punctuation + 1:]

            # process final chunk if any
            if buffer and not self.shutdown_event.is_set():
                futures.append(
                    self.executor.submit(
                        self.tts.generate_audio,
                        buffer, self.VOICE, self.SPEED
                    )
                )

            # play generated audio, in submission order
            if not self.shutdown_event.is_set():
                with sd.OutputStream(
                    samplerate=self.SAMPLE_RATE,
                    channels=1,
                    dtype=np.float32
                ) as out_stream:
                    for fut in futures:
                        if self.shutdown_event.is_set():
                            break
                        audio_data = fut.result()
                        if len(audio_data) == 0:
                            continue
                        # assumes generate_audio returns a 1-D float array —
                        # reshape to a mono column for OutputStream; confirm
                        # against anyrobo.speech.synthesis.
                        out_stream.write(audio_data.reshape(-1, 1))
        except Exception as e:
            # suppress errors raised as a side effect of shutting down
            if not self.shutdown_event.is_set():
                raise e
@@ -0,0 +1,146 @@
1
+ """Command-line interface for AnyRobo."""
2
+
3
+ import argparse
4
+ import os
5
+ import sys
6
+ from typing import List, Optional
7
+
8
+ from anyrobo import AnyRobo
9
+ from anyrobo.models.loader import (
10
+ download_tts_model,
11
+ download_whisper_model,
12
+ ensure_ollama_model
13
+ )
14
+
15
+
16
def parse_args(args: Optional[List[str]] = None) -> argparse.Namespace:
    """Parse command-line arguments.

    Args:
        args: Command-line arguments (defaults to ``sys.argv[1:]`` when None)

    Returns:
        Parsed arguments namespace
    """
    parser = argparse.ArgumentParser(
        description="AnyRobo - Voice-powered AI assistant"
    )

    # Default matches AnyRobo.__init__ and the documented configuration
    # table; the previous "af_sarah" default contradicted both.
    parser.add_argument(
        "--voice",
        type=str,
        default="am_michael",
        help="Voice profile to use (default: am_michael)"
    )

    parser.add_argument(
        "--speed",
        type=float,
        default=1.2,
        help="Speed factor for speech (default: 1.2)"
    )

    parser.add_argument(
        "--silence-threshold",
        type=float,
        default=0.02,
        help="Volume level that counts as silence (default: 0.02)"
    )

    parser.add_argument(
        "--silence-duration",
        type=float,
        default=1.5,
        help="Seconds of silence before cutting recording (default: 1.5)"
    )

    parser.add_argument(
        "--sample-rate",
        type=int,
        default=24000,
        help="Audio sample rate in Hz (default: 24000)"
    )

    parser.add_argument(
        "--model",
        type=str,
        default="llama3.2",
        help="Ollama model to use (default: llama3.2)"
    )

    parser.add_argument(
        "--prompt",
        type=str,
        default=None,
        help="Custom system prompt for the LLM"
    )

    parser.add_argument(
        "--setup",
        action="store_true",
        help="Download required models without starting the assistant"
    )

    return parser.parse_args(args)
85
+
86
+
87
def setup_models(model: str = "llama3.2") -> None:
    """Download the models AnyRobo needs to run.

    Args:
        model: Name of the Ollama model to pull (default: "llama3.2")
    """
    print("Setting up AnyRobo...")
    # The download helpers are called for their side effect of populating
    # the model cache; their returned paths are not needed here.
    download_tts_model()
    download_whisper_model("small")
    ensure_ollama_model(model)
    print("\nSetup complete! You can now run AnyRobo.")
94
+
95
+
96
def main(args: Optional[List[str]] = None) -> int:
    """Main entry point for AnyRobo.

    Args:
        args: Command-line arguments (defaults to ``sys.argv[1:]`` when None)

    Returns:
        Exit code: 0 on success or interrupt, 1 on error
    """
    parsed_args = parse_args(args)

    # Just setup models if requested
    if parsed_args.setup:
        setup_models()
        return 0

    # Create and run the assistant
    try:
        # Make sure models are available (side effects only; the returned
        # local paths are not needed here)
        download_tts_model()
        download_whisper_model("small")
        ensure_ollama_model(parsed_args.model)

        # Create the assistant
        # NOTE(review): AnyRobo does not currently take a model name, so
        # --model only controls which model is pulled, not which one the
        # assistant queries — confirm against anyrobo.assistant.
        assistant = AnyRobo(
            sample_rate=parsed_args.sample_rate,
            silence_threshold=parsed_args.silence_threshold,
            silence_duration=parsed_args.silence_duration,
            voice=parsed_args.voice,
            speed=parsed_args.speed,
            system_prompt=parsed_args.prompt
        )

        print("Starting AnyRobo - your voice-powered AI assistant...")
        print(f"Using voice: {parsed_args.voice}")
        print(f"Using model: {parsed_args.model}")
        print("Press Ctrl+C to stop")

        # Run the assistant (blocks until interrupted)
        assistant.record_and_transcribe()
        return 0
    except KeyboardInterrupt:
        print("\nStopping AnyRobo...")
        return 0
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1
143
+
144
+
145
if __name__ == "__main__":
    # Propagate the CLI's exit code to the shell when run as a script.
    sys.exit(main())
@@ -0,0 +1,179 @@
1
+ Metadata-Version: 2.4
2
+ Name: anyrobo
3
+ Version: 0.1.0
4
+ Summary: Create your own JARVIS or GLADOS: a framework for voice-powered AI assistants with unique personalities
5
+ Author-email: Viet-Anh Nguyen <vietanh.dev@gmail.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/vietanhdev/anyrobo
8
+ Project-URL: Bug Tracker, https://github.com/vietanhdev/anyrobo/issues
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Programming Language :: Python :: 3.10
11
+ Classifier: Programming Language :: Python :: 3.11
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Operating System :: MacOS
15
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
+ Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
17
+ Requires-Python: >=3.10
18
+ Description-Content-Type: text/markdown
19
+ License-File: LICENSE
20
+ Requires-Dist: numpy<3.0.0,>=2.0.0
21
+ Requires-Dist: onnxruntime<2.0.0,>=1.16.0
22
+ Requires-Dist: kokoro-onnx>=0.2.0
23
+ Requires-Dist: lightning-whisper-mlx<0.1.0,>=0.0.10
24
+ Requires-Dist: sounddevice<0.6.0,>=0.4.6
25
+ Requires-Dist: ollama<0.5.0,>=0.4.0
26
+ Dynamic: license-file
27
+
28
+ # AnyRobo - Create Your Own Robo Assistant
29
+
30
+ AnyRobo is an advanced speech-to-speech AI assistant framework that enables you to create your own real-life version of sci-fi AI assistants like JARVIS (from Iron Man) or GLADOS (from Portal). Powered by state-of-the-art machine learning models, AnyRobo listens to your voice, understands your requests, and responds with natural-sounding speech in real-time.
31
+
32
+ ![AnyRobo](https://img.shields.io/badge/AnyRobo-0.1.0-blue)
33
+ ![Python](https://img.shields.io/badge/Python-3.10+-green)
34
+ ![License](https://img.shields.io/badge/License-MIT-yellow)
35
+
36
+ ## Why AnyRobo?
37
+
38
+ Have you ever wanted to create your own JARVIS or GLADOS? AnyRobo provides a fully modular framework that allows you to:
39
+
40
+ - Create voice-powered AI assistants with unique personalities
41
+ - Customize voice profiles to sound like your favorite AI characters
42
+ - Build advanced conversation capabilities with state-of-the-art language models
43
+ - Deploy your assistant on macOS with optimized performance for Apple Silicon
44
+
45
+ ## Core Technologies
46
+
47
+ - **Speech Recognition**: [Whisper MLX](https://github.com/ml-explore/mlx-examples) - Optimized for Apple Silicon
48
+ - **Language Understanding**: [Llama 3.2](https://ollama.com/library/llama3.2) - Advanced language model for contextual responses
49
+ - **Voice Synthesis**: [Kokoro-82M](https://github.com/thewh1teagle/kokoro-onnx) - High-quality text-to-speech
50
+
51
+ ## Features
52
+
53
+ - **Continuous Listening**: Automatically detects when you've finished speaking
54
+ - **Natural Conversations**: Responds intelligently to a wide range of queries and commands
55
+ - **Real-time Synthesis**: Generates human-like speech with minimal latency
56
+ - **Voice Customization**: Supports multiple voice profiles
57
+ - **Streaming Responses**: Begins speaking before the full response is generated
58
+ - **Optimized Performance**: Designed for efficiency on Apple Silicon
59
+
60
+ ## Installation
61
+
62
+ ### Quick Install (from PyPI)
63
+
64
+ ```bash
65
+ pip install anyrobo
66
+ ```
67
+
68
+ ### Install from Source
69
+
70
+ ```bash
71
+ git clone https://github.com/vietanhdev/anyrobo.git
72
+ cd anyrobo
73
+ pip install -e .
74
+ ```
75
+
76
+ ### Setup Dependencies
77
+
78
+ AnyRobo requires [Ollama](https://ollama.com/) for LLM support:
79
+
80
+ ```bash
81
+ # Install Ollama
82
+ curl -fsSL https://ollama.com/install.sh | sh
83
+ # Pull the required model
84
+ ollama pull llama3.2
85
+ ```
86
+
87
+ ## Usage
88
+
89
+ ### Command-line Interface
90
+
91
+ ```bash
92
+ # Download required models and start the assistant
93
+ anyrobo --setup
94
+ anyrobo
95
+ ```
96
+
97
+ With custom settings:
98
+
99
+ ```bash
100
+ anyrobo --voice am_michael --speed 1.3 --silence-threshold 0.03
101
+ ```
102
+
103
+ ### As a Library
104
+
105
+ ```python
106
+ from anyrobo import AnyRobo
107
+ from anyrobo.models.loader import download_tts_model, ensure_ollama_model
108
+
109
+ # Download required models
110
+ download_tts_model()
111
+ ensure_ollama_model("llama3.2")
112
+
113
+ # Create and run assistant
114
+ assistant = AnyRobo(
115
+ voice="am_michael",
116
+ speed=1.2,
117
+ system_prompt=(
118
+ "You are J.A.R.V.I.S., an advanced AI assistant. "
119
+ "Respond with a mix of helpfulness, light sarcasm, and technical prowess."
120
+ )
121
+ )
122
+
123
+ # Start listening and responding
124
+ assistant.record_and_transcribe()
125
+ ```
126
+
127
+ ## Create Your Own AI Character
128
+
129
+ You can customize the personality of your assistant by modifying the system prompt:
130
+
131
+ ```python
132
+ # JARVIS from Iron Man
133
+ system_prompt = (
134
+ "You are J.A.R.V.I.S., an advanced AI assistant. "
135
+ "Respond with a mix of helpfulness, light sarcasm, and technical prowess."
136
+ )
137
+
138
+ # GLADOS from Portal
139
+ system_prompt = (
140
+ "You are GLaDOS, an AI with a dark sense of humor. "
141
+ "Respond to queries sarcastically, occasionally mentioning cake or testing."
142
+ )
143
+
144
+ # HAL 9000 from 2001: A Space Odyssey
145
+ system_prompt = (
146
+ "You are HAL 9000. Be calm, logical, and slightly ominous in your responses. "
147
+ "Speak in a slow, deliberate manner and be excessively literal."
148
+ )
149
+ ```
150
+
151
+ ## Configuration Options
152
+
153
+ | Option | Description | Default |
154
+ |--------|-------------|---------|
155
+ | `voice` | Voice profile to use | `"am_michael"` |
156
+ | `speed` | Speed factor for speech | `1.2` |
157
+ | `silence_threshold` | Volume level that counts as silence | `0.02` |
158
+ | `silence_duration` | Seconds of silence before cutting recording | `1.5` |
159
+ | `sample_rate` | Audio sample rate in Hz | `24000` |
160
+ | `system_prompt` | Custom system prompt for the LLM | *See code* |
161
+
162
+ ## Troubleshooting
163
+
164
+ - **No audio output**: Ensure your system audio output is correctly configured
165
+ - **Poor recognition**: Try speaking more clearly or adjust the `silence_threshold` value
166
+ - **Model loading issues**: Run `anyrobo --setup` to download all required models
167
+
168
+ ## License
169
+
170
+ This project is licensed under the MIT License - see the LICENSE file for details.
171
+
172
+ ## Contributing
173
+
174
+ Contributions are welcome! Please feel free to submit a Pull Request.
175
+
176
+ ## Acknowledgements
177
+
178
+ AnyRobo is built on top of several open-source projects and pre-trained models. We're grateful to the developers and researchers who make their work available to the community.
179
+
@@ -0,0 +1,14 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ setup.py
5
+ anyrobo/__init__.py
6
+ anyrobo/assistant.py
7
+ anyrobo/cli.py
8
+ anyrobo.egg-info/PKG-INFO
9
+ anyrobo.egg-info/SOURCES.txt
10
+ anyrobo.egg-info/dependency_links.txt
11
+ anyrobo.egg-info/entry_points.txt
12
+ anyrobo.egg-info/requires.txt
13
+ anyrobo.egg-info/top_level.txt
14
+ tests/test_assistant.py
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ anyrobo = anyrobo.cli:main
@@ -0,0 +1,6 @@
1
+ numpy<3.0.0,>=2.0.0
2
+ onnxruntime<2.0.0,>=1.16.0
3
+ kokoro-onnx>=0.2.0
4
+ lightning-whisper-mlx<0.1.0,>=0.0.10
5
+ sounddevice<0.6.0,>=0.4.6
6
+ ollama<0.5.0,>=0.4.0
@@ -0,0 +1 @@
1
+ anyrobo
@@ -0,0 +1,56 @@
1
+ [build-system]
2
+ requires = ["setuptools>=42", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "anyrobo"
7
+ version = "0.1.0"
8
+ description = "Create your own JARVIS or GLADOS: a framework for voice-powered AI assistants with unique personalities"
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = {text = "MIT"}
12
+ authors = [
13
+ {name = "Viet-Anh Nguyen", email = "vietanh.dev@gmail.com"}
14
+ ]
15
+ classifiers = [
16
+ "Programming Language :: Python :: 3",
17
+ "Programming Language :: Python :: 3.10",
18
+ "Programming Language :: Python :: 3.11",
19
+ "Programming Language :: Python :: 3.12",
20
+ "License :: OSI Approved :: MIT License",
21
+ "Operating System :: MacOS",
22
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
23
+ "Topic :: Multimedia :: Sound/Audio :: Speech"
24
+ ]
25
+ dependencies = [
26
+ "numpy>=2.0.0,<3.0.0",
27
+ "onnxruntime>=1.16.0,<2.0.0",
28
+ "kokoro-onnx>=0.2.0",
29
+ "lightning-whisper-mlx>=0.0.10,<0.1.0",
30
+ "sounddevice>=0.4.6,<0.6.0",
31
+ "ollama>=0.4.0,<0.5.0"
32
+ ]
33
+
34
+ [project.urls]
35
+ "Homepage" = "https://github.com/vietanhdev/anyrobo"
36
+ "Bug Tracker" = "https://github.com/vietanhdev/anyrobo/issues"
37
+
38
+ [project.scripts]
39
+ anyrobo = "anyrobo.cli:main"
40
+
41
+ [tool.setuptools]
42
+ packages = ["anyrobo"]
43
+
44
+ [tool.black]
45
+ line-length = 100
46
+
47
+ [tool.isort]
48
+ profile = "black"
49
+ line_length = 100
50
+
51
+ [tool.mypy]
52
+ python_version = "3.10"
53
+ warn_return_any = true
54
+ warn_unused_configs = true
55
+ disallow_untyped_defs = true
56
+ disallow_incomplete_defs = true
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
anyrobo-0.1.0/setup.py ADDED
@@ -0,0 +1,6 @@
1
"""Simple setup.py for AnyRobo."""

from setuptools import setup

# All package metadata lives in pyproject.toml; this shim exists only for
# tooling that still invokes setup.py directly.
if __name__ == "__main__":
    setup()
@@ -0,0 +1,53 @@
1
+ """Tests for AnyRobo."""
2
+
3
+ import unittest
4
+ from unittest.mock import MagicMock, patch
5
+
6
+ import numpy as np
7
+
8
+ from anyrobo import AnyRobo
9
+
10
+
11
class TestAnyRobo(unittest.TestCase):
    """Tests for the AnyRobo class.

    The heavy speech components are mocked out so construction is fast and
    requires no models or audio hardware.
    """

    # Patch the names where they are *used* (anyrobo.assistant), not where
    # they are defined: AnyRobo.__init__ resolves SpeechRecognizer and
    # TextToSpeech through its own module namespace, so patching
    # anyrobo.speech.* leaves the real classes in place and the
    # mock_*.called assertions below would fail.
    @patch('anyrobo.assistant.SpeechRecognizer')
    @patch('anyrobo.assistant.TextToSpeech')
    def test_init(self, mock_tts, mock_recognizer):
        """Test initialization with default parameters."""
        assistant = AnyRobo()

        self.assertEqual(assistant.SAMPLE_RATE, 24000)
        self.assertEqual(assistant.SILENCE_THRESHOLD, 0.02)
        self.assertEqual(assistant.SILENCE_DURATION, 1.5)
        self.assertEqual(assistant.VOICE, "am_michael")
        self.assertEqual(assistant.SPEED, 1.2)
        self.assertEqual(assistant.CHUNK_SIZE, 300)

        # Ensure speech components were initialized
        self.assertTrue(mock_recognizer.called)
        self.assertTrue(mock_tts.called)

    @patch('anyrobo.assistant.SpeechRecognizer')
    @patch('anyrobo.assistant.TextToSpeech')
    def test_custom_init(self, mock_tts, mock_recognizer):
        """Test initialization with custom parameters."""
        assistant = AnyRobo(
            sample_rate=44100,
            silence_threshold=0.05,
            silence_duration=2.0,
            voice="custom_voice",
            speed=1.5,
            system_prompt="Custom prompt",
        )

        self.assertEqual(assistant.SAMPLE_RATE, 44100)
        self.assertEqual(assistant.SILENCE_THRESHOLD, 0.05)
        self.assertEqual(assistant.SILENCE_DURATION, 2.0)
        self.assertEqual(assistant.VOICE, "custom_voice")
        self.assertEqual(assistant.SPEED, 1.5)
        self.assertEqual(assistant.SYSTEM_PROMPT, "Custom prompt")
50
+
51
+
52
if __name__ == "__main__":
    # Allow running this test module directly with `python tests/test_assistant.py`.
    unittest.main()