anyrobo-0.1.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- anyrobo-0.1.0/LICENSE +21 -0
- anyrobo-0.1.0/PKG-INFO +179 -0
- anyrobo-0.1.0/README.md +152 -0
- anyrobo-0.1.0/anyrobo/__init__.py +7 -0
- anyrobo-0.1.0/anyrobo/assistant.py +228 -0
- anyrobo-0.1.0/anyrobo/cli.py +146 -0
- anyrobo-0.1.0/anyrobo.egg-info/PKG-INFO +179 -0
- anyrobo-0.1.0/anyrobo.egg-info/SOURCES.txt +14 -0
- anyrobo-0.1.0/anyrobo.egg-info/dependency_links.txt +1 -0
- anyrobo-0.1.0/anyrobo.egg-info/entry_points.txt +2 -0
- anyrobo-0.1.0/anyrobo.egg-info/requires.txt +6 -0
- anyrobo-0.1.0/anyrobo.egg-info/top_level.txt +1 -0
- anyrobo-0.1.0/pyproject.toml +56 -0
- anyrobo-0.1.0/setup.cfg +4 -0
- anyrobo-0.1.0/setup.py +6 -0
- anyrobo-0.1.0/tests/test_assistant.py +53 -0
anyrobo-0.1.0/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 Viet-Anh Nguyen

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
anyrobo-0.1.0/PKG-INFO
ADDED
@@ -0,0 +1,179 @@
Metadata-Version: 2.4
Name: anyrobo
Version: 0.1.0
Summary: Create your own JARVIS or GLADOS: a framework for voice-powered AI assistants with unique personalities
Author-email: Viet-Anh Nguyen <vietanh.dev@gmail.com>
License: MIT
Project-URL: Homepage, https://github.com/vietanhdev/anyrobo
Project-URL: Bug Tracker, https://github.com/vietanhdev/anyrobo/issues
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: MacOS
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
Requires-Python: >=3.10
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: numpy<3.0.0,>=2.0.0
Requires-Dist: onnxruntime<2.0.0,>=1.16.0
Requires-Dist: kokoro-onnx>=0.2.0
Requires-Dist: lightning-whisper-mlx<0.1.0,>=0.0.10
Requires-Dist: sounddevice<0.6.0,>=0.4.6
Requires-Dist: ollama<0.5.0,>=0.4.0
Dynamic: license-file

# AnyRobo - Create Your Own Robo Assistant

AnyRobo is an advanced speech-to-speech AI assistant framework that enables you to create your own real-life version of sci-fi AI assistants like JARVIS (from Iron Man) or GLADOS (from Portal). Powered by state-of-the-art machine learning models, AnyRobo listens to your voice, understands your requests, and responds with natural-sounding speech in real-time.





## Why AnyRobo?

Have you ever wanted to create your own JARVIS or GLADOS? AnyRobo provides a fully modular framework that allows you to:

- Create voice-powered AI assistants with unique personalities
- Customize voice profiles to sound like your favorite AI characters
- Build advanced conversation capabilities with state-of-the-art language models
- Deploy your assistant on macOS with optimized performance for Apple Silicon

## Core Technologies

- **Speech Recognition**: [Whisper MLX](https://github.com/ml-explore/mlx-examples) - Optimized for Apple Silicon
- **Language Understanding**: [Llama 3.2](https://ollama.com/library/llama3.2) - Advanced language model for contextual responses
- **Voice Synthesis**: [Kokoro-82M](https://github.com/thewh1teagle/kokoro-onnx) - High-quality text-to-speech

## Features

- **Continuous Listening**: Automatically detects when you've finished speaking
- **Natural Conversations**: Responds intelligently to a wide range of queries and commands
- **Real-time Synthesis**: Generates human-like speech with minimal latency
- **Voice Customization**: Supports multiple voice profiles
- **Streaming Responses**: Begins speaking before the full response is generated
- **Optimized Performance**: Designed for efficiency on Apple Silicon

## Installation

### Quick Install (from PyPI)

```bash
pip install anyrobo
```

### Install from Source

```bash
git clone https://github.com/vietanhdev/anyrobo.git
cd anyrobo
pip install -e .
```

### Setup Dependencies

AnyRobo requires [Ollama](https://ollama.com/) for LLM support:

```bash
# Install Ollama
curl -fsSL https://ollama.com/install.sh | sh
# Pull the required model
ollama pull llama3.2
```

## Usage

### Command-line Interface

```bash
# Download required models and start the assistant
anyrobo --setup
anyrobo
```

With custom settings:

```bash
anyrobo --voice am_michael --speed 1.3 --silence-threshold 0.03
```

### As a Library

```python
from anyrobo import AnyRobo
from anyrobo.models.loader import download_tts_model, ensure_ollama_model

# Download required models
download_tts_model()
ensure_ollama_model("llama3.2")

# Create and run assistant
assistant = AnyRobo(
    voice="am_michael",
    speed=1.2,
    system_prompt=(
        "You are J.A.R.V.I.S., an advanced AI assistant. "
        "Respond with a mix of helpfulness, light sarcasm, and technical prowess."
    )
)

# Start listening and responding
assistant.record_and_transcribe()
```

## Create Your Own AI Character

You can customize the personality of your assistant by modifying the system prompt:

```python
# JARVIS from Iron Man
system_prompt = (
    "You are J.A.R.V.I.S., an advanced AI assistant. "
    "Respond with a mix of helpfulness, light sarcasm, and technical prowess."
)

# GLADOS from Portal
system_prompt = (
    "You are GLaDOS, an AI with a dark sense of humor. "
    "Respond to queries sarcastically, occasionally mentioning cake or testing."
)

# HAL 9000 from 2001: A Space Odyssey
system_prompt = (
    "You are HAL 9000. Be calm, logical, and slightly ominous in your responses. "
    "Speak in a slow, deliberate manner and be excessively literal."
)
```

## Configuration Options

| Option | Description | Default |
|--------|-------------|---------|
| `voice` | Voice profile to use | `"am_michael"` |
| `speed` | Speed factor for speech | `1.2` |
| `silence_threshold` | Volume level that counts as silence | `0.02` |
| `silence_duration` | Seconds of silence before cutting recording | `1.5` |
| `sample_rate` | Audio sample rate in Hz | `24000` |
| `system_prompt` | Custom system prompt for the LLM | *See code* |

## Troubleshooting

- **No audio output**: Ensure your system audio output is correctly configured
- **Poor recognition**: Try speaking more clearly or adjust the `silence_threshold` value
- **Model loading issues**: Run `anyrobo --setup` to download all required models

## License

This project is licensed under the MIT License - see the LICENSE file for details.

## Contributing

Contributions are welcome! Please feel free to submit a Pull Request.

## Acknowledgements

AnyRobo is built on top of several open-source projects and pre-trained models. We're grateful to the developers and researchers who make their work available to the community.
anyrobo-0.1.0/README.md
ADDED
@@ -0,0 +1,152 @@
# AnyRobo - Create Your Own Robo Assistant

AnyRobo is an advanced speech-to-speech AI assistant framework that enables you to create your own real-life version of sci-fi AI assistants like JARVIS (from Iron Man) or GLADOS (from Portal). Powered by state-of-the-art machine learning models, AnyRobo listens to your voice, understands your requests, and responds with natural-sounding speech in real-time.





## Why AnyRobo?

Have you ever wanted to create your own JARVIS or GLADOS? AnyRobo provides a fully modular framework that allows you to:

- Create voice-powered AI assistants with unique personalities
- Customize voice profiles to sound like your favorite AI characters
- Build advanced conversation capabilities with state-of-the-art language models
- Deploy your assistant on macOS with optimized performance for Apple Silicon

## Core Technologies

- **Speech Recognition**: [Whisper MLX](https://github.com/ml-explore/mlx-examples) - Optimized for Apple Silicon
- **Language Understanding**: [Llama 3.2](https://ollama.com/library/llama3.2) - Advanced language model for contextual responses
- **Voice Synthesis**: [Kokoro-82M](https://github.com/thewh1teagle/kokoro-onnx) - High-quality text-to-speech

## Features

- **Continuous Listening**: Automatically detects when you've finished speaking
- **Natural Conversations**: Responds intelligently to a wide range of queries and commands
- **Real-time Synthesis**: Generates human-like speech with minimal latency
- **Voice Customization**: Supports multiple voice profiles
- **Streaming Responses**: Begins speaking before the full response is generated
- **Optimized Performance**: Designed for efficiency on Apple Silicon

## Installation

### Quick Install (from PyPI)

```bash
pip install anyrobo
```

### Install from Source

```bash
git clone https://github.com/vietanhdev/anyrobo.git
cd anyrobo
pip install -e .
```

### Setup Dependencies

AnyRobo requires [Ollama](https://ollama.com/) for LLM support:

```bash
# Install Ollama
curl -fsSL https://ollama.com/install.sh | sh
# Pull the required model
ollama pull llama3.2
```

## Usage

### Command-line Interface

```bash
# Download required models and start the assistant
anyrobo --setup
anyrobo
```

With custom settings:

```bash
anyrobo --voice am_michael --speed 1.3 --silence-threshold 0.03
```

### As a Library

```python
from anyrobo import AnyRobo
from anyrobo.models.loader import download_tts_model, ensure_ollama_model

# Download required models
download_tts_model()
ensure_ollama_model("llama3.2")

# Create and run assistant
assistant = AnyRobo(
    voice="am_michael",
    speed=1.2,
    system_prompt=(
        "You are J.A.R.V.I.S., an advanced AI assistant. "
        "Respond with a mix of helpfulness, light sarcasm, and technical prowess."
    )
)

# Start listening and responding
assistant.record_and_transcribe()
```

## Create Your Own AI Character

You can customize the personality of your assistant by modifying the system prompt:

```python
# JARVIS from Iron Man
system_prompt = (
    "You are J.A.R.V.I.S., an advanced AI assistant. "
    "Respond with a mix of helpfulness, light sarcasm, and technical prowess."
)

# GLADOS from Portal
system_prompt = (
    "You are GLaDOS, an AI with a dark sense of humor. "
    "Respond to queries sarcastically, occasionally mentioning cake or testing."
)

# HAL 9000 from 2001: A Space Odyssey
system_prompt = (
    "You are HAL 9000. Be calm, logical, and slightly ominous in your responses. "
    "Speak in a slow, deliberate manner and be excessively literal."
)
```
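
Each of these prompts can also be supplied on the command line through the `--prompt` flag defined in `anyrobo/cli.py`, for example:

```bash
anyrobo --prompt "You are GLaDOS, an AI with a dark sense of humor. Respond to queries sarcastically, occasionally mentioning cake or testing."
```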

## Configuration Options

| Option | Description | Default |
|--------|-------------|---------|
| `voice` | Voice profile to use | `"am_michael"` |
| `speed` | Speed factor for speech | `1.2` |
| `silence_threshold` | Volume level that counts as silence | `0.02` |
| `silence_duration` | Seconds of silence before cutting recording | `1.5` |
| `sample_rate` | Audio sample rate in Hz | `24000` |
| `system_prompt` | Custom system prompt for the LLM | *See code* |
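
All of these options are keyword arguments of the `AnyRobo` constructor in `anyrobo/assistant.py`; a minimal sketch that simply spells out the documented defaults:

```python
from anyrobo import AnyRobo

assistant = AnyRobo(
    voice="am_michael",
    speed=1.2,
    silence_threshold=0.02,
    silence_duration=1.5,
    sample_rate=24000,
    system_prompt=None,  # None falls back to the built-in conversational prompt
)
```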

## Troubleshooting

- **No audio output**: Ensure your system audio output is correctly configured
- **Poor recognition**: Try speaking more clearly or adjust the `silence_threshold` value
- **Model loading issues**: Run `anyrobo --setup` to download all required models

## License

This project is licensed under the MIT License - see the LICENSE file for details.

## Contributing

Contributions are welcome! Please feel free to submit a Pull Request.

## Acknowledgements

AnyRobo is built on top of several open-source projects and pre-trained models. We're grateful to the developers and researchers who make their work available to the community.
anyrobo-0.1.0/anyrobo/assistant.py
ADDED
@@ -0,0 +1,228 @@
"""Core assistant module for AnyRobo."""

import signal
from concurrent.futures import ThreadPoolExecutor
from threading import Event
from typing import Dict, List, Optional

import sounddevice as sd
import numpy as np
from ollama import chat

from anyrobo.speech.recognition import SpeechRecognizer
from anyrobo.speech.synthesis import TextToSpeech


class AnyRobo:
    """Main assistant class that coordinates speech recognition and synthesis."""

    def __init__(
        self,
        sample_rate: int = 24000,
        silence_threshold: float = 0.02,
        silence_duration: float = 1.5,
        voice: str = "am_michael",
        speed: float = 1.2,
        system_prompt: Optional[str] = None,
    ):
        """Initialize the AnyRobo assistant.

        Args:
            sample_rate: Audio sample rate in Hz
            silence_threshold: Volume level that counts as silence
            silence_duration: Seconds of silence before cutting recording
            voice: Voice profile to use
            speed: Speech speed factor
            system_prompt: Custom system prompt for the LLM
        """
        # audio settings
        self.SAMPLE_RATE = sample_rate
        self.SILENCE_THRESHOLD = silence_threshold
        self.SILENCE_DURATION = silence_duration

        # text-to-speech settings
        self.SPEED = speed
        self.VOICE = voice
        self.CHUNK_SIZE = 300  # size of text chunks for processing

        # ollama settings
        self.messages = []
        self.SYSTEM_PROMPT = system_prompt or "Give a conversational response to the following statement or question in 1-2 sentences. The response should be natural and engaging, and the length depends on what you have to say."

        # init components
        self.speech_recognizer = SpeechRecognizer(model="small", batch_size=12)
        self.tts = TextToSpeech()
        self.executor = ThreadPoolExecutor(max_workers=1)

        # interrupt handling
        self.shutdown_event = Event()
        signal.signal(signal.SIGINT, self._signal_handler)

    def _signal_handler(self, signum, frame):
        """Handle interrupt signals."""
        print("\nStopping...")
        self.shutdown_event.set()

    def record_and_transcribe(self):
        """Main loop: record audio, transcribe, and respond."""
        # state for audio recording
        audio_buffer = []
        silence_frames = 0
        total_frames = 0

        def callback(indata, frames, time_info, status):
            # callback function that processes incoming audio frames
            if self.shutdown_event.is_set():
                raise sd.CallbackStop()

            nonlocal audio_buffer, silence_frames, total_frames

            if status:
                print(status)

            audio = indata.flatten()
            level = np.abs(audio).mean()

            audio_buffer.extend(audio.tolist())
            total_frames += len(audio)

            # track silence duration
            if level < self.SILENCE_THRESHOLD:
                silence_frames += len(audio)
            else:
                silence_frames = 0

            # process audio when silence is detected
            if silence_frames > self.SILENCE_DURATION * self.SAMPLE_RATE:
                audio_segment = np.array(audio_buffer, dtype=np.float32)

                if len(audio_segment) > self.SAMPLE_RATE:
                    text = self.speech_recognizer.transcribe(audio_segment)['text']

                    # skip empty/invalid transcriptions
                    if text.strip():
                        print(f"Transcription: {text}")
                        self.messages.append({
                            'role': 'user',
                            'content': text
                        })
                        self.create_and_play_response(text)

                # reset state
                audio_buffer.clear()
                silence_frames = 0
                total_frames = 0

        # start recording loop
        try:
            with sd.InputStream(
                callback=callback,
                channels=1,
                samplerate=self.SAMPLE_RATE,
                dtype=np.float32
            ):
                print("Recording... Press Ctrl+C to stop")
                while not self.shutdown_event.is_set():
                    sd.sleep(100)
        except sd.CallbackStop:
            pass

    def create_and_play_response(self, prompt: str):
        """Generate and speak a response to the user's input."""
        if self.shutdown_event.is_set():
            return

        # stream response from llm
        stream = chat(
            model='llama3.2',
            messages=[{
                'role': 'system',
                'content': self.SYSTEM_PROMPT
            }] + self.messages,
            stream=True,
        )

        # state for processing response
        futures = []
        buffer = ""
        curr_str = ""

        try:
            # process response stream
            for chunk in stream:
                if self.shutdown_event.is_set():
                    break

                print(chunk)
                text = chunk['message']['content']

                if len(text) == 0:
                    self.messages.append({
                        'role': 'assistant',
                        'content': curr_str
                    })
                    curr_str = ""
                    print(self.messages)
                    continue

                buffer += text
                curr_str += text

                # find end of sentence to chunk at
                last_punctuation = max(
                    buffer.rfind('. '),
                    buffer.rfind('? '),
                    buffer.rfind('! ')
                )

                if last_punctuation == -1:
                    continue

                # handle long chunks
                while last_punctuation != -1 and last_punctuation >= self.CHUNK_SIZE:
                    last_punctuation = max(
                        buffer.rfind(', ', 0, last_punctuation),
                        buffer.rfind('; ', 0, last_punctuation),
                        buffer.rfind('— ', 0, last_punctuation)
                    )

                    if last_punctuation == -1:
                        last_punctuation = buffer.find(' ', 0, self.CHUNK_SIZE)

                # process chunk
                # convert chunk to audio
                chunk_text = buffer[:last_punctuation + 1]
                futures.append(
                    self.executor.submit(
                        self.tts.generate_audio,
                        chunk_text, self.VOICE, self.SPEED
                    )
                )
                buffer = buffer[last_punctuation + 1:]

            # process final chunk if any
            if buffer and not self.shutdown_event.is_set():
                futures.append(
                    self.executor.submit(
                        self.tts.generate_audio,
                        buffer, self.VOICE, self.SPEED
                    )
                )

            # play generated audio
            if not self.shutdown_event.is_set():
                with sd.OutputStream(
                    samplerate=self.SAMPLE_RATE,
                    channels=1,
                    dtype=np.float32
                ) as out_stream:
                    for fut in futures:
                        if self.shutdown_event.is_set():
                            break
                        audio_data = fut.result()
                        if len(audio_data) == 0:
                            continue
                        out_stream.write(audio_data.reshape(-1, 1))
        except Exception as e:
            if not self.shutdown_event.is_set():
                raise e
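
The streaming behaviour advertised in the README ("Begins speaking before the full response is generated") comes from the chunking loop in `create_and_play_response` above: streamed LLM text is buffered and flushed to the TTS worker one sentence at a time. A simplified, self-contained sketch of that strategy, with a plain `speak` callback standing in for `executor.submit(self.tts.generate_audio, ...)`:

```python
def stream_chunks(pieces, speak, chunk_size=300):
    """Flush complete sentences from streamed text pieces to the speak callback."""
    buffer = ""
    for text in pieces:
        buffer += text
        # prefer sentence boundaries, mirroring the rfind calls above
        last = max(buffer.rfind(". "), buffer.rfind("? "), buffer.rfind("! "))
        if last == -1:
            continue
        # fall back to softer breaks when a single sentence exceeds chunk_size
        while last != -1 and last >= chunk_size:
            last = max(buffer.rfind(", ", 0, last), buffer.rfind("; ", 0, last))
            if last == -1:
                last = buffer.find(" ", 0, chunk_size)
        speak(buffer[:last + 1])
        buffer = buffer[last + 1:]
    if buffer:
        speak(buffer)  # trailing partial sentence


# prints "Hello there." first, then the remaining text
stream_chunks(["Hello there. How can", " I help you today?"], print)
```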
anyrobo-0.1.0/anyrobo/cli.py
ADDED
@@ -0,0 +1,146 @@
"""Command-line interface for AnyRobo."""

import argparse
import os
import sys
from typing import List, Optional

from anyrobo import AnyRobo
from anyrobo.models.loader import (
    download_tts_model,
    download_whisper_model,
    ensure_ollama_model
)


def parse_args(args: Optional[List[str]] = None) -> argparse.Namespace:
    """Parse command-line arguments.

    Args:
        args: Command-line arguments

    Returns:
        Parsed arguments
    """
    parser = argparse.ArgumentParser(
        description="AnyRobo - Voice-powered AI assistant"
    )

    parser.add_argument(
        "--voice",
        type=str,
        default="af_sarah",
        help="Voice profile to use (default: af_sarah)"
    )

    parser.add_argument(
        "--speed",
        type=float,
        default=1.2,
        help="Speed factor for speech (default: 1.2)"
    )

    parser.add_argument(
        "--silence-threshold",
        type=float,
        default=0.02,
        help="Volume level that counts as silence (default: 0.02)"
    )

    parser.add_argument(
        "--silence-duration",
        type=float,
        default=1.5,
        help="Seconds of silence before cutting recording (default: 1.5)"
    )

    parser.add_argument(
        "--sample-rate",
        type=int,
        default=24000,
        help="Audio sample rate in Hz (default: 24000)"
    )

    parser.add_argument(
        "--model",
        type=str,
        default="llama3.2",
        help="Ollama model to use (default: llama3.2)"
    )

    parser.add_argument(
        "--prompt",
        type=str,
        default=None,
        help="Custom system prompt for the LLM"
    )

    parser.add_argument(
        "--setup",
        action="store_true",
        help="Download required models without starting the assistant"
    )

    return parser.parse_args(args)


def setup_models() -> None:
    """Download required models."""
    print("Setting up AnyRobo...")
    tts_model_path = download_tts_model()
    whisper_model_dir = download_whisper_model("small")
    ensure_ollama_model("llama3.2")
    print("\nSetup complete! You can now run AnyRobo.")


def main(args: Optional[List[str]] = None) -> int:
    """Main entry point for AnyRobo.

    Args:
        args: Command-line arguments

    Returns:
        Exit code
    """
    parsed_args = parse_args(args)

    # Just setup models if requested
    if parsed_args.setup:
        setup_models()
        return 0

    # Create and run the assistant
    try:
        # Make sure models are available
        tts_model_path = download_tts_model()
        whisper_model_dir = download_whisper_model("small")
        ensure_ollama_model(parsed_args.model)

        # Create the assistant
        assistant = AnyRobo(
            sample_rate=parsed_args.sample_rate,
            silence_threshold=parsed_args.silence_threshold,
            silence_duration=parsed_args.silence_duration,
            voice=parsed_args.voice,
            speed=parsed_args.speed,
            system_prompt=parsed_args.prompt
        )

        print(f"Starting AnyRobo - your voice-powered AI assistant...")
        print(f"Using voice: {parsed_args.voice}")
        print(f"Using model: {parsed_args.model}")
        print("Press Ctrl+C to stop")

        # Run the assistant
        assistant.record_and_transcribe()
        return 0
    except KeyboardInterrupt:
        print("\nStopping AnyRobo...")
        return 0
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1


if __name__ == "__main__":
    sys.exit(main())
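
For reference, `cli.py` defines several flags beyond the subset shown in the README (`--silence-duration`, `--sample-rate`, `--model`, `--prompt`); a full invocation spelling out the CLI defaults would look like:

```bash
anyrobo --voice af_sarah --speed 1.2 --silence-threshold 0.02 \
        --silence-duration 1.5 --sample-rate 24000 --model llama3.2
```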
anyrobo-0.1.0/anyrobo.egg-info/PKG-INFO
ADDED
@@ -0,0 +1,179 @@
Metadata-Version: 2.4
Name: anyrobo
Version: 0.1.0
Summary: Create your own JARVIS or GLADOS: a framework for voice-powered AI assistants with unique personalities
Author-email: Viet-Anh Nguyen <vietanh.dev@gmail.com>
License: MIT
Project-URL: Homepage, https://github.com/vietanhdev/anyrobo
Project-URL: Bug Tracker, https://github.com/vietanhdev/anyrobo/issues
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: MacOS
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
Requires-Python: >=3.10
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: numpy<3.0.0,>=2.0.0
Requires-Dist: onnxruntime<2.0.0,>=1.16.0
Requires-Dist: kokoro-onnx>=0.2.0
Requires-Dist: lightning-whisper-mlx<0.1.0,>=0.0.10
Requires-Dist: sounddevice<0.6.0,>=0.4.6
Requires-Dist: ollama<0.5.0,>=0.4.0
Dynamic: license-file

# AnyRobo - Create Your Own Robo Assistant

AnyRobo is an advanced speech-to-speech AI assistant framework that enables you to create your own real-life version of sci-fi AI assistants like JARVIS (from Iron Man) or GLADOS (from Portal). Powered by state-of-the-art machine learning models, AnyRobo listens to your voice, understands your requests, and responds with natural-sounding speech in real-time.





## Why AnyRobo?

Have you ever wanted to create your own JARVIS or GLADOS? AnyRobo provides a fully modular framework that allows you to:

- Create voice-powered AI assistants with unique personalities
- Customize voice profiles to sound like your favorite AI characters
- Build advanced conversation capabilities with state-of-the-art language models
- Deploy your assistant on macOS with optimized performance for Apple Silicon

## Core Technologies

- **Speech Recognition**: [Whisper MLX](https://github.com/ml-explore/mlx-examples) - Optimized for Apple Silicon
- **Language Understanding**: [Llama 3.2](https://ollama.com/library/llama3.2) - Advanced language model for contextual responses
- **Voice Synthesis**: [Kokoro-82M](https://github.com/thewh1teagle/kokoro-onnx) - High-quality text-to-speech

## Features

- **Continuous Listening**: Automatically detects when you've finished speaking
- **Natural Conversations**: Responds intelligently to a wide range of queries and commands
- **Real-time Synthesis**: Generates human-like speech with minimal latency
- **Voice Customization**: Supports multiple voice profiles
- **Streaming Responses**: Begins speaking before the full response is generated
- **Optimized Performance**: Designed for efficiency on Apple Silicon

## Installation

### Quick Install (from PyPI)

```bash
pip install anyrobo
```

### Install from Source

```bash
git clone https://github.com/vietanhdev/anyrobo.git
cd anyrobo
pip install -e .
```

### Setup Dependencies

AnyRobo requires [Ollama](https://ollama.com/) for LLM support:

```bash
# Install Ollama
curl -fsSL https://ollama.com/install.sh | sh
# Pull the required model
ollama pull llama3.2
```

## Usage

### Command-line Interface

```bash
# Download required models and start the assistant
anyrobo --setup
anyrobo
```

With custom settings:

```bash
anyrobo --voice am_michael --speed 1.3 --silence-threshold 0.03
```

### As a Library

```python
from anyrobo import AnyRobo
from anyrobo.models.loader import download_tts_model, ensure_ollama_model

# Download required models
download_tts_model()
ensure_ollama_model("llama3.2")

# Create and run assistant
assistant = AnyRobo(
    voice="am_michael",
    speed=1.2,
    system_prompt=(
        "You are J.A.R.V.I.S., an advanced AI assistant. "
        "Respond with a mix of helpfulness, light sarcasm, and technical prowess."
    )
)

# Start listening and responding
assistant.record_and_transcribe()
```

## Create Your Own AI Character

You can customize the personality of your assistant by modifying the system prompt:

```python
# JARVIS from Iron Man
system_prompt = (
    "You are J.A.R.V.I.S., an advanced AI assistant. "
    "Respond with a mix of helpfulness, light sarcasm, and technical prowess."
)

# GLADOS from Portal
system_prompt = (
    "You are GLaDOS, an AI with a dark sense of humor. "
    "Respond to queries sarcastically, occasionally mentioning cake or testing."
)

# HAL 9000 from 2001: A Space Odyssey
system_prompt = (
    "You are HAL 9000. Be calm, logical, and slightly ominous in your responses. "
    "Speak in a slow, deliberate manner and be excessively literal."
)
```

## Configuration Options

| Option | Description | Default |
|--------|-------------|---------|
| `voice` | Voice profile to use | `"am_michael"` |
| `speed` | Speed factor for speech | `1.2` |
| `silence_threshold` | Volume level that counts as silence | `0.02` |
| `silence_duration` | Seconds of silence before cutting recording | `1.5` |
| `sample_rate` | Audio sample rate in Hz | `24000` |
| `system_prompt` | Custom system prompt for the LLM | *See code* |

## Troubleshooting

- **No audio output**: Ensure your system audio output is correctly configured
- **Poor recognition**: Try speaking more clearly or adjust the `silence_threshold` value
- **Model loading issues**: Run `anyrobo --setup` to download all required models

## License

This project is licensed under the MIT License - see the LICENSE file for details.

## Contributing

Contributions are welcome! Please feel free to submit a Pull Request.

## Acknowledgements

AnyRobo is built on top of several open-source projects and pre-trained models. We're grateful to the developers and researchers who make their work available to the community.
anyrobo-0.1.0/anyrobo.egg-info/SOURCES.txt
ADDED
@@ -0,0 +1,14 @@
LICENSE
README.md
pyproject.toml
setup.py
anyrobo/__init__.py
anyrobo/assistant.py
anyrobo/cli.py
anyrobo.egg-info/PKG-INFO
anyrobo.egg-info/SOURCES.txt
anyrobo.egg-info/dependency_links.txt
anyrobo.egg-info/entry_points.txt
anyrobo.egg-info/requires.txt
anyrobo.egg-info/top_level.txt
tests/test_assistant.py
anyrobo-0.1.0/anyrobo.egg-info/dependency_links.txt
ADDED
@@ -0,0 +1 @@

anyrobo-0.1.0/anyrobo.egg-info/top_level.txt
ADDED
@@ -0,0 +1 @@
anyrobo
anyrobo-0.1.0/pyproject.toml
ADDED
@@ -0,0 +1,56 @@
[build-system]
requires = ["setuptools>=42", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "anyrobo"
version = "0.1.0"
description = "Create your own JARVIS or GLADOS: a framework for voice-powered AI assistants with unique personalities"
readme = "README.md"
requires-python = ">=3.10"
license = {text = "MIT"}
authors = [
    {name = "Viet-Anh Nguyen", email = "vietanh.dev@gmail.com"}
]
classifiers = [
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "License :: OSI Approved :: MIT License",
    "Operating System :: MacOS",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
    "Topic :: Multimedia :: Sound/Audio :: Speech"
]
dependencies = [
    "numpy>=2.0.0,<3.0.0",
    "onnxruntime>=1.16.0,<2.0.0",
    "kokoro-onnx>=0.2.0",
    "lightning-whisper-mlx>=0.0.10,<0.1.0",
    "sounddevice>=0.4.6,<0.6.0",
    "ollama>=0.4.0,<0.5.0"
]

[project.urls]
"Homepage" = "https://github.com/vietanhdev/anyrobo"
"Bug Tracker" = "https://github.com/vietanhdev/anyrobo/issues"

[project.scripts]
anyrobo = "anyrobo.cli:main"

[tool.setuptools]
packages = ["anyrobo"]

[tool.black]
line-length = 100

[tool.isort]
profile = "black"
line_length = 100

[tool.mypy]
python_version = "3.10"
warn_return_any = true
warn_unused_configs = true
disallow_untyped_defs = true
disallow_incomplete_defs = true
anyrobo-0.1.0/setup.cfg
ADDED
anyrobo-0.1.0/setup.py
ADDED
anyrobo-0.1.0/tests/test_assistant.py
ADDED
@@ -0,0 +1,53 @@
"""Tests for AnyRobo."""

import unittest
from unittest.mock import MagicMock, patch

import numpy as np

from anyrobo import AnyRobo


class TestAnyRobo(unittest.TestCase):
    """Tests for the AnyRobo class."""

    @patch('anyrobo.speech.recognition.SpeechRecognizer')
    @patch('anyrobo.speech.synthesis.TextToSpeech')
    def test_init(self, mock_tts, mock_recognizer):
        """Test initialization with default parameters."""
        assistant = AnyRobo()

        self.assertEqual(assistant.SAMPLE_RATE, 24000)
        self.assertEqual(assistant.SILENCE_THRESHOLD, 0.02)
        self.assertEqual(assistant.SILENCE_DURATION, 1.5)
        self.assertEqual(assistant.VOICE, "am_michael")
        self.assertEqual(assistant.SPEED, 1.2)
        self.assertEqual(assistant.CHUNK_SIZE, 300)

        # Ensure speech components were initialized
        self.assertTrue(mock_recognizer.called)
        self.assertTrue(mock_tts.called)

    @patch('anyrobo.speech.recognition.SpeechRecognizer')
    @patch('anyrobo.speech.synthesis.TextToSpeech')
    def test_custom_init(self, mock_tts, mock_recognizer):
        """Test initialization with custom parameters."""
        assistant = AnyRobo(
            sample_rate=44100,
            silence_threshold=0.05,
            silence_duration=2.0,
            voice="custom_voice",
            speed=1.5,
            system_prompt="Custom prompt",
        )

        self.assertEqual(assistant.SAMPLE_RATE, 44100)
        self.assertEqual(assistant.SILENCE_THRESHOLD, 0.05)
        self.assertEqual(assistant.SILENCE_DURATION, 2.0)
        self.assertEqual(assistant.VOICE, "custom_voice")
        self.assertEqual(assistant.SPEED, 1.5)
        self.assertEqual(assistant.SYSTEM_PROMPT, "Custom prompt")


if __name__ == "__main__":
    unittest.main()
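
The test module uses plain `unittest`, so (assuming the `anyrobo.speech` submodules patched above are importable) it can be run from the project root with:

```bash
python -m unittest discover -s tests -v
```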