miscai 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- miscai-0.1.0/.gitignore +2 -0
- miscai-0.1.0/LICENSE +25 -0
- miscai-0.1.0/PKG-INFO +193 -0
- miscai-0.1.0/README.md +135 -0
- miscai-0.1.0/pyproject.toml +45 -0
- miscai-0.1.0/src/miscai/__init__.py +0 -0
- miscai-0.1.0/src/miscai/dlm.py +195 -0
- miscai-0.1.0/src/miscai/llm.py +190 -0
- miscai-0.1.0/src/miscai/stt.py +19 -0
- miscai-0.1.0/src/miscai/tools.py +38 -0
- miscai-0.1.0/src/miscai/tts.py +22 -0
- miscai-0.1.0/src/miscai/vad.py +24 -0
- miscai-0.1.0/src/miscai/wakeword.py +12 -0
miscai-0.1.0/.gitignore
ADDED
miscai-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
The MIT License (MIT)
|
|
2
|
+
=====================
|
|
3
|
+
|
|
4
|
+
Copyright © '2026' 'Daniel (pui4)'
|
|
5
|
+
|
|
6
|
+
Permission is hereby granted, free of charge, to any person
|
|
7
|
+
obtaining a copy of this software and associated documentation
|
|
8
|
+
files (the “Software”), to deal in the Software without
|
|
9
|
+
restriction, including without limitation the rights to use,
|
|
10
|
+
copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
11
|
+
copies of the Software, and to permit persons to whom the
|
|
12
|
+
Software is furnished to do so, subject to the following
|
|
13
|
+
conditions:
|
|
14
|
+
|
|
15
|
+
The above copyright notice and this permission notice shall be
|
|
16
|
+
included in all copies or substantial portions of the Software.
|
|
17
|
+
|
|
18
|
+
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND,
|
|
19
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
20
|
+
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
21
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
22
|
+
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
23
|
+
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
24
|
+
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
25
|
+
OTHER DEALINGS IN THE SOFTWARE.
|
miscai-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: miscai
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Miscellaneous different AI tools for embedding into projects.
|
|
5
|
+
License: The MIT License (MIT)
|
|
6
|
+
=====================
|
|
7
|
+
|
|
8
|
+
Copyright © '2026' 'Daniel (pui4)'
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person
|
|
11
|
+
obtaining a copy of this software and associated documentation
|
|
12
|
+
files (the “Software”), to deal in the Software without
|
|
13
|
+
restriction, including without limitation the rights to use,
|
|
14
|
+
copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
15
|
+
copies of the Software, and to permit persons to whom the
|
|
16
|
+
Software is furnished to do so, subject to the following
|
|
17
|
+
conditions:
|
|
18
|
+
|
|
19
|
+
The above copyright notice and this permission notice shall be
|
|
20
|
+
included in all copies or substantial portions of the Software.
|
|
21
|
+
|
|
22
|
+
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND,
|
|
23
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
24
|
+
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
25
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
26
|
+
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
27
|
+
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
28
|
+
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
29
|
+
OTHER DEALINGS IN THE SOFTWARE.
|
|
30
|
+
License-File: LICENSE
|
|
31
|
+
Requires-Python: >=3.14
|
|
32
|
+
Provides-Extra: dlm
|
|
33
|
+
Requires-Dist: chromadb>=1.5.2; extra == 'dlm'
|
|
34
|
+
Requires-Dist: peft>=0.19.1; extra == 'dlm'
|
|
35
|
+
Requires-Dist: requests>=2.34.2; extra == 'dlm'
|
|
36
|
+
Requires-Dist: torch>=2.10.0; extra == 'dlm'
|
|
37
|
+
Requires-Dist: transformers>=4.57.3; extra == 'dlm'
|
|
38
|
+
Provides-Extra: llm
|
|
39
|
+
Requires-Dist: chromadb>=1.5.2; extra == 'llm'
|
|
40
|
+
Requires-Dist: ollama>=0.6.1; extra == 'llm'
|
|
41
|
+
Requires-Dist: peft>=0.19.1; extra == 'llm'
|
|
42
|
+
Requires-Dist: torch>=2.10.0; extra == 'llm'
|
|
43
|
+
Requires-Dist: transformers>=4.57.3; extra == 'llm'
|
|
44
|
+
Provides-Extra: stt
|
|
45
|
+
Requires-Dist: numpy>=2.4.6; extra == 'stt'
|
|
46
|
+
Requires-Dist: pywhispercpp>=1.4.1; extra == 'stt'
|
|
47
|
+
Provides-Extra: tts
|
|
48
|
+
Requires-Dist: numpy>=2.4.6; extra == 'tts'
|
|
49
|
+
Requires-Dist: pocket-tts>=1.1.1; extra == 'tts'
|
|
50
|
+
Requires-Dist: scipy>=1.17.1; extra == 'tts'
|
|
51
|
+
Provides-Extra: vad
|
|
52
|
+
Requires-Dist: numpy>=2.4.6; extra == 'vad'
|
|
53
|
+
Requires-Dist: torch>=2.10.0; extra == 'vad'
|
|
54
|
+
Requires-Dist: torchaudio>=2.10.0; extra == 'vad'
|
|
55
|
+
Provides-Extra: wake
|
|
56
|
+
Requires-Dist: local-wake>=0.1.1; extra == 'wake'
|
|
57
|
+
Description-Content-Type: text/markdown
|
|
58
|
+
|
|
59
|
+
# MiscAI
|
|
60
|
+
Tools to make it easier to embed AI-related things into your existing projects.
|
|
61
|
+
|
|
62
|
+
## Features
|
|
63
|
+
- Ollama local LLM wrapper (with built-in Jina embeddings model)
|
|
64
|
+
- Mercury diffusion language model (with the same embeddings as with Ollama, requires a mercury API key)
|
|
65
|
+
- Easy tool function creation with both types of language models
|
|
66
|
+
- Whisper-cpp speech to text
|
|
67
|
+
- Text to speech cloning with pocket-tts (requires a hugging face account)
|
|
68
|
+
- Voice activity detection with silero
|
|
69
|
+
- Wakeword detection with local wake
|
|
70
|
+
|
|
71
|
+
## Ollama local LLM
|
|
72
|
+
To use the local LLM with Ollama, the host must already have Ollama installed. The embeddings model is run on the CPU as it is light and performant enough. Then install it into the Python project with:
|
|
73
|
+
```sh
|
|
74
|
+
pip install miscai[llm]
|
|
75
|
+
```
|
|
76
|
+
Here is an example of how to use it:
|
|
77
|
+
```python
|
|
78
|
+
from miscai.llm import LLM
|
|
79
|
+
|
|
80
|
+
llm = LLM(promt="You are helpful assistant.", model="qwen3:4b", convo_file="./convo.json")
|
|
81
|
+
print(llm.ask_LLM("Hello!"))
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
## Diffusion Language Model (DLM)
|
|
85
|
+
Use of the DLM module is very similar to using the local LLM. You will need to create a Mercury API key to use the wrapper. The embeddings model is run on the CPU as it is light and performant enough. Then install it into the Python project with:
|
|
86
|
+
```sh
|
|
87
|
+
pip install miscai[dlm]
|
|
88
|
+
```
|
|
89
|
+
Here is an example of how to use it:
|
|
90
|
+
```python
|
|
91
|
+
from miscai.dlm import DLM
|
|
92
|
+
|
|
93
|
+
dlm = DLM(promt="You are helpful assistant.", model="mercury-v2", convo_file="./convo.json", api_key="123abc")
|
|
94
|
+
print(dlm.ask_LLM("Hello!"))
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
## Tool calling for Language models
|
|
98
|
+
This requires that you have installed one of the language models above. Here is an example of how to use it in a project:
|
|
99
|
+
```python
|
|
100
|
+
from miscai.tools import ToolLoader
|
|
101
|
+
|
|
102
|
+
tool_loader = ToolLoader("./tools")
|
|
103
|
+
```
|
|
104
|
+
Then when you are creating your language model object, create it like this (using the Ollama one for example):
|
|
105
|
+
```python
|
|
106
|
+
llm = LLM(promt="You are helpful assistant.", model="qwen3:4b", convo_file="./convo.json", tools=tool_loader.get_tools())
|
|
107
|
+
```
|
|
108
|
+
To create a tool, install the required dependencies to your project and place the python file in the directory specified in the ToolLoader object. Here is an example tool that gets the time with the 'pytz' package:
|
|
109
|
+
```python
|
|
110
|
+
import pytz
|
|
111
|
+
from datetime import datetime
|
|
112
|
+
|
|
113
|
+
tool = {
|
|
114
|
+
"type": "function",
|
|
115
|
+
"function": {
|
|
116
|
+
"name": "get_current_time",
|
|
117
|
+
"description": "Gets the current time for a provided time zone.",
|
|
118
|
+
'parameters': {
|
|
119
|
+
"type": "object",
|
|
120
|
+
"properties": {
|
|
121
|
+
"timezone": {
|
|
122
|
+
"type": "string",
|
|
123
|
+
"description": "The time zone as specified in the tz (zoneinfo) library."
|
|
124
|
+
}
|
|
125
|
+
},
|
|
126
|
+
"required": ["timezone"]
|
|
127
|
+
}
|
|
128
|
+
}
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
def get_current_time(timezone: str) -> str:
|
|
132
|
+
tz = pytz.timezone(timezone)
|
|
133
|
+
return str(datetime.now(tz))
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
## Speech to text
|
|
137
|
+
The whisper-cpp model is run on the CPU; it is quite performant and accurate, but may use a lot of CPU. The audio bytes are in a numpy array sampled at 16 kHz (16000 Hz) with 512 byte chunks. Install it into your project with:
|
|
138
|
+
```sh
|
|
139
|
+
pip install miscai[stt]
|
|
140
|
+
```
|
|
141
|
+
Here is an example of how to use it:
|
|
142
|
+
```python
|
|
143
|
+
from miscai.stt import STT
|
|
144
|
+
|
|
145
|
+
stt = STT()
|
|
146
|
+
print(stt.transcribe(audio_bytes=audio_bytes))
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
## Text to speech
|
|
150
|
+
You will have to create a Hugging Face account and accept the EULA for using the pocket-tts model. Then create an API key as you will need this later. The model is run on the CPU for reasons mentioned on the model's page. The audio base needs to be sampled at 16 kHz (16000 Hz) for the best results. The output audio is a numpy array. Then install it into the project using:
|
|
151
|
+
```sh
|
|
152
|
+
pip install miscai[tts]
|
|
153
|
+
```
|
|
154
|
+
Here is an example of how to use it:
|
|
155
|
+
```python
|
|
156
|
+
from miscai.tts import TTS
|
|
157
|
+
|
|
158
|
+
tts = TTS("./voice_base.wav")
|
|
159
|
+
audio = tts.get_audio("Hello!")
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
## Voice activity detection
|
|
163
|
+
This uses Silero VAD for voice detection and it runs on the CPU because the model is lightweight enough. The input audio is a numpy array with 512 byte chunks. For best results the audio should be sampled at 16 kHz (16000 Hz). Install it into your project with:
|
|
164
|
+
```sh
|
|
165
|
+
pip install miscai[vad]
|
|
166
|
+
```
|
|
167
|
+
Here is an example of how to use it:
|
|
168
|
+
```python
|
|
169
|
+
from miscai.vad import VAD
|
|
170
|
+
|
|
171
|
+
vad = VAD(threshold=0.5)
|
|
172
|
+
print(vad.is_speech(audio_bytes=audio_bytes))
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
## Wakeword detection
|
|
176
|
+
This uses local-wake, so any audio file of any person saying the wakeword works as the wakeword. The input audio is an audio stream with 512 byte chunks (best obtained through SoundDevice). For best results the input audio and the reference audio files should be sampled at 16 kHz (16000 Hz). Install it into your project with:
|
|
177
|
+
```sh
|
|
178
|
+
pip install miscai[wake]
|
|
179
|
+
```
|
|
180
|
+
Here is an example of how to use it:
|
|
181
|
+
```python
|
|
182
|
+
from miscai.wakeword import WakeWord
|
|
183
|
+
|
|
184
|
+
wake_word = WakeWord(threshold=0.5, audio_dir="./wakeword")
|
|
185
|
+
print("Begining to wait for wakeword")
|
|
186
|
+
wake_word.waitForWord(callback=awoke, stream=stream)
|
|
187
|
+
|
|
188
|
+
def awoke(detection: dict, stream: sd.InputStream):
|
|
189
|
+
print(f"Wake word detected: {detection['wakeword']}")
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
## Final notes
|
|
193
|
+
This project is at quite an early stage and could do with some more polish. It is not meant to be used in production but is good for making small prototypes for different ideas that you may have. I made this to make it easier to integrate LLMs into my other projects and it grew from there. Changes are welcome, as the documentation is hastily written and the code is arguably worse. Thanks for reading and maybe using this. :)
|
miscai-0.1.0/README.md
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
# MiscAI
|
|
2
|
+
Tools to make it easier to embed AI-related things into your existing projects.
|
|
3
|
+
|
|
4
|
+
## Features
|
|
5
|
+
- Ollama local LLM wrapper (with built-in Jina embeddings model)
|
|
6
|
+
- Mercury diffusion language model (with the same embeddings as with Ollama, requires a mercury API key)
|
|
7
|
+
- Easy tool function creation with both types of language models
|
|
8
|
+
- Whisper-cpp speech to text
|
|
9
|
+
- Text to speech cloning with pocket-tts (requires a hugging face account)
|
|
10
|
+
- Voice activity detection with silero
|
|
11
|
+
- Wakeword detection with local wake
|
|
12
|
+
|
|
13
|
+
## Ollama local LLM
|
|
14
|
+
To use the local LLM with Ollama, the host must already have Ollama installed. The embeddings model is run on the CPU as it is light and performant enough. Then install it into the Python project with:
|
|
15
|
+
```sh
|
|
16
|
+
pip install miscai[llm]
|
|
17
|
+
```
|
|
18
|
+
Here is an example of how to use it:
|
|
19
|
+
```python
|
|
20
|
+
from miscai.llm import LLM
|
|
21
|
+
|
|
22
|
+
llm = LLM(promt="You are helpful assistant.", model="qwen3:4b", convo_file="./convo.json")
|
|
23
|
+
print(llm.ask_LLM("Hello!"))
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
## Diffusion Language Model (DLM)
|
|
27
|
+
Use of the DLM module is very similar to using the local LLM. You will need to create a Mercury API key to use the wrapper. The embeddings model is run on the CPU as it is light and performant enough. Then install it into the Python project with:
|
|
28
|
+
```sh
|
|
29
|
+
pip install miscai[dlm]
|
|
30
|
+
```
|
|
31
|
+
Here is an example of how to use it:
|
|
32
|
+
```python
|
|
33
|
+
from miscai.dlm import DLM
|
|
34
|
+
|
|
35
|
+
dlm = DLM(promt="You are helpful assistant.", model="mercury-v2", convo_file="./convo.json", api_key="123abc")
|
|
36
|
+
print(dlm.ask_LLM("Hello!"))
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Tool calling for Language models
|
|
40
|
+
This requires that you have installed one of the language models above. Here is an example of how to use it in a project:
|
|
41
|
+
```python
|
|
42
|
+
from miscai.tools import ToolLoader
|
|
43
|
+
|
|
44
|
+
tool_loader = ToolLoader("./tools")
|
|
45
|
+
```
|
|
46
|
+
Then when you are creating your language model object, create it like this (using the Ollama one for example):
|
|
47
|
+
```python
|
|
48
|
+
llm = LLM(promt="You are helpful assistant.", model="qwen3:4b", convo_file="./convo.json", tools=tool_loader.get_tools())
|
|
49
|
+
```
|
|
50
|
+
To create a tool, install the required dependencies to your project and place the python file in the directory specified in the ToolLoader object. Here is an example tool that gets the time with the 'pytz' package:
|
|
51
|
+
```python
|
|
52
|
+
import pytz
|
|
53
|
+
from datetime import datetime
|
|
54
|
+
|
|
55
|
+
tool = {
|
|
56
|
+
"type": "function",
|
|
57
|
+
"function": {
|
|
58
|
+
"name": "get_current_time",
|
|
59
|
+
"description": "Gets the current time for a provided time zone.",
|
|
60
|
+
'parameters': {
|
|
61
|
+
"type": "object",
|
|
62
|
+
"properties": {
|
|
63
|
+
"timezone": {
|
|
64
|
+
"type": "string",
|
|
65
|
+
"description": "The time zone as specified in the tz (zoneinfo) library."
|
|
66
|
+
}
|
|
67
|
+
},
|
|
68
|
+
"required": ["timezone"]
|
|
69
|
+
}
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
def get_current_time(timezone: str) -> str:
|
|
74
|
+
tz = pytz.timezone(timezone)
|
|
75
|
+
return str(datetime.now(tz))
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
## Speech to text
|
|
79
|
+
The whisper-cpp model is run on the CPU; it is quite performant and accurate, but may use a lot of CPU. The audio bytes are in a numpy array sampled at 16 kHz (16000 Hz) with 512 byte chunks. Install it into your project with:
|
|
80
|
+
```sh
|
|
81
|
+
pip install miscai[stt]
|
|
82
|
+
```
|
|
83
|
+
Here is an example of how to use it:
|
|
84
|
+
```python
|
|
85
|
+
from miscai.stt import STT
|
|
86
|
+
|
|
87
|
+
stt = STT()
|
|
88
|
+
print(stt.transcribe(audio_bytes=audio_bytes))
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
## Text to speech
|
|
92
|
+
You will have to create a Hugging Face account and accept the EULA for using the pocket-tts model. Then create an API key as you will need this later. The model is run on the CPU for reasons mentioned on the model's page. The audio base needs to be sampled at 16 kHz (16000 Hz) for the best results. The output audio is a numpy array. Then install it into the project using:
|
|
93
|
+
```sh
|
|
94
|
+
pip install miscai[tts]
|
|
95
|
+
```
|
|
96
|
+
Here is an example of how to use it:
|
|
97
|
+
```python
|
|
98
|
+
from miscai.tts import TTS
|
|
99
|
+
|
|
100
|
+
tts = TTS("./voice_base.wav")
|
|
101
|
+
audio = tts.get_audio("Hello!")
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
## Voice activity detection
|
|
105
|
+
This uses Silero VAD for voice detection and it runs on the CPU because the model is lightweight enough. The input audio is a numpy array with 512 byte chunks. For best results the audio should be sampled at 16 kHz (16000 Hz). Install it into your project with:
|
|
106
|
+
```sh
|
|
107
|
+
pip install miscai[vad]
|
|
108
|
+
```
|
|
109
|
+
Here is an example of how to use it:
|
|
110
|
+
```python
|
|
111
|
+
from miscai.vad import VAD
|
|
112
|
+
|
|
113
|
+
vad = VAD(threshold=0.5)
|
|
114
|
+
print(vad.is_speech(audio_bytes=audio_bytes))
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
## Wakeword detection
|
|
118
|
+
This uses local-wake, so any audio file of any person saying the wakeword works as the wakeword. The input audio is an audio stream with 512 byte chunks (best obtained through SoundDevice). For best results the input audio and the reference audio files should be sampled at 16 kHz (16000 Hz). Install it into your project with:
|
|
119
|
+
```sh
|
|
120
|
+
pip install miscai[wake]
|
|
121
|
+
```
|
|
122
|
+
Here is an example of how to use it:
|
|
123
|
+
```python
|
|
124
|
+
from miscai.wakeword import WakeWord
|
|
125
|
+
|
|
126
|
+
wake_word = WakeWord(threshold=0.5, audio_dir="./wakeword")
|
|
127
|
+
print("Begining to wait for wakeword")
|
|
128
|
+
wake_word.waitForWord(callback=awoke, stream=stream)
|
|
129
|
+
|
|
130
|
+
def awoke(detection: dict, stream: sd.InputStream):
|
|
131
|
+
print(f"Wake word detected: {detection['wakeword']}")
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
## Final notes
|
|
135
|
+
This project is at quite an early stage and could do with some more polish. It is not meant to be used in production but is good for making small prototypes for different ideas that you may have. I made this to make it easier to integrate LLMs into my other projects and it grew from there. Changes are welcome, as the documentation is hastily written and the code is arguably worse. Thanks for reading and maybe using this. :)
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "miscai"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Miscellaneous different AI tools for embedding into projects."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.14"
|
|
11
|
+
license = { file = "LICENSE" }
|
|
12
|
+
|
|
13
|
+
# Optional component dependencies
|
|
14
|
+
[project.optional-dependencies]
|
|
15
|
+
dlm = [
|
|
16
|
+
"chromadb>=1.5.2",
|
|
17
|
+
"torch>=2.10.0",
|
|
18
|
+
"transformers>=4.57.3",
|
|
19
|
+
"requests>=2.34.2",
|
|
20
|
+
"peft>=0.19.1",
|
|
21
|
+
]
|
|
22
|
+
llm = [
|
|
23
|
+
"chromadb>=1.5.2",
|
|
24
|
+
"torch>=2.10.0",
|
|
25
|
+
"transformers>=4.57.3",
|
|
26
|
+
"ollama>=0.6.1",
|
|
27
|
+
"peft>=0.19.1",
|
|
28
|
+
]
|
|
29
|
+
stt = [
|
|
30
|
+
"numpy>=2.4.6",
|
|
31
|
+
"pywhispercpp>=1.4.1",
|
|
32
|
+
]
|
|
33
|
+
tts = [
|
|
34
|
+
"numpy>=2.4.6",
|
|
35
|
+
"pocket-tts>=1.1.1",
|
|
36
|
+
"scipy>=1.17.1",
|
|
37
|
+
]
|
|
38
|
+
vad = [
|
|
39
|
+
"torch>=2.10.0",
|
|
40
|
+
"torchaudio>=2.10.0",
|
|
41
|
+
"numpy>=2.4.6",
|
|
42
|
+
]
|
|
43
|
+
wake = [
|
|
44
|
+
"local-wake>=0.1.1",
|
|
45
|
+
]
|
|
File without changes
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import json
|
|
3
|
+
# The heavy dependencies are only installed with the optional 'dlm' extra.
try:
    import torch
    import chromadb
    import requests
    from transformers import AutoModel
    from chromadb import EmbeddingFunction, Embeddings, Documents
except ImportError as exc:
    # Only a genuinely missing package is reported as a missing extra; the
    # original error is chained so the failing module stays visible. Other
    # failures (e.g. KeyboardInterrupt) are no longer masked by this message.
    raise ImportError("The 'dlm' module is required to use this. Install it with 'pip install miscai[dlm]'.") from exc
|
|
11
|
+
|
|
12
|
+
class DLM():
    """Chat wrapper around the Inception Labs chat-completions endpoint.

    Keeps a conversation history that is persisted to a JSON file, stores
    every exchange in a ChromaDB collection named "ltm" (embedded with a Jina
    embeddings model running on the CPU), and executes tool calls requested
    by the model until it produces a final answer.
    """

    # Endpoint every chat request is posted to.
    API_URL = "https://api.inceptionlabs.ai/v1/chat/completions"
    # Number of most recent history messages sent with each request.
    HISTORY_WINDOW = 10

    def __init__(self,
                 promt: str,
                 model: str,
                 convo_file: str,
                 api_key: str,
                 tools: tuple = (None, None),
                 think: str = "high",
                 memory_path: str = "./memory_db",
                 ) -> None:
        """Load the embeddings model, open the memory store and restore history.

        Args:
            promt: System prompt sent with every request.
            model: Model name passed to the API.
            convo_file: Path of the JSON file the conversation is saved to.
            api_key: Bearer token used for the API.
            tools: ``(definitions, functions)`` pair; ``definitions`` is sent
                as the request's ``tools`` field and ``functions`` maps tool
                names to callables. Both may be ``None``.
            think: Value sent as ``reasoning_effort``.
            memory_path: Directory of the persistent ChromaDB store.
        """
        self.PROMT = promt
        self.MODEL = model
        self.API_KEY = api_key

        tools_def, tools_fn = tools
        self.tools_def = tools_def
        # Normalise to a dict so tool lookups work when no tools were given.
        self.tools_fn = tools_fn or {}
        self.think = think
        self.convo_file = convo_file

        self.jina_model = AutoModel.from_pretrained(
            "jinaai/jina-embeddings-v5-text-small",
            trust_remote_code=True,
            dtype=torch.bfloat16,
            attn_implementation="sdpa"
        )
        self.jina_model = self.jina_model.to("cpu")

        chroma_client = chromadb.PersistentClient(path=memory_path)
        self.collection = chroma_client.get_or_create_collection(
            name="ltm",
            embedding_function=self.JinaEmbeddingFunction(self.jina_model)
        )

        self.query_ef = self.JinaQueryEmbeddingFunction(self.jina_model)

        # A missing or unreadable history file simply starts a new conversation;
        # anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        try:
            with open(self.convo_file, "r", encoding="utf-8") as file:
                loaded = json.load(file)
        except (OSError, ValueError):
            loaded = []
        self.messages = loaded if isinstance(loaded, list) else []
        self.msg_count = len(self.messages)

    def save_memory(self, text: str, memory_id: str) -> None:
        """Insert or replace one document in the memory collection."""
        self.collection.upsert(documents=[text], ids=[memory_id])

    def retrieve_memories(self, query: str, n: int = 3) -> str:
        """Return up to ``n`` stored documents closest to ``query``, newline-joined.

        Returns an empty string when the query yields no documents.
        """
        query_embedding = self.query_ef([query])[0]
        results = self.collection.query(query_embeddings=[query_embedding], n_results=n)
        docs = results["documents"][0]  # type: ignore
        return "\n".join(docs) if docs else ""

    def _recent_messages(self) -> list:
        """Return the tail of the history that is sent to the API.

        The window is widened backwards while it would begin with a ``tool``
        message, so a tool result is never sent without the assistant message
        that requested it.
        """
        start = max(len(self.messages) - self.HISTORY_WINDOW, 0)
        while start > 0 and self.messages[start].get("role") == "tool":
            start -= 1
        return self.messages[start:]

    def _chat(self, system_prompt: str) -> dict:
        """Post one chat request and return the first choice's message dict.

        Raises:
            requests.HTTPError: If the API answers with an error status.
        """
        payload = {
            "model": self.MODEL,
            "messages": [{"role": "system", "content": system_prompt}] + self._recent_messages(),
            "reasoning_effort": self.think,
        }
        # Leave the field out instead of sending null when no tools are configured.
        if self.tools_def:
            payload["tools"] = self.tools_def

        resp = requests.post(
            self.API_URL,
            headers={
                'Content-Type': 'application/json',
                'Authorization': f'Bearer {self.API_KEY}'
            },
            json=payload
        )
        # Fail with the HTTP status rather than a KeyError on "choices" below.
        resp.raise_for_status()
        resp_j = resp.json()
        print(f"API response: {resp_j}")
        return resp_j["choices"][0]["message"]

    def ask_LLM(self, text: str) -> str:
        """Send ``text`` to the model, run any requested tools, return the answer.

        The exchange is appended to the history, stored as a memory and the
        history is written back to ``convo_file``. Everything up to and
        including the first ``</think>`` is stripped from the returned text.
        """
        system_prompt = self.PROMT

        memories = self.retrieve_memories(text)
        if memories:
            system_prompt += f"\n\nRelevant memories:\n{memories}"

        self.messages.append({"role": "user", "content": text})

        # Keep asking until the model answers without requesting tools.
        while True:
            message = self._chat(system_prompt)
            reply = message.get("content") or ""
            self.messages.append(self._serialize_assistant_message(message))
            print(f"\n{reply}\n")

            # The key may be absent (or null) when no tool was requested.
            tool_calls = message.get("tool_calls")
            if not tool_calls:
                break
            self.messages.extend(self.call_tools(tool_calls))

        self.msg_count += 1
        self.save_memory(
            f"User: {text}\nAssistant: {reply}",
            memory_id=f"msg_{self.msg_count}"
        )

        with open(self.convo_file, "w", encoding="utf-8") as file:
            json.dump(self.messages, file)

        return re.sub(r".*?</think>", "", reply, flags=re.DOTALL).strip()

    def _run_tool(self, fn_name: str, raw_args) -> str:
        """Execute one tool and return its result (or an error text) as a string."""
        tool = (self.tools_fn or {}).get(fn_name)
        if tool is None:
            return f"Unknown tool: {fn_name}"

        # Arguments normally arrive as a JSON string; report malformed JSON to
        # the model instead of crashing the conversation.
        try:
            if isinstance(raw_args, str):
                fn_args = json.loads(raw_args) if raw_args else {}
            else:
                fn_args = raw_args or {}
        except ValueError as e:
            return f"Error: {e}"

        print(f"CALLING TOOL {fn_name}({fn_args})")
        try:
            out = tool(**fn_args)
        except Exception as e:
            # Tool failures are fed back to the model as text.
            return f"Error: {e}"
        return str(out) if out is not None else "Done."

    def call_tools(self, tool_calls: list) -> list:
        """Run every requested tool call and return the ``tool`` role messages.

        Args:
            tool_calls: Tool-call dicts taken from the assistant message.

        Returns:
            One ``{"role": "tool", ...}`` message per call, in order.
        """
        results = []

        for call in tool_calls:
            fn = call["function"]
            fn_name = fn["name"]

            results.append({
                "role": "tool",
                "content": self._run_tool(fn_name, fn.get("arguments")),
                "tool_call_id": call.get("id", fn_name)
            })

        return results

    def _serialize_assistant_message(self, message) -> dict:
        """Reduce an API assistant message to the fields stored in the history."""
        msg = {"role": "assistant", "content": message.get("content") or ""}
        tool_calls = message.get("tool_calls")
        if tool_calls:
            msg["tool_calls"] = [
                {
                    "id": tc.get("id", tc["function"]["name"]),
                    # Preserve the call type so the message can be replayed as received.
                    "type": tc.get("type", "function"),
                    "function": {
                        "name": tc["function"]["name"],
                        "arguments": tc["function"]["arguments"]  # keep as raw string
                    }
                }
                for tc in tool_calls
            ]
        return msg

    class JinaEmbeddingFunction(EmbeddingFunction):
        """Embeds stored documents using the model's "document" prompt."""

        def __init__(self, model) -> None:
            super().__init__()

            self.jina_model = model

        def __call__(self, input: Documents) -> Embeddings:
            embeddings = self.jina_model.encode(
                texts=input,
                task="retrieval",
                prompt_name="document"
            )
            return embeddings.tolist()

    class JinaQueryEmbeddingFunction(EmbeddingFunction):
        """Embeds search queries using the model's "query" prompt."""

        def __init__(self, model) -> None:
            super().__init__()

            self.jina_model = model

        def __call__(self, input: Documents) -> Embeddings:
            embeddings = self.jina_model.encode(
                texts=input,
                task="retrieval",
                prompt_name="query"
            )
            return embeddings.tolist()
|
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import json
|
|
3
|
+
# The heavy dependencies are only installed with the optional 'llm' extra.
try:
    import torch
    import chromadb
    from transformers import AutoModel
    from chromadb import EmbeddingFunction, Embeddings, Documents
    from ollama import chat
    from ollama import ChatResponse
except ImportError as exc:
    # Only a genuinely missing package is reported as a missing extra; the
    # original error is chained so the failing module stays visible. Other
    # failures (e.g. KeyboardInterrupt) are no longer masked by this message.
    raise ImportError("The 'llm' module is required to use this. Install it with 'pip install miscai[llm]'.") from exc
|
|
12
|
+
|
|
13
|
+
class LLM():
|
|
14
|
+
def __init__(self,
             promt: str,
             model: str,
             convo_file: str,
             tools: tuple = (None, None),
             think: bool = True,
             memory_path: str = "./memory_db",
             ) -> None:
    """Load the embeddings model, open the memory store and restore history.

    Args:
        promt: System prompt sent with every request.
        model: Model name passed to ``chat``.
        convo_file: Path of the JSON file the conversation is saved to.
        tools: ``(definitions, functions)`` pair; ``definitions`` is passed
            to ``chat`` as ``tools`` and ``functions`` maps tool names to
            callables. Both may be ``None``.
        think: Passed to ``chat`` as ``think``.
        memory_path: Directory of the persistent ChromaDB store.
    """
    self.PROMT = promt
    self.MODEL = model

    self.tools_def, self.tools_fn = tools
    self.think = think
    self.convo_file = convo_file

    self.jina_model = AutoModel.from_pretrained(
        "jinaai/jina-embeddings-v5-text-small",
        trust_remote_code=True,
        dtype=torch.bfloat16,
        attn_implementation="sdpa"
    )
    self.jina_model = self.jina_model.to("cpu")

    chroma_client = chromadb.PersistentClient(path=memory_path)
    self.collection = chroma_client.get_or_create_collection(
        name="ltm",
        embedding_function=self.JinaEmbeddingFunction(self.jina_model)
    )

    self.query_ef = self.JinaQueryEmbeddingFunction(self.jina_model)

    # A missing or unreadable history file simply starts a new conversation;
    # anything else (e.g. KeyboardInterrupt) is allowed to propagate.
    try:
        with open(self.convo_file, "r", encoding="utf-8") as file:
            loaded = json.load(file)
    except (OSError, ValueError):
        loaded = []
    self.messages = loaded if isinstance(loaded, list) else []
    self.msg_count = len(self.messages)
|
|
51
|
+
|
|
52
|
+
def save_memory(self, text: str, memory_id: str) -> None:
    """Insert or replace a single document in the memory collection.

    Args:
        text: Document text to store.
        memory_id: Identifier the document is stored under.
    """
    documents = [text]
    ids = [memory_id]
    self.collection.upsert(documents=documents, ids=ids)
|
|
54
|
+
|
|
55
|
+
def retrieve_memories(self, query: str, n: int = 3) -> str:
    """Look up the stored documents closest to ``query``.

    Args:
        query: Text that is embedded with the query embedding function.
        n: Number of results requested from the collection.

    Returns:
        The matching documents joined by newlines, or an empty string when
        the first result list is empty.
    """
    vectors = self.query_ef([query])
    hits = self.collection.query(query_embeddings=[vectors[0]], n_results=n)
    matched = hits["documents"][0]  # type: ignore
    if not matched:
        return ""
    return "\n".join(matched)
|
|
60
|
+
|
|
61
|
+
def _parse_arguments(self, arguments) -> dict:
|
|
62
|
+
"""Ensure tool call arguments are always a dict, never a JSON string."""
|
|
63
|
+
if isinstance(arguments, dict):
|
|
64
|
+
return arguments
|
|
65
|
+
if isinstance(arguments, str):
|
|
66
|
+
try:
|
|
67
|
+
parsed = json.loads(arguments)
|
|
68
|
+
return parsed if isinstance(parsed, dict) else {}
|
|
69
|
+
except (json.JSONDecodeError, ValueError):
|
|
70
|
+
return {}
|
|
71
|
+
return dict(arguments) if arguments else {}
|
|
72
|
+
|
|
73
|
+
def ask_LLM(self, text: str) -> str:
    """Send ``text`` to the model, run any requested tools, return the answer.

    Retrieved memories are appended to the system prompt. The user message,
    every assistant message and every tool result are appended to the
    history; only the last ten history messages are sent per request. After
    the final answer the exchange is saved as a memory and the history is
    written to ``convo_file``.

    Returns:
        The last reply with everything up to and including the first
        ``</think>`` removed and surrounding whitespace stripped.
    """
    system_prompt = self.PROMT

    memories = self.retrieve_memories(text)
    if memories:
        system_prompt += f"\n\nRelevant memories:\n{memories}"

    self.messages.append({"role": "user", "content": text})

    # One request per round; a round that asks for tools triggers another.
    while True:
        system_message = {"role": "system", "content": system_prompt}
        resp: ChatResponse = chat(
            model=self.MODEL,
            messages=[system_message] + self.messages[-10:],
            stream=False,
            think=self.think,
            tools=self.tools_def
        )

        reply = resp.message.content or ""
        self.messages.append(self._serialize_assistant_message(resp.message))
        print(f"\n{reply}\n")

        if not resp.message.tool_calls:
            break

        tool_messages = self.call_tools(resp.message.tool_calls)  # type: ignore
        self.messages.extend(tool_messages)

    self.msg_count += 1
    memory_id = f"msg_{self.msg_count}"
    self.save_memory(f"User: {text}\nAssistant: {reply}", memory_id=memory_id)

    with open(self.convo_file, "w") as file:
        json.dump(self.messages, file)

    without_thinking = re.sub(r".*?</think>", "", reply, flags=re.DOTALL)
    return without_thinking.strip()
|
|
120
|
+
|
|
121
|
+
def call_tools(self, tool_calls: list) -> list:
    """Run the requested tools and build one ``"tool"`` message per call.

    For each call the arguments are normalised with ``_parse_arguments``.
    A tool registered in ``self.tools_fn`` is invoked with those arguments
    as keyword arguments; its result is stringified (``"Done."`` when it
    returns ``None``). An exception raised by the tool is reported as
    ``"Error: <exception>"`` instead of propagating, and an unregistered
    name yields ``"Unknown tool: <name>"``.

    Args:
        tool_calls: Objects exposing ``function.name`` and
            ``function.arguments`` and, optionally, ``id``.

    Returns:
        A list of dicts with ``role``, ``content`` and ``tool_call_id``
        keys, in the same order as ``tool_calls``. ``tool_call_id`` falls
        back to the function name when the call has no ``id`` attribute.
    """
    tool_messages = []

    for tool_call in tool_calls:
        function = tool_call.function
        name = function.name
        kwargs = self._parse_arguments(function.arguments)

        if name not in self.tools_fn:
            content = f"Unknown tool: {name}"
        else:
            print(f"CALLING TOOL {name}({kwargs})")
            try:
                returned = self.tools_fn[name](**kwargs)
                content = "Done." if returned is None else str(returned)
            except Exception as exc:
                # A failing tool is reported back to the model as text.
                content = f"Error: {exc}"

        tool_messages.append({
            "role": "tool",
            "content": content,
            "tool_call_id": getattr(tool_call, "id", name)
        })

    return tool_messages
|
|
146
|
+
|
|
147
|
+
def _serialize_assistant_message(self, message) -> dict:
    """Turn an assistant response message into a plain dict.

    Args:
        message: Object exposing ``content`` and ``tool_calls``; each tool
            call exposes ``function.name``, ``function.arguments`` and,
            optionally, ``id``.

    Returns:
        ``{"role": "assistant", "content": ...}`` (``content`` becomes
        ``""`` when falsy). When the message has tool calls, a
        ``"tool_calls"`` list is added whose entries hold the call id
        (falling back to the function name) and the function name with
        its arguments normalised by ``_parse_arguments``.
    """
    serialized = {"role": "assistant", "content": message.content or ""}

    requested = message.tool_calls
    if not requested:
        return serialized

    entries = []
    for tc in requested:
        entries.append({
            "id": getattr(tc, "id", tc.function.name),
            "function": {
                "name": tc.function.name,
                # Always store as dict so reloaded messages stay valid
                "arguments": self._parse_arguments(tc.function.arguments)
            }
        })

    serialized["tool_calls"] = entries
    return serialized
|
|
163
|
+
|
|
164
|
+
class JinaEmbeddingFunction(EmbeddingFunction):
    """Embedding function that encodes inputs with the ``"document"`` prompt.

    Wraps a model object exposing
    ``encode(texts=..., task=..., prompt_name=...)``.
    """

    def __init__(self, model) -> None:
        """Keep ``model`` as ``self.jina_model`` for later ``encode`` calls."""
        super().__init__()
        self.jina_model = model

    def __call__(self, input: Documents) -> Embeddings:
        """Encode ``input`` for the ``"retrieval"`` task.

        Returns:
            The encoder output converted with ``.tolist()``.
        """
        encode = self.jina_model.encode
        vectors = encode(texts=input, task="retrieval", prompt_name="document")
        return vectors.tolist()
|
|
177
|
+
|
|
178
|
+
class JinaQueryEmbeddingFunction(EmbeddingFunction):
    """Embedding function that encodes inputs with the ``"query"`` prompt.

    Wraps a model object exposing
    ``encode(texts=..., task=..., prompt_name=...)``.
    """

    def __init__(self, model) -> None:
        """Keep ``model`` as ``self.jina_model`` for later ``encode`` calls."""
        super().__init__()
        self.jina_model = model

    def __call__(self, input: Documents) -> Embeddings:
        """Encode ``input`` for the ``"retrieval"`` task.

        Returns:
            The encoder output converted with ``.tolist()``.
        """
        encode = self.jina_model.encode
        vectors = encode(texts=input, task="retrieval", prompt_name="query")
        return vectors.tolist()
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import io
|
|
2
|
+
try:
|
|
3
|
+
import numpy as np
|
|
4
|
+
from pywhispercpp.model import Model
|
|
5
|
+
except:
|
|
6
|
+
raise ImportError("The 'stt' module is required to use this. Install it with 'pip install miscai[stt]'.")
|
|
7
|
+
|
|
8
|
+
class STT():
    """Speech-to-text wrapper around a pywhispercpp ``Model``."""

    def __init__(self, model_name: str = 'base.en') -> None:
        """Load the transcription model.

        Args:
            model_name: Identifier handed to ``Model``. Defaults to
                ``'base.en'``, the value that was previously hard-coded,
                so existing ``STT()`` callers are unaffected.
        """
        self.model = Model(model_name)

    def transcribe(self, audio_bytes: np.ndarray) -> str:
        """Transcribe audio and return the text, one segment per line.

        Args:
            audio_bytes: Audio samples passed straight to
                ``self.model.transcribe`` (despite the name this is
                annotated as an array, not raw bytes).

        Returns:
            The ``text`` of every returned segment, each followed by a
            newline; an empty string when there are no segments.
        """
        segments = self.model.transcribe(audio_bytes)

        # join() avoids rebuilding the string on every segment.
        return "".join(segment.text + "\n" for segment in segments)
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import importlib.util
|
|
3
|
+
|
|
4
|
+
class ToolLoader():
    """Discover tool plug-ins stored as ``.py`` files in a directory.

    A plug-in module must define ``tool``, a definition whose
    ``tool["function"]["name"]`` names an attribute of the same module.
    """

    def __init__(self, tools_dir: str) -> None:
        """Remember the directory that ``get_tools`` scans."""
        self.tools_dir = tools_dir

    def get_tools(self) -> tuple[list, dict]:
        """Load every valid plug-in found in ``self.tools_dir``.

        Files that do not end in ``.py`` or that start with ``_`` are
        ignored. A plug-in that fails to import, lacks a ``tool``
        definition, has a malformed definition, or names a function the
        module does not provide is reported on stdout and skipped, so one
        bad file never prevents the others from loading.

        Returns:
            ``(tools_list, func_map)``: the tool definitions in filename
            order, and a mapping from function name to the loaded callable.

        Raises:
            OSError: If ``self.tools_dir`` cannot be listed.
        """
        tools_list = []
        func_map = {}

        # os.listdir() returns entries in arbitrary order; sort so the tool
        # list is the same on every run and platform.
        for filename in sorted(os.listdir(self.tools_dir)):
            if not filename.endswith(".py") or filename.startswith("_"):
                continue

            module_name = filename[:-3]
            filepath = os.path.join(self.tools_dir, filename)

            try:
                spec = importlib.util.spec_from_file_location(module_name, filepath)
                if spec is None or spec.loader is None:
                    raise ImportError(f"cannot build an import spec for {filepath}")
                module = importlib.util.module_from_spec(spec)
                spec.loader.exec_module(module)
            except Exception as e:
                # Plug-in boundary: arbitrary user code runs here, so any
                # failure is reported and the file is skipped.
                print(f"TOOL {filename} FAILED TO LOAD: {e}")
                continue

            if not hasattr(module, "tool"):
                print(f"TOOL: {filename} DOESN'T HAVE TOOL DEFINITION!")
                continue

            tool_def = module.tool
            try:
                fn_name = tool_def["function"]["name"]
            except (KeyError, TypeError):
                fn_name = None

            if not isinstance(fn_name, str):
                print(f"TOOL {filename} HAS A MALFORMED TOOL DEFINITION!")
                continue

            if not hasattr(module, fn_name):
                print(f"TOOL {filename}'S FUNCTION DOESN'T MATCH DEFINITION!")
                continue

            tools_list.append(tool_def)
            func_map[fn_name] = getattr(module, fn_name)
            print(f"LOADED {filename} SUCCESSFULLY!")

        return tools_list, func_map
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import math
|
|
2
|
+
try:
|
|
3
|
+
import numpy as np
|
|
4
|
+
from pocket_tts import TTSModel
|
|
5
|
+
from scipy.signal import resample_poly
|
|
6
|
+
except:
|
|
7
|
+
raise ImportError("The 'tts' module is required to use this. Install it with 'pip install miscai[tts]'.")
|
|
8
|
+
|
|
9
|
+
class TTS():
    """Text-to-speech wrapper around a pocket_tts ``TTSModel`` with one voice."""

    def __init__(self, voice_base_file: str) -> None:
        """Load the TTS model and derive the voice state.

        Args:
            voice_base_file: Audio prompt handed to
                ``get_state_for_audio_prompt``; the resulting state is
                reused for every ``get_audio`` call.
        """
        self.tts_model = TTSModel.load_model()
        self.voice_state = self.tts_model.get_state_for_audio_prompt(
            voice_base_file
        )

    def get_audio(self, text: str, target_rate: int = 44100):
        """Synthesize ``text`` and resample it to ``target_rate``.

        Args:
            text: Text to speak.
            target_rate: Output sample rate in Hz. Defaults to 44100, the
                rate that was previously hard-coded.

        Returns:
            ``(samples, target_rate)`` where ``samples`` is a ``float32``
            NumPy array.
        """
        audio = self.tts_model.generate_audio(self.voice_state, text)

        source_rate = self.tts_model.sample_rate
        # Reduce the rate ratio so the polyphase filter uses the smallest
        # possible up/down factors.
        gcd = math.gcd(source_rate, target_rate)
        audio_resamp = resample_poly(
            audio.numpy(), target_rate // gcd, source_rate // gcd
        ).astype(np.float32)

        return audio_resamp, target_rate
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
try:
|
|
2
|
+
import torch
|
|
3
|
+
import numpy as np
|
|
4
|
+
except:
|
|
5
|
+
raise ImportError("The 'vad' module is required to use this. Install it with 'pip install miscai[vad]'.")
|
|
6
|
+
|
|
7
|
+
class VAD():
    """Voice-activity detector using the ``silero_vad`` model from torch.hub."""

    def __init__(self, threshold: float) -> None:
        """Load the model and store the detection settings.

        Args:
            threshold: Speech probability that must be exceeded for
                ``is_speech`` to return ``True``.
        """
        loaded = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad')  # type: ignore
        self.model, self.utils = loaded

        self.THRESHOLD = threshold
        self.SAMPLING_RATE = 16000
        self.CHUNK = 512

    def is_speech(self, audio_bytes: np.ndarray) -> bool:
        """Report whether the model rates ``audio_bytes`` as speech.

        The array is flattened and cast to ``float32`` before it is handed
        to the model together with ``self.SAMPLING_RATE``.

        Returns:
            ``True`` when the returned probability is strictly greater
            than ``self.THRESHOLD``, otherwise ``False``.
        """
        samples = torch.from_numpy(audio_bytes.flatten().astype(np.float32))
        probability = self.model(samples, self.SAMPLING_RATE).item()
        return bool(probability > self.THRESHOLD)
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
try:
|
|
2
|
+
import lwake
|
|
3
|
+
except:
|
|
4
|
+
raise ImportError("The 'wake' module is required to use this. Install it with 'pip install miscai[wake]'.")
|
|
5
|
+
|
|
6
|
+
class WakeWord():
    """Thin wrapper that forwards wake-word settings to ``lwake.listen``."""

    def __init__(self, threshold: float, audio_dir: str) -> None:
        """Store the detection threshold and the reference audio directory."""
        self.THRESHOLD, self.REF_DIR = threshold, audio_dir

    def waitForWord(self, callback, stream = None ) -> None:
        """Call ``lwake.listen`` with the stored settings.

        Args:
            callback: Forwarded to ``lwake.listen`` as ``callback``.
            stream: Forwarded to ``lwake.listen`` as ``stream``
                (``None`` by default).
        """
        # NOTE(review): presumably blocks while listening - confirm against
        # the lwake documentation.
        listen_options = {
            "threshold": self.THRESHOLD,
            "method": "embedding",
            "callback": callback,
            "stream": stream,
        }
        lwake.listen(self.REF_DIR, **listen_options)
|