ailia-speech 1.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ailia_speech-1.5.0/PKG-INFO +163 -0
- ailia_speech-1.5.0/README.md +141 -0
- ailia_speech-1.5.0/ailia_speech/LICENSE_AILIA_EN.pdf +0 -0
- ailia_speech-1.5.0/ailia_speech/LICENSE_AILIA_JA.pdf +0 -0
- ailia_speech-1.5.0/ailia_speech/__init__.py +762 -0
- ailia_speech-1.5.0/ailia_speech/linux/arm64-v8a/libailia_speech.so +0 -0
- ailia_speech-1.5.0/ailia_speech/linux/x64/libailia_speech.so +0 -0
- ailia_speech-1.5.0/ailia_speech/mac/libailia_speech.dylib +0 -0
- ailia_speech-1.5.0/ailia_speech/windows/x64/ailia_speech.dll +0 -0
- ailia_speech-1.5.0/ailia_speech.egg-info/PKG-INFO +163 -0
- ailia_speech-1.5.0/ailia_speech.egg-info/SOURCES.txt +14 -0
- ailia_speech-1.5.0/ailia_speech.egg-info/dependency_links.txt +1 -0
- ailia_speech-1.5.0/ailia_speech.egg-info/requires.txt +2 -0
- ailia_speech-1.5.0/ailia_speech.egg-info/top_level.txt +1 -0
- ailia_speech-1.5.0/setup.cfg +4 -0
- ailia_speech-1.5.0/setup.py +73 -0
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ailia_speech
|
|
3
|
+
Version: 1.5.0
|
|
4
|
+
Summary: ailia AI Speech
|
|
5
|
+
Home-page: https://ailia.ai/en/
|
|
6
|
+
Author: ailia Inc.
|
|
7
|
+
Author-email: contact@ailia.ai
|
|
8
|
+
License: https://ailia.ai/en/license/
|
|
9
|
+
Requires-Python: >3.6
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
Requires-Dist: ailia
|
|
12
|
+
Requires-Dist: ailia_tokenizer
|
|
13
|
+
Dynamic: author
|
|
14
|
+
Dynamic: author-email
|
|
15
|
+
Dynamic: description
|
|
16
|
+
Dynamic: description-content-type
|
|
17
|
+
Dynamic: home-page
|
|
18
|
+
Dynamic: license
|
|
19
|
+
Dynamic: requires-dist
|
|
20
|
+
Dynamic: requires-python
|
|
21
|
+
Dynamic: summary
|
|
22
|
+
|
|
23
|
+
# ailia AI Speech Python API
|
|
24
|
+
|
|
25
|
+
!! CAUTION !!
|
|
26
|
+
“ailia” IS NOT OPEN SOURCE SOFTWARE (OSS).
|
|
27
|
+
As long as user complies with the conditions stated in [License Document](https://ailia.ai/license/), user may use the Software for free of charge, but the Software is basically paid software.
|
|
28
|
+
|
|
29
|
+
## About ailia AI Speech
|
|
30
|
+
|
|
31
|
+
ailia AI Speech is a library to perform speech recognition using AI. It provides a C API for native applications, as well as a C# API well suited for Unity applications. Using ailia AI Speech, you can easily integrate AI powered speech recognition into your applications.
|
|
32
|
+
|
|
33
|
+
## Install from pip
|
|
34
|
+
|
|
35
|
+
You can install the ailia AI Speech free evaluation package with the following command.
|
|
36
|
+
|
|
37
|
+
```
|
|
38
|
+
pip3 install ailia_speech
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Install from package
|
|
42
|
+
|
|
43
|
+
You can install the ailia AI Speech from Package with the following command.
|
|
44
|
+
|
|
45
|
+
```
|
|
46
|
+
python3 bootstrap.py
|
|
47
|
+
pip3 install ./
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## Usage
|
|
51
|
+
|
|
52
|
+
### Batch mode
|
|
53
|
+
|
|
54
|
+
In batch mode, the entire audio is transcribed at once.
|
|
55
|
+
|
|
56
|
+
```python
|
|
57
|
+
import ailia_speech
|
|
58
|
+
|
|
59
|
+
import librosa
|
|
60
|
+
|
|
61
|
+
import os
|
|
62
|
+
import urllib.request
|
|
63
|
+
|
|
64
|
+
# Load target audio
|
|
65
|
+
input_file_path = "demo.wav"
|
|
66
|
+
if not os.path.exists(input_file_path):
|
|
67
|
+
urllib.request.urlretrieve(
|
|
68
|
+
"https://github.com/axinc-ai/ailia-models/raw/refs/heads/master/audio_processing/whisper/demo.wav",
|
|
69
|
+
"demo.wav"
|
|
70
|
+
)
|
|
71
|
+
audio_waveform, sampling_rate = librosa.load(input_file_path, mono = True)
|
|
72
|
+
|
|
73
|
+
# Model Initialize
|
|
74
|
+
speech = ailia_speech.Whisper()
|
|
75
|
+
model_type = ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO
|
|
76
|
+
|
|
77
|
+
# When using sensevoice
|
|
78
|
+
#speech = ailia_speech.SenseVoice()
|
|
79
|
+
#model_type = ailia_speech.AILIA_SPEECH_MODEL_TYPE_SENSEVOICE_SMALL
|
|
80
|
+
|
|
81
|
+
# Infer
|
|
82
|
+
speech.initialize_model(model_path = "./models/", model_type = model_type)
|
|
83
|
+
recognized_text = speech.transcribe(audio_waveform, sampling_rate)
|
|
84
|
+
for text in recognized_text:
|
|
85
|
+
print(text)
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
### Step mode
|
|
89
|
+
|
|
90
|
+
In step mode, the audio is input in chunks and transcribed sequentially.
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
import ailia_speech
|
|
94
|
+
|
|
95
|
+
import librosa
|
|
96
|
+
|
|
97
|
+
import os
|
|
98
|
+
import urllib.request
|
|
99
|
+
|
|
100
|
+
# Load target audio
|
|
101
|
+
input_file_path = "demo.wav"
|
|
102
|
+
if not os.path.exists(input_file_path):
|
|
103
|
+
urllib.request.urlretrieve(
|
|
104
|
+
"https://github.com/axinc-ai/ailia-models/raw/refs/heads/master/audio_processing/whisper/demo.wav",
|
|
105
|
+
"demo.wav"
|
|
106
|
+
)
|
|
107
|
+
audio_waveform, sampling_rate = librosa.load(input_file_path, mono = True)
|
|
108
|
+
|
|
109
|
+
# Infer
|
|
110
|
+
speech = ailia_speech.Whisper()
|
|
111
|
+
speech.initialize_model(model_path = "./models/", model_type = ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO)
|
|
112
|
+
speech.set_silent_threshold(silent_threshold = 0.5, speech_sec = 1.0, no_speech_sec = 0.5)
|
|
113
|
+
for i in range(0, audio_waveform.shape[0], sampling_rate):
|
|
114
|
+
complete = False
|
|
115
|
+
if i + sampling_rate >= audio_waveform.shape[0]:
|
|
116
|
+
complete = True
|
|
117
|
+
recognized_text = speech.transcribe_step(audio_waveform[i:min(audio_waveform.shape[0], i + sampling_rate)], sampling_rate, complete)
|
|
118
|
+
for text in recognized_text:
|
|
119
|
+
print(text)
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### Dialization mode
|
|
123
|
+
|
|
124
|
+
By specifying dialization_type, speaker diarization can be performed. When speaker diarization is enabled, speaker_id becomes valid.
|
|
125
|
+
|
|
126
|
+
```
|
|
127
|
+
speech.initialize_model(model_path = "./models/", model_type = ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO, diarization_type = ailia_speech.AILIA_SPEECH_DIARIZATION_TYPE_PYANNOTE_AUDIO)
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
### Available model types
|
|
131
|
+
|
|
132
|
+
It is possible to select multiple models according to accuracy and speed. LARGE_V3_TURBO is the most recommended.
|
|
133
|
+
|
|
134
|
+
Whisper
|
|
135
|
+
|
|
136
|
+
```
|
|
137
|
+
ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY
|
|
138
|
+
ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_BASE
|
|
139
|
+
ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_SMALL
|
|
140
|
+
ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_MEDIUM
|
|
141
|
+
ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE
|
|
142
|
+
ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3
|
|
143
|
+
ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
SenseVoice
|
|
147
|
+
|
|
148
|
+
```
|
|
149
|
+
ailia_speech.AILIA_SPEECH_MODEL_TYPE_SENSEVOICE_SMALL
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
### Available vad versions
|
|
153
|
+
|
|
154
|
+
By default, version "4" of SileroVAD is used. The version can be specified from "4", "5", "6", and "6_2".
|
|
155
|
+
|
|
156
|
+
```
|
|
157
|
+
speech.initialize_model(model_path = "./models/", vad_type = AILIA_SPEECH_VAD_TYPE_SILERO, vad_version = "6_2")
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
## API specification
|
|
161
|
+
|
|
162
|
+
https://github.com/axinc-ai/ailia-sdk
|
|
163
|
+
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
# ailia AI Speech Python API
|
|
2
|
+
|
|
3
|
+
!! CAUTION !!
|
|
4
|
+
“ailia” IS NOT OPEN SOURCE SOFTWARE (OSS).
|
|
5
|
+
As long as user complies with the conditions stated in [License Document](https://ailia.ai/license/), user may use the Software for free of charge, but the Software is basically paid software.
|
|
6
|
+
|
|
7
|
+
## About ailia AI Speech
|
|
8
|
+
|
|
9
|
+
ailia AI Speech is a library to perform speech recognition using AI. It provides a C API for native applications, as well as a C# API well suited for Unity applications. Using ailia AI Speech, you can easily integrate AI powered speech recognition into your applications.
|
|
10
|
+
|
|
11
|
+
## Install from pip
|
|
12
|
+
|
|
13
|
+
You can install the ailia AI Speech free evaluation package with the following command.
|
|
14
|
+
|
|
15
|
+
```
|
|
16
|
+
pip3 install ailia_speech
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## Install from package
|
|
20
|
+
|
|
21
|
+
You can install the ailia AI Speech from Package with the following command.
|
|
22
|
+
|
|
23
|
+
```
|
|
24
|
+
python3 bootstrap.py
|
|
25
|
+
pip3 install ./
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## Usage
|
|
29
|
+
|
|
30
|
+
### Batch mode
|
|
31
|
+
|
|
32
|
+
In batch mode, the entire audio is transcribed at once.
|
|
33
|
+
|
|
34
|
+
```python
|
|
35
|
+
import ailia_speech
|
|
36
|
+
|
|
37
|
+
import librosa
|
|
38
|
+
|
|
39
|
+
import os
|
|
40
|
+
import urllib.request
|
|
41
|
+
|
|
42
|
+
# Load target audio
|
|
43
|
+
input_file_path = "demo.wav"
|
|
44
|
+
if not os.path.exists(input_file_path):
|
|
45
|
+
urllib.request.urlretrieve(
|
|
46
|
+
"https://github.com/axinc-ai/ailia-models/raw/refs/heads/master/audio_processing/whisper/demo.wav",
|
|
47
|
+
"demo.wav"
|
|
48
|
+
)
|
|
49
|
+
audio_waveform, sampling_rate = librosa.load(input_file_path, mono = True)
|
|
50
|
+
|
|
51
|
+
# Model Initialize
|
|
52
|
+
speech = ailia_speech.Whisper()
|
|
53
|
+
model_type = ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO
|
|
54
|
+
|
|
55
|
+
# When using sensevoice
|
|
56
|
+
#speech = ailia_speech.SenseVoice()
|
|
57
|
+
#model_type = ailia_speech.AILIA_SPEECH_MODEL_TYPE_SENSEVOICE_SMALL
|
|
58
|
+
|
|
59
|
+
# Infer
|
|
60
|
+
speech.initialize_model(model_path = "./models/", model_type = model_type)
|
|
61
|
+
recognized_text = speech.transcribe(audio_waveform, sampling_rate)
|
|
62
|
+
for text in recognized_text:
|
|
63
|
+
print(text)
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
### Step mode
|
|
67
|
+
|
|
68
|
+
In step mode, the audio is input in chunks and transcribed sequentially.
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
import ailia_speech
|
|
72
|
+
|
|
73
|
+
import librosa
|
|
74
|
+
|
|
75
|
+
import os
|
|
76
|
+
import urllib.request
|
|
77
|
+
|
|
78
|
+
# Load target audio
|
|
79
|
+
input_file_path = "demo.wav"
|
|
80
|
+
if not os.path.exists(input_file_path):
|
|
81
|
+
urllib.request.urlretrieve(
|
|
82
|
+
"https://github.com/axinc-ai/ailia-models/raw/refs/heads/master/audio_processing/whisper/demo.wav",
|
|
83
|
+
"demo.wav"
|
|
84
|
+
)
|
|
85
|
+
audio_waveform, sampling_rate = librosa.load(input_file_path, mono = True)
|
|
86
|
+
|
|
87
|
+
# Infer
|
|
88
|
+
speech = ailia_speech.Whisper()
|
|
89
|
+
speech.initialize_model(model_path = "./models/", model_type = ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO)
|
|
90
|
+
speech.set_silent_threshold(silent_threshold = 0.5, speech_sec = 1.0, no_speech_sec = 0.5)
|
|
91
|
+
for i in range(0, audio_waveform.shape[0], sampling_rate):
|
|
92
|
+
complete = False
|
|
93
|
+
if i + sampling_rate >= audio_waveform.shape[0]:
|
|
94
|
+
complete = True
|
|
95
|
+
recognized_text = speech.transcribe_step(audio_waveform[i:min(audio_waveform.shape[0], i + sampling_rate)], sampling_rate, complete)
|
|
96
|
+
for text in recognized_text:
|
|
97
|
+
print(text)
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
### Dialization mode
|
|
101
|
+
|
|
102
|
+
By specifying dialization_type, speaker diarization can be performed. When speaker diarization is enabled, speaker_id becomes valid.
|
|
103
|
+
|
|
104
|
+
```
|
|
105
|
+
speech.initialize_model(model_path = "./models/", model_type = ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO, diarization_type = ailia_speech.AILIA_SPEECH_DIARIZATION_TYPE_PYANNOTE_AUDIO)
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
### Available model types
|
|
109
|
+
|
|
110
|
+
It is possible to select multiple models according to accuracy and speed. LARGE_V3_TURBO is the most recommended.
|
|
111
|
+
|
|
112
|
+
Whisper
|
|
113
|
+
|
|
114
|
+
```
|
|
115
|
+
ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY
|
|
116
|
+
ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_BASE
|
|
117
|
+
ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_SMALL
|
|
118
|
+
ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_MEDIUM
|
|
119
|
+
ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE
|
|
120
|
+
ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3
|
|
121
|
+
ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
SenseVoice
|
|
125
|
+
|
|
126
|
+
```
|
|
127
|
+
ailia_speech.AILIA_SPEECH_MODEL_TYPE_SENSEVOICE_SMALL
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
### Available vad versions
|
|
131
|
+
|
|
132
|
+
By default, version "4" of SileroVAD is used. The version can be specified from "4", "5", "6", and "6_2".
|
|
133
|
+
|
|
134
|
+
```
|
|
135
|
+
speech.initialize_model(model_path = "./models/", vad_type = AILIA_SPEECH_VAD_TYPE_SILERO, vad_version = "6_2")
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
## API specification
|
|
139
|
+
|
|
140
|
+
https://github.com/axinc-ai/ailia-sdk
|
|
141
|
+
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,762 @@
|
|
|
1
|
+
import ctypes
|
|
2
|
+
import os
|
|
3
|
+
import sys
|
|
4
|
+
|
|
5
|
+
import numpy
|
|
6
|
+
import ailia
|
|
7
|
+
import ailia.audio
|
|
8
|
+
import ailia_tokenizer
|
|
9
|
+
|
|
10
|
+
import urllib.request
|
|
11
|
+
import ssl
|
|
12
|
+
import shutil
|
|
13
|
+
import platform
|
|
14
|
+
|
|
15
|
+
#### dependency check
|
|
16
|
+
if sys.platform == "win32":
|
|
17
|
+
import ctypes
|
|
18
|
+
try:
|
|
19
|
+
for library in ["vcruntime140.dll", "vcruntime140_1.dll", "msvcp140.dll"]:
|
|
20
|
+
ctypes.windll.LoadLibrary(library)
|
|
21
|
+
except:
|
|
22
|
+
print(" WARNING Please install MSVC 2015-2019 runtime from https://docs.microsoft.com/ja-jp/cpp/windows/latest-supported-vc-redist")
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
#### loading DLL / DYLIB / SO ####
|
|
26
|
+
if sys.platform == "win32":
|
|
27
|
+
dll_platform = "windows/x64"
|
|
28
|
+
dll_name = "ailia_speech.dll"
|
|
29
|
+
load_fn = ctypes.WinDLL
|
|
30
|
+
elif sys.platform == "darwin":
|
|
31
|
+
dll_platform = "mac"
|
|
32
|
+
dll_name = "libailia_speech.dylib"
|
|
33
|
+
load_fn = ctypes.CDLL
|
|
34
|
+
else:
|
|
35
|
+
is_arm = "arm" in platform.machine() or platform.machine() == "aarch64"
|
|
36
|
+
if is_arm:
|
|
37
|
+
if platform.architecture()[0] == "32bit":
|
|
38
|
+
dll_platform = "linux/armeabi-v7a"
|
|
39
|
+
else:
|
|
40
|
+
dll_platform = "linux/arm64-v8a"
|
|
41
|
+
else:
|
|
42
|
+
dll_platform = "linux/x64"
|
|
43
|
+
dll_name = "libailia_speech.so"
|
|
44
|
+
load_fn = ctypes.CDLL
|
|
45
|
+
|
|
46
|
+
dll_found = False
|
|
47
|
+
candidate = ["", str(os.path.dirname(os.path.abspath(__file__))) + str(os.sep), str(os.path.dirname(os.path.abspath(__file__))) + str(os.sep) + dll_platform + str(os.sep)]
|
|
48
|
+
for dir in candidate:
|
|
49
|
+
try:
|
|
50
|
+
dll = load_fn(dir + dll_name)
|
|
51
|
+
dll_found = True
|
|
52
|
+
except:
|
|
53
|
+
pass
|
|
54
|
+
if not dll_found:
|
|
55
|
+
msg = "DLL load failed : \'" + dll_name + "\' is not found"
|
|
56
|
+
raise ImportError(msg)
|
|
57
|
+
|
|
58
|
+
# ==============================================================================
|
|
59
|
+
|
|
60
|
+
from ctypes import *
|
|
61
|
+
|
|
62
|
+
AILIA_SPEECH_STATUS_SUCCESS = ( 0 )
|
|
63
|
+
|
|
64
|
+
AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY = (0)
|
|
65
|
+
AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_BASE = (1)
|
|
66
|
+
AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_SMALL = (2)
|
|
67
|
+
AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_MEDIUM = (3)
|
|
68
|
+
AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE = (4)
|
|
69
|
+
AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3 = (5)
|
|
70
|
+
AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO = (6)
|
|
71
|
+
AILIA_SPEECH_MODEL_TYPE_SENSEVOICE_SMALL = (10)
|
|
72
|
+
|
|
73
|
+
AILIA_SPEECH_TASK_TRANSCRIBE = (0)
|
|
74
|
+
AILIA_SPEECH_TASK_TRANSLATE = (1)
|
|
75
|
+
|
|
76
|
+
AILIA_SPEECH_FLAG_NONE = (0)
|
|
77
|
+
AILIA_SPEECH_FLAG_LIVE = (1)
|
|
78
|
+
|
|
79
|
+
AILIA_SPEECH_VAD_TYPE_SILERO = (0)
|
|
80
|
+
|
|
81
|
+
AILIA_SPEECH_DIARIZATION_TYPE_PYANNOTE_AUDIO = (0)
|
|
82
|
+
|
|
83
|
+
AILIA_SPEECH_API_CALLBACK_VERSION = (6)
|
|
84
|
+
|
|
85
|
+
AILIA_SPEECH_TEXT_VERSION = (2)
|
|
86
|
+
AILIA_SPEECH_SPEAKER_ID_UNKNOWN = (0xFFFFFFFF)
|
|
87
|
+
|
|
88
|
+
AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_FRAME_LEN = CFUNCTYPE(POINTER(c_int), c_int, c_int, c_int, c_int)
|
|
89
|
+
AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_MEL_SPECTROGRAM = CFUNCTYPE((c_int), c_void_p, c_void_p, c_int, c_int, c_int, c_int, c_int, c_int, c_int, c_int, c_float, c_int, c_float, c_float, c_int, c_int, c_int)
|
|
90
|
+
|
|
91
|
+
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_CREATE = CFUNCTYPE((c_int), POINTER(c_void_p) , c_int, c_int)
|
|
92
|
+
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_OPEN_MODEL_FILE_A = CFUNCTYPE((c_int), c_void_p , c_char_p)
|
|
93
|
+
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_OPEN_MODEL_FILE_W = CFUNCTYPE((c_int), c_void_p , c_wchar)
|
|
94
|
+
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_ENCODE = CFUNCTYPE((c_int), c_void_p , c_char_p)
|
|
95
|
+
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TOKEN_COUNT = CFUNCTYPE((c_int), c_void_p , POINTER(c_uint))
|
|
96
|
+
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TOKENS = CFUNCTYPE((c_int), c_void_p , POINTER(c_int) , c_uint)
|
|
97
|
+
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_DECODE = CFUNCTYPE((c_int), c_void_p , POINTER(c_int), c_uint)
|
|
98
|
+
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TEXT_LENGTH = CFUNCTYPE((c_int), c_void_p , POINTER(c_uint))
|
|
99
|
+
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TEXT = CFUNCTYPE((c_int), c_void_p , c_char_p , c_uint)
|
|
100
|
+
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_DESTROY = CFUNCTYPE((c_int), c_void_p)
|
|
101
|
+
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_UTF8_TO_UTF32 = CFUNCTYPE((c_int), POINTER(c_uint) , POINTER(c_uint) , c_char_p , c_uint)
|
|
102
|
+
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_UTF32_TO_UTF8 = CFUNCTYPE((c_int), c_char_p, POINTER(c_uint) , c_uint)
|
|
103
|
+
|
|
104
|
+
AILIA_SPEECH_USER_API_AILIA_AUDIO_RESAMPLE = CFUNCTYPE((c_int), POINTER(c_float), POINTER(c_float), c_int, c_int, c_int, c_int)
|
|
105
|
+
AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_RESAMPLE_LEN = CFUNCTYPE((c_int), POINTER(c_int), c_int, c_int, c_int)
|
|
106
|
+
AILIA_SPEECH_USER_API_AILIA_CREATE = CFUNCTYPE((c_int), POINTER(c_void_p), c_int, c_int)
|
|
107
|
+
AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_FILE_A = CFUNCTYPE((c_int), c_void_p, c_char_p)
|
|
108
|
+
AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_FILE_W = CFUNCTYPE((c_int), c_void_p, POINTER(c_wchar))
|
|
109
|
+
AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_MEM = CFUNCTYPE((c_int), c_void_p, POINTER(c_byte), c_uint)
|
|
110
|
+
AILIA_SPEECH_USER_API_AILIA_SET_MEMORY_MODE = CFUNCTYPE((c_int), c_void_p, c_uint)
|
|
111
|
+
AILIA_SPEECH_USER_API_AILIA_DESTROY = CFUNCTYPE((None), c_void_p)
|
|
112
|
+
AILIA_SPEECH_USER_API_AILIA_UPDATE = CFUNCTYPE((c_int), c_void_p)
|
|
113
|
+
AILIA_SPEECH_USER_API_AILIA_GET_BLOB_INDEX_BY_INPUT_INDEX = CFUNCTYPE((c_int), c_void_p, POINTER(c_uint), c_uint)
|
|
114
|
+
AILIA_SPEECH_USER_API_AILIA_GET_BLOB_INDEX_BY_OUTPUT_INDEX = CFUNCTYPE((c_int), c_void_p, POINTER(c_uint), c_uint)
|
|
115
|
+
AILIA_SPEECH_USER_API_AILIA_GET_BLOB_DATA = CFUNCTYPE((c_int), c_void_p, POINTER(c_float), c_uint, c_uint)
|
|
116
|
+
AILIA_SPEECH_USER_API_AILIA_SET_INPUT_BLOB_DATA = CFUNCTYPE((c_int), c_void_p, POINTER(c_float), c_uint, c_uint)
|
|
117
|
+
AILIA_SPEECH_USER_API_AILIA_SET_INPUT_BLOB_SHAPE = CFUNCTYPE((c_int), c_void_p, c_void_p, c_uint, c_uint)
|
|
118
|
+
AILIA_SPEECH_USER_API_AILIA_GET_BLOB_SHAPE = CFUNCTYPE((c_int), c_void_p, c_void_p, c_uint, c_uint)
|
|
119
|
+
AILIA_SPEECH_USER_API_AILIA_GET_ERROR_DETAIL = CFUNCTYPE((c_char_p), c_void_p)
|
|
120
|
+
|
|
121
|
+
AILIA_SPEECH_USER_API_AILIA_COPY_BLOB_DATA = CFUNCTYPE((c_int), c_void_p, c_uint, c_void_p, c_uint)
|
|
122
|
+
AILIA_SPEECH_USER_API_AILIA_GET_ENVIRONMENT = CFUNCTYPE((c_int), POINTER(c_void_p), c_uint, c_uint)
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
class struct__AILIASpeechApiCallback(Structure):
|
|
126
|
+
pass
|
|
127
|
+
|
|
128
|
+
struct__AILIASpeechApiCallback.__slots__ = [
|
|
129
|
+
'ailiaAudioGetFrameLen',
|
|
130
|
+
'ailiaAudioGetMelSpectrogram',
|
|
131
|
+
'ailiaAudioResample',
|
|
132
|
+
'ailiaAudioGetResampleLen',
|
|
133
|
+
|
|
134
|
+
'ailiaTokenizerCreate',
|
|
135
|
+
'ailiaTokenizerOpenModelFileA',
|
|
136
|
+
'ailiaTokenizerOpenModelFileW',
|
|
137
|
+
'ailiaTokenizerEncode',
|
|
138
|
+
'ailiaTokenizerGetTokenCount',
|
|
139
|
+
'ailiaTokenizerGetTokens',
|
|
140
|
+
'ailiaTokenizerDecode',
|
|
141
|
+
'ailiaTokenizerGetTextLength',
|
|
142
|
+
'ailiaTokenizerGetText',
|
|
143
|
+
'ailiaTokenizerDestroy',
|
|
144
|
+
'ailiaTokenizerUtf8ToUtf32',
|
|
145
|
+
'ailiaTokenizerUtf32ToUtf8',
|
|
146
|
+
|
|
147
|
+
'ailiaCreate',
|
|
148
|
+
'ailiaOpenWeightFileA',
|
|
149
|
+
'ailiaOpenWeightFileW',
|
|
150
|
+
'ailiaOpenWeightMem',
|
|
151
|
+
'ailiaSetMemoryMode',
|
|
152
|
+
'ailiaDestroy',
|
|
153
|
+
'ailiaUpdate',
|
|
154
|
+
'ailiaGetBlobIndexByInputIndex',
|
|
155
|
+
'ailiaGetBlobIndexByOutputIndex',
|
|
156
|
+
'ailiaGetBlobData',
|
|
157
|
+
'ailiaSetInputBlobData',
|
|
158
|
+
'ailiaSetInputBlobShape',
|
|
159
|
+
'ailiaGetBlobShape',
|
|
160
|
+
'ailiaGetErrorDetail',
|
|
161
|
+
'ailiaCopyBlobData',
|
|
162
|
+
'ailiaGetEnvironment',
|
|
163
|
+
]
|
|
164
|
+
struct__AILIASpeechApiCallback._fields_ = [
|
|
165
|
+
('ailiaAudioGetFrameLen', AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_FRAME_LEN),
|
|
166
|
+
('ailiaAudioGetMelSpectrogram', AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_MEL_SPECTROGRAM),
|
|
167
|
+
('ailiaAudioResample', AILIA_SPEECH_USER_API_AILIA_AUDIO_RESAMPLE),
|
|
168
|
+
('ailiaAudioGetResampleLen', AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_RESAMPLE_LEN),
|
|
169
|
+
|
|
170
|
+
('ailiaTokenizerCreate', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_CREATE),
|
|
171
|
+
('ailiaTokenizerOpenModelFileA', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_OPEN_MODEL_FILE_A),
|
|
172
|
+
('ailiaTokenizerOpenModelFileW', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_OPEN_MODEL_FILE_W),
|
|
173
|
+
('ailiaTokenizerEncode', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_ENCODE),
|
|
174
|
+
('ailiaTokenizerGetTokenCount', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TOKEN_COUNT),
|
|
175
|
+
('ailiaTokenizerGetTokens', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TOKENS),
|
|
176
|
+
('ailiaTokenizerDecode', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_DECODE),
|
|
177
|
+
('ailiaTokenizerGetTextLength', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TEXT_LENGTH),
|
|
178
|
+
('ailiaTokenizerGetText', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TEXT),
|
|
179
|
+
('ailiaTokenizerDestroy', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_DESTROY),
|
|
180
|
+
('ailiaTokenizerUtf8ToUtf32', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_UTF8_TO_UTF32),
|
|
181
|
+
('ailiaTokenizerUtf32ToUtf8', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_UTF32_TO_UTF8),
|
|
182
|
+
|
|
183
|
+
('ailiaCreate', AILIA_SPEECH_USER_API_AILIA_CREATE),
|
|
184
|
+
('ailiaOpenWeightFileA', AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_FILE_A),
|
|
185
|
+
('ailiaOpenWeightFileW', AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_FILE_W),
|
|
186
|
+
('ailiaOpenWeightMem', AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_MEM),
|
|
187
|
+
('ailiaSetMemoryMode', AILIA_SPEECH_USER_API_AILIA_SET_MEMORY_MODE),
|
|
188
|
+
('ailiaDestroy', AILIA_SPEECH_USER_API_AILIA_DESTROY),
|
|
189
|
+
('ailiaUpdate', AILIA_SPEECH_USER_API_AILIA_UPDATE),
|
|
190
|
+
('ailiaGetBlobIndexByInputIndex', AILIA_SPEECH_USER_API_AILIA_GET_BLOB_INDEX_BY_INPUT_INDEX),
|
|
191
|
+
('ailiaGetBlobIndexByOutputIndex', AILIA_SPEECH_USER_API_AILIA_GET_BLOB_INDEX_BY_OUTPUT_INDEX),
|
|
192
|
+
('ailiaGetBlobData', AILIA_SPEECH_USER_API_AILIA_GET_BLOB_DATA),
|
|
193
|
+
('ailiaSetInputBlobData', AILIA_SPEECH_USER_API_AILIA_SET_INPUT_BLOB_DATA),
|
|
194
|
+
('ailiaSetInputBlobShape', AILIA_SPEECH_USER_API_AILIA_SET_INPUT_BLOB_SHAPE),
|
|
195
|
+
('ailiaGetBlobShape', AILIA_SPEECH_USER_API_AILIA_GET_BLOB_SHAPE),
|
|
196
|
+
('ailiaGetErrorDetail', AILIA_SPEECH_USER_API_AILIA_GET_ERROR_DETAIL),
|
|
197
|
+
('ailiaCopyBlobData', AILIA_SPEECH_USER_API_AILIA_COPY_BLOB_DATA),
|
|
198
|
+
('ailiaGetEnvironment', AILIA_SPEECH_USER_API_AILIA_GET_ENVIRONMENT),
|
|
199
|
+
]
|
|
200
|
+
|
|
201
|
+
AILIASpeechApiCallback = struct__AILIASpeechApiCallback
|
|
202
|
+
|
|
203
|
+
# ==============================================================================
|
|
204
|
+
|
|
205
|
+
dll.ailiaSpeechCreate.restype = c_int
|
|
206
|
+
dll.ailiaSpeechCreate.argtypes = (POINTER(c_void_p), c_int32, c_int32, c_int32, c_int32, c_int32, AILIASpeechApiCallback, c_int32)
|
|
207
|
+
|
|
208
|
+
dll.ailiaSpeechDestroy.restype = None
|
|
209
|
+
dll.ailiaSpeechDestroy.argtypes = (c_void_p, )
|
|
210
|
+
|
|
211
|
+
dll.ailiaSpeechOpenModelFileA.restype = c_int
|
|
212
|
+
dll.ailiaSpeechOpenModelFileA.argtypes = (c_void_p, c_char_p, c_char_p, c_int32)
|
|
213
|
+
|
|
214
|
+
dll.ailiaSpeechOpenModelFileW.restype = c_int
|
|
215
|
+
dll.ailiaSpeechOpenModelFileW.argtypes = (c_void_p, c_wchar_p, c_wchar_p, c_int32)
|
|
216
|
+
|
|
217
|
+
dll.ailiaSpeechOpenVadFileA.restype = c_int
|
|
218
|
+
dll.ailiaSpeechOpenVadFileA.argtypes = (c_void_p, c_char_p, c_int32)
|
|
219
|
+
|
|
220
|
+
dll.ailiaSpeechOpenVadFileW.restype = c_int
|
|
221
|
+
dll.ailiaSpeechOpenVadFileW.argtypes = (c_void_p, c_wchar_p, c_int32)
|
|
222
|
+
|
|
223
|
+
dll.ailiaSpeechOpenDiarizationFileA.restype = c_int
|
|
224
|
+
dll.ailiaSpeechOpenDiarizationFileA.argtypes = (c_void_p, c_char_p, c_char_p, c_int32)
|
|
225
|
+
|
|
226
|
+
dll.ailiaSpeechOpenDiarizationFileW.restype = c_int
|
|
227
|
+
dll.ailiaSpeechOpenDiarizationFileW.argtypes = (c_void_p, c_wchar_p, c_wchar_p, c_int32)
|
|
228
|
+
|
|
229
|
+
dll.ailiaSpeechPushInputData.restype = c_int
|
|
230
|
+
dll.ailiaSpeechPushInputData.argtypes = (c_void_p, numpy.ctypeslib.ndpointer(
|
|
231
|
+
dtype=numpy.float32, flags='CONTIGUOUS'
|
|
232
|
+
), # src
|
|
233
|
+
ctypes.c_uint,
|
|
234
|
+
ctypes.c_uint,
|
|
235
|
+
ctypes.c_uint)
|
|
236
|
+
|
|
237
|
+
dll.ailiaSpeechFinalizeInputData.restype = c_int
|
|
238
|
+
dll.ailiaSpeechFinalizeInputData.argtypes = (c_void_p, )
|
|
239
|
+
|
|
240
|
+
dll.ailiaSpeechBuffered.restype = c_int
|
|
241
|
+
dll.ailiaSpeechBuffered.argtypes = (c_void_p, POINTER(ctypes.c_uint))
|
|
242
|
+
|
|
243
|
+
dll.ailiaSpeechComplete.restype = c_int
|
|
244
|
+
dll.ailiaSpeechComplete.argtypes = (c_void_p, POINTER(ctypes.c_uint))
|
|
245
|
+
|
|
246
|
+
dll.ailiaSpeechTranscribe.restype = c_int
|
|
247
|
+
dll.ailiaSpeechTranscribe.argtypes = (c_void_p, )
|
|
248
|
+
|
|
249
|
+
dll.ailiaSpeechGetTextCount.restype = c_int
|
|
250
|
+
dll.ailiaSpeechGetTextCount.argtypes = (c_void_p, POINTER(ctypes.c_uint))
|
|
251
|
+
|
|
252
|
+
class AILIASpeechText(ctypes.Structure):
|
|
253
|
+
_fields_ = [
|
|
254
|
+
("text", ctypes.c_char_p),
|
|
255
|
+
("time_stamp_begin", ctypes.c_float),
|
|
256
|
+
("time_stamp_end", ctypes.c_float),
|
|
257
|
+
("speaker_id", ctypes.c_uint),
|
|
258
|
+
("language", ctypes.c_char_p),
|
|
259
|
+
("confidence", ctypes.c_float)]
|
|
260
|
+
|
|
261
|
+
dll.ailiaSpeechGetText.restype = c_int
|
|
262
|
+
dll.ailiaSpeechGetText.argtypes = (c_void_p, POINTER(AILIASpeechText), ctypes.c_uint, ctypes.c_uint)
|
|
263
|
+
|
|
264
|
+
dll.ailiaSpeechResetTranscribeState.restype = c_int
|
|
265
|
+
dll.ailiaSpeechResetTranscribeState.argtypes = (c_void_p, )
|
|
266
|
+
|
|
267
|
+
AILIA_SPEECH_USER_API_INTERMEDIATE_CALLBACK = CFUNCTYPE((c_int), c_int64, c_char_p)
|
|
268
|
+
|
|
269
|
+
dll.ailiaSpeechSetIntermediateCallback.restype = c_int
|
|
270
|
+
dll.ailiaSpeechSetIntermediateCallback.argtypes = (c_void_p, AILIA_SPEECH_USER_API_INTERMEDIATE_CALLBACK, c_int64)
|
|
271
|
+
|
|
272
|
+
dll.ailiaSpeechSetLanguage.restype = c_int
|
|
273
|
+
dll.ailiaSpeechSetLanguage.argtypes = (c_void_p, c_char_p)
|
|
274
|
+
|
|
275
|
+
dll.ailiaSpeechSetSilentThreshold.restype = c_int
|
|
276
|
+
dll.ailiaSpeechSetSilentThreshold.argtypes = (c_void_p, c_float, c_float, c_float)
|
|
277
|
+
|
|
278
|
+
# ==============================================================================
|
|
279
|
+
# model download
|
|
280
|
+
# ==============================================================================
|
|
281
|
+
|
|
282
|
+
def progress_print(block_count, block_size, total_size):
|
|
283
|
+
percentage = 100.0 * block_count * block_size / total_size
|
|
284
|
+
if percentage > 100:
|
|
285
|
+
# Bigger than 100 does not look good, so...
|
|
286
|
+
percentage = 100
|
|
287
|
+
max_bar = 50
|
|
288
|
+
bar_num = int(percentage / (100 / max_bar))
|
|
289
|
+
progress_element = '=' * bar_num
|
|
290
|
+
if bar_num != max_bar:
|
|
291
|
+
progress_element += '>'
|
|
292
|
+
bar_fill = ' ' # fill the blanks
|
|
293
|
+
bar = progress_element.ljust(max_bar, bar_fill)
|
|
294
|
+
total_size_kb = total_size / 1024
|
|
295
|
+
print(f'[{bar} {percentage:.2f}% ( {total_size_kb:.0f}KB )]', end='\r')
|
|
296
|
+
|
|
297
|
+
def urlretrieve(remote_path, weight_path, progress_print):
|
|
298
|
+
temp_path = weight_path + ".tmp"
|
|
299
|
+
try:
|
|
300
|
+
#raise ssl.SSLError # test
|
|
301
|
+
urllib.request.urlretrieve(
|
|
302
|
+
remote_path,
|
|
303
|
+
temp_path,
|
|
304
|
+
progress_print,
|
|
305
|
+
)
|
|
306
|
+
except ssl.SSLError as e:
|
|
307
|
+
print(f'SSLError detected, so try to download without ssl')
|
|
308
|
+
remote_path = remote_path.replace("https","http")
|
|
309
|
+
urllib.request.urlretrieve(
|
|
310
|
+
remote_path,
|
|
311
|
+
temp_path,
|
|
312
|
+
progress_print,
|
|
313
|
+
)
|
|
314
|
+
shutil.move(temp_path, weight_path)
|
|
315
|
+
|
|
316
|
+
def check_and_download_file(file_path, remote_path):
|
|
317
|
+
if not os.path.exists(file_path):
|
|
318
|
+
print('Downloading %s...' % file_path)
|
|
319
|
+
urlretrieve(remote_path + os.path.basename(file_path), file_path, progress_print)
|
|
320
|
+
|
|
321
|
+
# ==============================================================================
|
|
322
|
+
# base model class
|
|
323
|
+
# ==============================================================================
|
|
324
|
+
|
|
325
|
+
intermediate_callback_cnt = 0
|
|
326
|
+
intermediate_callback_map = {}
|
|
327
|
+
|
|
328
|
+
def intermediate_callback(handle, text):
|
|
329
|
+
intermediate_callback_map[handle](text.decode())
|
|
330
|
+
return 0
|
|
331
|
+
|
|
332
|
+
class AiliaSpeechError(RuntimeError):
|
|
333
|
+
def __init__(self, message, code):
|
|
334
|
+
super().__init__(f"{message} code:{code}")
|
|
335
|
+
self.code = code
|
|
336
|
+
|
|
337
|
+
class AiliaSpeechModel:
    """Base wrapper around the native ailia Speech library (via ctypes).

    Owns the native context handle, wires up the callback table through which
    the native library calls back into ailia.audio / ailia_tokenizer / ailia
    core, and implements the transcription loops. Concrete model classes
    (Whisper, SenseVoice) derive from this and supply model files.
    """

    _api_callback = None  # AILIASpeechApiCallback struct; referenced to prevent GC while native code holds it
    _instance = None      # ctypes.c_void_p handle to the native ailia Speech context
    _c_callback = None    # ctypes function pointer for the intermediate-result callback

    def __init__(self, env_id = -1, num_thread = 0, memory_mode = 11, task = AILIA_SPEECH_TASK_TRANSCRIBE, flags = AILIA_SPEECH_FLAG_NONE, callback = None):
        """ Constructor of an ailia Speech model instance.

        Parameters
        ----------
        env_id : int, optional, default:ENVIRONMENT_AUTO(-1)
            environment id of ailia execution.
            To retrieve env_id value, use
            ailia.get_environment_count() / ailia.get_environment() pair
            or
            ailia.get_gpu_environment_id() .
        num_thread : int, optional, default: MULTITHREAD_AUTO(0)
            number of threads.
            valid values:
            MULTITHREAD_AUTO=0 [means system's logical processor count],
            1 to 32.
        memory_mode : int, optional, default: 11 (reuse interstage)
            memory management mode of ailia execution.
            To retrieve memory_mode value, use ailia.get_memory_mode() .
        task : int, optional, default: AILIA_SPEECH_TASK_TRANSCRIBE
            AILIA_SPEECH_TASK_TRANSCRIBE or AILIA_SPEECH_TASK_TRANSLATE
        flags : int, optional, default: AILIA_SPEECH_FLAG_NONE
            Reserved
        callback : func or None, optional, default: None
            Callback for receiving intermediate result text .
            Example
                def f_callback(text):
                    print(text)
        """
        self._instance = ctypes.c_void_p(None)
        self._create_callback()
        self._check(dll.ailiaSpeechCreate(cast(pointer(self._instance), POINTER(c_void_p)), ctypes.c_int32(env_id), ctypes.c_int32(num_thread), ctypes.c_int32(memory_mode), ctypes.c_int32(task), ctypes.c_int32(flags), self._api_callback, ctypes.c_int32(AILIA_SPEECH_API_CALLBACK_VERSION)))
        if callback is not None:
            # Keep the ctypes function pointer referenced on self so it is not
            # garbage-collected while the native library still holds it.
            self._c_callback = AILIA_SPEECH_USER_API_INTERMEDIATE_CALLBACK(intermediate_callback)
            global intermediate_callback_cnt
            global intermediate_callback_map
            # Register the user callback under a fresh integer handle; the
            # native side passes the handle back into intermediate_callback().
            intermediate_callback_map[intermediate_callback_cnt] = callback
            self._check(dll.ailiaSpeechSetIntermediateCallback(self._instance, self._c_callback, intermediate_callback_cnt))
            intermediate_callback_cnt = intermediate_callback_cnt + 1

    def _check(self, status):
        # Convert a non-success native status code into a Python exception.
        if status != AILIA_SPEECH_STATUS_SUCCESS:
            raise AiliaSpeechError(f"ailia speech error", status)

    def _string_buffer_aw(self, path):
        # Path buffer for the A/W (ANSI/wide) file APIs: wide characters on
        # Windows, UTF-8 bytes everywhere else.
        if sys.platform == "win32":
            return ctypes.create_unicode_buffer(path)
        else:
            return ctypes.create_string_buffer(path.encode("utf-8"))

    def _string_buffer(self, path):
        # UTF-8 byte buffer for APIs that always take a narrow string.
        return ctypes.create_string_buffer(path.encode("utf-8"))

    def _create_callback(self):
        # Build the table of function pointers through which the native ailia
        # Speech library calls back into the other ailia DLLs.
        callback = AILIASpeechApiCallback()
        # ailia.audio: feature extraction (mel spectrogram) and resampling
        callback.ailiaAudioGetFrameLen = AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_FRAME_LEN(("ailiaAudioGetFrameLen", ailia.audio.audio_core.dll))
        callback.ailiaAudioGetMelSpectrogram = AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_MEL_SPECTROGRAM(("ailiaAudioGetMelSpectrogram", ailia.audio.audio_core.dll))
        callback.ailiaAudioResample = AILIA_SPEECH_USER_API_AILIA_AUDIO_RESAMPLE(("ailiaAudioResample", ailia.audio.audio_core.dll))
        callback.ailiaAudioGetResampleLen = AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_RESAMPLE_LEN(("ailiaAudioGetResampleLen", ailia.audio.audio_core.dll))

        # ailia_tokenizer: tokenization of recognized text
        callback.ailiaTokenizerCreate = AILIA_SPEECH_USER_API_AILIA_TOKENIZER_CREATE(("ailiaTokenizerCreate", ailia_tokenizer.dll))
        callback.ailiaTokenizerOpenModelFileA = AILIA_SPEECH_USER_API_AILIA_TOKENIZER_OPEN_MODEL_FILE_A(("ailiaTokenizerOpenModelFileA", ailia_tokenizer.dll))
        callback.ailiaTokenizerOpenModelFileW = AILIA_SPEECH_USER_API_AILIA_TOKENIZER_OPEN_MODEL_FILE_W(("ailiaTokenizerOpenModelFileW", ailia_tokenizer.dll))
        callback.ailiaTokenizerEncode = AILIA_SPEECH_USER_API_AILIA_TOKENIZER_ENCODE(("ailiaTokenizerEncode", ailia_tokenizer.dll))
        callback.ailiaTokenizerGetTokenCount = AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TOKEN_COUNT(("ailiaTokenizerGetTokenCount", ailia_tokenizer.dll))
        callback.ailiaTokenizerGetTokens = AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TOKENS(("ailiaTokenizerGetTokens", ailia_tokenizer.dll))
        callback.ailiaTokenizerDecode = AILIA_SPEECH_USER_API_AILIA_TOKENIZER_DECODE(("ailiaTokenizerDecode", ailia_tokenizer.dll))
        callback.ailiaTokenizerGetTextLength = AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TEXT_LENGTH(("ailiaTokenizerGetTextLength", ailia_tokenizer.dll))
        callback.ailiaTokenizerGetText = AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TEXT(("ailiaTokenizerGetText", ailia_tokenizer.dll))
        callback.ailiaTokenizerDestroy = AILIA_SPEECH_USER_API_AILIA_TOKENIZER_DESTROY(("ailiaTokenizerDestroy", ailia_tokenizer.dll))
        callback.ailiaTokenizerUtf8ToUtf32 = AILIA_SPEECH_USER_API_AILIA_TOKENIZER_UTF8_TO_UTF32(("ailiaTokenizerUtf8ToUtf32", ailia_tokenizer.dll))
        callback.ailiaTokenizerUtf32ToUtf8 = AILIA_SPEECH_USER_API_AILIA_TOKENIZER_UTF32_TO_UTF8(("ailiaTokenizerUtf32ToUtf8", ailia_tokenizer.dll))

        # ailia core: network creation and inference
        callback.ailiaCreate = AILIA_SPEECH_USER_API_AILIA_CREATE(("ailiaCreate", ailia.core.dll))
        callback.ailiaOpenWeightFileA = AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_FILE_A(("ailiaOpenWeightFileA", ailia.core.dll))
        callback.ailiaOpenWeightFileW = AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_FILE_W(("ailiaOpenWeightFileW", ailia.core.dll))
        callback.ailiaOpenWeightMem = AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_MEM(("ailiaOpenWeightMem", ailia.core.dll))
        callback.ailiaSetMemoryMode = AILIA_SPEECH_USER_API_AILIA_SET_MEMORY_MODE(("ailiaSetMemoryMode", ailia.core.dll))
        callback.ailiaDestroy = AILIA_SPEECH_USER_API_AILIA_DESTROY(("ailiaDestroy", ailia.core.dll))
        callback.ailiaUpdate = AILIA_SPEECH_USER_API_AILIA_UPDATE(("ailiaUpdate", ailia.core.dll))
        callback.ailiaGetBlobIndexByInputIndex = AILIA_SPEECH_USER_API_AILIA_GET_BLOB_INDEX_BY_INPUT_INDEX(("ailiaGetBlobIndexByInputIndex", ailia.core.dll))
        callback.ailiaGetBlobIndexByOutputIndex = AILIA_SPEECH_USER_API_AILIA_GET_BLOB_INDEX_BY_OUTPUT_INDEX(("ailiaGetBlobIndexByOutputIndex", ailia.core.dll))
        callback.ailiaGetBlobData = AILIA_SPEECH_USER_API_AILIA_GET_BLOB_DATA(("ailiaGetBlobData", ailia.core.dll))
        callback.ailiaSetInputBlobData = AILIA_SPEECH_USER_API_AILIA_SET_INPUT_BLOB_DATA(("ailiaSetInputBlobData", ailia.core.dll))
        callback.ailiaSetInputBlobShape = AILIA_SPEECH_USER_API_AILIA_SET_INPUT_BLOB_SHAPE(("ailiaSetInputBlobShape", ailia.core.dll))
        callback.ailiaGetBlobShape = AILIA_SPEECH_USER_API_AILIA_GET_BLOB_SHAPE(("ailiaGetBlobShape", ailia.core.dll))
        callback.ailiaGetErrorDetail = AILIA_SPEECH_USER_API_AILIA_GET_ERROR_DETAIL(("ailiaGetErrorDetail", ailia.core.dll))
        callback.ailiaCopyBlobData = AILIA_SPEECH_USER_API_AILIA_COPY_BLOB_DATA(("ailiaCopyBlobData", ailia.core.dll))
        callback.ailiaGetEnvironment = AILIA_SPEECH_USER_API_AILIA_GET_ENVIRONMENT(("ailiaGetEnvironment", ailia.core.dll))

        self._api_callback = callback # prevent GC

    def _download_model(self, model_path, encoder_path, decoder_path, encoder_pb_path, decoder_pb_path, vad_type, vad_version, diarization_type, model_type):
        # Download all required model files into model_path, skipping files
        # that already exist (see check_and_download_file).
        if model_type == AILIA_SPEECH_MODEL_TYPE_SENSEVOICE_SMALL:
            REMOTE_PATH = "https://storage.googleapis.com/ailia-models/sensevoice/"
        else:
            REMOTE_PATH = "https://storage.googleapis.com/ailia-models/whisper/"
        os.makedirs(model_path, exist_ok = True)
        check_and_download_file(model_path + encoder_path, REMOTE_PATH)
        check_and_download_file(model_path + decoder_path, REMOTE_PATH)
        # Large models ship external weight (.pb) files alongside the .onnx.
        if encoder_pb_path is not None:
            check_and_download_file(model_path + encoder_pb_path, REMOTE_PATH)
        if decoder_pb_path is not None:
            check_and_download_file(model_path + decoder_pb_path, REMOTE_PATH)

        if vad_type is not None:
            REMOTE_PATH = "https://storage.googleapis.com/ailia-models/silero-vad/"
            check_and_download_file(model_path + self._vad_model_name(vad_version), REMOTE_PATH)

        if diarization_type is not None:
            REMOTE_PATH = "https://storage.googleapis.com/ailia-models/pyannote-audio/"
            check_and_download_file(model_path + "segmentation.onnx", REMOTE_PATH)
            check_and_download_file(model_path + "speaker-embedding.onnx", REMOTE_PATH)

    def _open_model(self, encoder, decoder, model_type):
        # Open encoder/decoder model files. Windows uses the wide-char (W)
        # API variant; other platforms use the ANSI (A) variant.
        p1 = self._string_buffer_aw(encoder)
        p2 = self._string_buffer_aw(decoder)

        if sys.platform == "win32":
            self._check(dll.ailiaSpeechOpenModelFileW(self._instance, p1, p2, model_type))
        else:
            self._check(dll.ailiaSpeechOpenModelFileA(self._instance, p1, p2, model_type))

    def _open_vad(self, vad, vad_type):
        # Open the voice-activity-detection model (W on Windows, A elsewhere).
        p1 = self._string_buffer_aw(vad)

        if sys.platform == "win32":
            self._check(dll.ailiaSpeechOpenVadFileW(self._instance, p1, vad_type))
        else:
            self._check(dll.ailiaSpeechOpenVadFileA(self._instance, p1, vad_type))

    def _open_diarization(self, segmentation, embedding, diarization_type):
        # Open the speaker-diarization models (W on Windows, A elsewhere).
        p1 = self._string_buffer_aw(segmentation)
        p2 = self._string_buffer_aw(embedding)

        if sys.platform == "win32":
            self._check(dll.ailiaSpeechOpenDiarizationFileW(self._instance, p1, p2, diarization_type))
        else:
            self._check(dll.ailiaSpeechOpenDiarizationFileA(self._instance, p1, p2, diarization_type))

    def set_silent_threshold(self, silent_threshold, speech_sec, no_speech_sec):
        """ Set silent threshold. If there are more than a certain number of sounded sections, and if the silent section lasts for a certain amount of time or more, the remaining buffer is processed without waiting for 30 seconds.

        Parameters
        ----------
        silent_threshold : float
            volume threshold, standard value 0.5
        speech_sec : float
            speech time, standard value 1.0
        no_speech_sec : float
            no_speech time, standard value 1.0
        """
        # NOTE(review): Python floats are passed directly; assumes argtypes
        # for this dll function are declared elsewhere in the module — confirm.
        self._check(dll.ailiaSpeechSetSilentThreshold(self._instance, silent_threshold, speech_sec, no_speech_sec))

    def transcribe(self, audio_waveform, sampling_rate, lang = None):
        """ Perform speech recognition. Processes the entire audio at once.

        This is a generator: recognized segments are yielded one by one.

        Parameters
        ----------
        audio_waveform : np.array
            PCM data, either 1D (mono) or 2D with shape (channels, samples)
        sampling_rate : int
            Sampling rate (Hz)
        lang : str, optional, default : None
            Language code (ja, en, etc.) (automatic detection if None)

        Yields
        ----------
        dict with keys:
        text : Speech recognition result text
        time_stamp_begin : Start Time (seconds)
        time_stamp_end : End Time (seconds)
        speaker_id : Speaker ID (when diarization is enabled, else None)
        language : Language code
        confidence : Confidence level
        """
        if len(audio_waveform.shape) == 1:
            channels = 1
        elif len(audio_waveform.shape) == 2:
            # Native API expects interleaved samples: (channels, samples) ->
            # flat (samples * channels,) with channel-interleaved layout.
            channels = audio_waveform.shape[0]
            audio_waveform = numpy.transpose(audio_waveform, (1, 0)).flatten()
        else:
            raise AiliaSpeechError(f"audio_waveform must be 1 channel or 2 channel", -1)

        # Native side requires contiguous float32 PCM.
        audio_waveform = numpy.ascontiguousarray(audio_waveform.astype(numpy.float32))

        if lang is not None:
            self._check(dll.ailiaSpeechSetLanguage(self._instance, self._string_buffer(lang)))

        self._check(dll.ailiaSpeechPushInputData(self._instance, audio_waveform, channels, audio_waveform.shape[0] // channels, sampling_rate))
        # All audio has been pushed; flush the input buffer.
        self._check(dll.ailiaSpeechFinalizeInputData(self._instance))

        # Transcribe window by window until the native side reports completion.
        while True:
            complete = ctypes.c_uint(0)
            self._check(dll.ailiaSpeechComplete(self._instance, ctypes.byref(complete)))
            if complete.value == 1:
                break

            self._check(dll.ailiaSpeechTranscribe(self._instance))

            count = ctypes.c_uint(0)
            self._check(dll.ailiaSpeechGetTextCount(self._instance, ctypes.byref(count)))
            results = []  # NOTE(review): unused leftover; kept for byte-compatibility
            for i in range(count.value):
                text = AILIASpeechText()
                self._check(dll.ailiaSpeechGetText(self._instance, ctypes.byref(text), AILIA_SPEECH_TEXT_VERSION, i))
                yield {"text" : text.text.decode(), "time_stamp_begin" : text.time_stamp_begin, "time_stamp_end" : text.time_stamp_end, "speaker_id" : None if text.speaker_id == AILIA_SPEECH_SPEAKER_ID_UNKNOWN else text.speaker_id, "language" : text.language.decode(), "confidence" : text.confidence}

        # Reset internal state so the instance can transcribe another audio.
        self._check(dll.ailiaSpeechResetTranscribeState(self._instance))

    def transcribe_step(self, audio_waveform, sampling_rate, complete, lang = None):
        """ Perform speech recognition. Processes the audio sequentially.

        This is a generator: recognized segments are yielded one by one.

        Parameters
        ----------
        audio_waveform : np.array
            PCM data, either 1D (mono) or 2D with shape (channels, samples)
        sampling_rate : int
            Sampling rate (Hz)
        lang : str, optional, default : None
            Language code (ja, en, etc.) (automatic detection if None)
        complete : bool
            True if this is the final audio input.
            transcribe_step executes a step each time there is microphone input, and by setting complete to True at the end, the buffer can be flushed.

        Yields
        ----------
        dict with keys:
        text : Speech recognition result text
        time_stamp_begin : Start Time (seconds)
        time_stamp_end : End Time (seconds)
        speaker_id : Speaker ID (when diarization is enabled, else None)
        language : Language code
        confidence : Confidence level
        """
        if len(audio_waveform.shape) == 1:
            channels = 1
        elif len(audio_waveform.shape) == 2:
            # Native API expects interleaved samples (see transcribe()).
            channels = audio_waveform.shape[0]
            audio_waveform = numpy.transpose(audio_waveform, (1, 0)).flatten()
        else:
            raise AiliaSpeechError(f"audio_waveform must be 1 channel or 2 channel", -1)

        audio_waveform = numpy.ascontiguousarray(audio_waveform.astype(numpy.float32))

        if lang is not None:
            self._check(dll.ailiaSpeechSetLanguage(self._instance, self._string_buffer(lang)))

        self._check(dll.ailiaSpeechPushInputData(self._instance, audio_waveform, channels, audio_waveform.shape[0] // channels, sampling_rate))
        if complete:
            # Final chunk: flush whatever audio remains in the input buffer.
            self._check(dll.ailiaSpeechFinalizeInputData(self._instance))

        # Process every window currently buffered by the native side.
        while True:
            buffered = ctypes.c_uint(0)
            self._check(dll.ailiaSpeechBuffered(self._instance, ctypes.byref(buffered)))
            if buffered.value == 0:
                break

            self._check(dll.ailiaSpeechTranscribe(self._instance))

            count = ctypes.c_uint(0)
            self._check(dll.ailiaSpeechGetTextCount(self._instance, ctypes.byref(count)))
            results = []  # NOTE(review): unused leftover; kept for byte-compatibility
            for i in range(count.value):
                text = AILIASpeechText()
                self._check(dll.ailiaSpeechGetText(self._instance, ctypes.byref(text), AILIA_SPEECH_TEXT_VERSION, i))
                yield {"text" : text.text.decode(), "time_stamp_begin" : text.time_stamp_begin, "time_stamp_end" : text.time_stamp_end, "speaker_id" : None if text.speaker_id == AILIA_SPEECH_SPEAKER_ID_UNKNOWN else text.speaker_id, "language" : text.language.decode(), "confidence" : text.confidence}

        if complete:
            # End of stream: reset so the instance can start a new session.
            self._check(dll.ailiaSpeechResetTranscribeState(self._instance))

    def _vad_model_name(self, vad_version):
        # Map a SileroVAD version string to its model file name.
        # v4 uses the historical unversioned name; later versions are suffixed.
        if vad_version == "4":
            vad_path = "silero_vad.onnx"
        elif vad_version == "5" or vad_version == "6" or vad_version == "6_2":
            vad_path = "silero_vad_v" + vad_version + ".onnx"
        else:
            raise Exception("Unknown vad_version")
        return vad_path

    def __del__(self):
        # Release the native context; the guard keeps a partially constructed
        # instance (where ailiaSpeechCreate failed) safe to finalize.
        if self._instance:
            dll.ailiaSpeechDestroy(cast(self._instance, c_void_p))
|
|
623
|
+
|
|
624
|
+
# ==============================================================================
|
|
625
|
+
# Public class
|
|
626
|
+
# ==============================================================================
|
|
627
|
+
|
|
628
|
+
class Whisper(AiliaSpeechModel):
    """Speech recognition using OpenAI Whisper models via ailia Speech."""

    def __init__(self, env_id = -1, num_thread = 0, memory_mode = 11, task = AILIA_SPEECH_TASK_TRANSCRIBE, flags = AILIA_SPEECH_FLAG_NONE, callback = None):
        # All construction is handled by the base class; see
        # AiliaSpeechModel.__init__ for parameter documentation.
        super().__init__(env_id = env_id, num_thread = num_thread, memory_mode = memory_mode, task = task, flags = flags, callback = callback)

    def initialize_model(self, model_path = "./", model_type = AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY, vad_type = AILIA_SPEECH_VAD_TYPE_SILERO, vad_version = "4", diarization_type = None, is_fp16 = False):
        """ Initialize and download the model.

        Parameters
        ----------
        model_path : string, optional, default : "./"
            Destination for saving the model file
        model_type : int, optional, default : AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY
            Type of model. Can be set to AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY, AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_BASE, AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_SMALL, AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_MEDIUM, AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE, AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3 or AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO.
        vad_type : int, optional, default : AILIA_SPEECH_VAD_TYPE_SILERO
            Type of VAD. Can be set to None or AILIA_SPEECH_VAD_TYPE_SILERO.
        vad_version : string, optional, default : "4"
            SileroVAD version. One of "4", "5", "6" or "6_2".
        diarization_type : int, optional, default : None
            Type of diarization. Can be set to None or AILIA_SPEECH_DIARIZATION_TYPE_PYANNOTE_AUDIO.
        is_fp16 : bool, optional, default : False
            Whether to use an FP16 model.
        """
        if "time_license" in ailia.get_version():
            ailia.check_and_download_license()
        # Select encoder/decoder file names for the requested model size.
        # Only the large variants use external weight (.pb) files.
        if model_type == AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY:
            if is_fp16:
                encoder_path = "encoder_tiny_fp16.opt3.onnx"
                decoder_path = "decoder_tiny_fix_kv_cache_fp16.opt3.onnx"
            else:
                encoder_path = "encoder_tiny.opt3.onnx"
                decoder_path = "decoder_tiny_fix_kv_cache.opt3.onnx"
            encoder_pb_path = None
            decoder_pb_path = None
        elif model_type == AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_BASE:
            if is_fp16:
                encoder_path = "encoder_base_fp16.opt3.onnx"
                decoder_path = "decoder_base_fix_kv_cache_fp16.opt3.onnx"
            else:
                encoder_path = "encoder_base.opt3.onnx"
                decoder_path = "decoder_base_fix_kv_cache.opt3.onnx"
            encoder_pb_path = None
            decoder_pb_path = None
        elif model_type == AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_SMALL:
            if is_fp16:
                encoder_path = "encoder_small_fp16.opt3.onnx"
                decoder_path = "decoder_small_fix_kv_cache_fp16.opt3.onnx"
            else:
                encoder_path = "encoder_small.opt3.onnx"
                decoder_path = "decoder_small_fix_kv_cache.opt3.onnx"
            encoder_pb_path = None
            decoder_pb_path = None
        elif model_type == AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_MEDIUM:
            if is_fp16:
                encoder_path = "encoder_medium_fp16.opt3.onnx"
                decoder_path = "decoder_medium_fix_kv_cache_fp16.opt3.onnx"
            else:
                encoder_path = "encoder_medium.opt3.onnx"
                decoder_path = "decoder_medium_fix_kv_cache.opt3.onnx"
            encoder_pb_path = None
            decoder_pb_path = None
        elif model_type == AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE:
            encoder_path = "encoder_large.onnx"
            decoder_path = "decoder_large_fix_kv_cache.onnx"
            encoder_pb_path = "encoder_large_weights.pb"
            decoder_pb_path = "decoder_large_fix_kv_cache_weights.pb"
        elif model_type == AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3:
            encoder_path = "encoder_large_v3.onnx"
            decoder_path = "decoder_large_v3_fix_kv_cache.onnx"
            encoder_pb_path = "encoder_large_v3_weights.pb"
            decoder_pb_path = "decoder_large_v3_fix_kv_cache_weights.pb"
        elif model_type == AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO:
            if is_fp16:
                encoder_path = "encoder_turbo_fp16.opt.onnx"
                decoder_path = "decoder_turbo_fix_kv_cache_fp16.opt.onnx"
                encoder_pb_path = None
            else:
                encoder_path = "encoder_turbo.opt.onnx"
                decoder_path = "decoder_turbo_fix_kv_cache.opt.onnx"
                encoder_pb_path = "encoder_turbo_weights.opt.pb"
            decoder_pb_path = None
            # Remapped before opening — presumably the turbo model shares the
            # large-v3 handling in the native library; confirm.
            model_type = AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3
        else:
            raise Exception("Unknown model type")
        self._download_model(model_path, encoder_path, decoder_path, encoder_pb_path, decoder_pb_path, vad_type, vad_version, diarization_type, model_type)
        self._open_model(model_path + encoder_path, model_path + decoder_path, model_type)
        if vad_type is not None:
            self._open_vad(model_path + self._vad_model_name(vad_version), vad_type)
            # Default silence handling; see set_silent_threshold for tuning.
            self.set_silent_threshold(0.5, 1.0, 1.0)
        if diarization_type is not None:
            self._open_diarization(model_path + "segmentation.onnx", model_path + "speaker-embedding.onnx", diarization_type)
|
|
719
|
+
|
|
720
|
+
class SenseVoice(AiliaSpeechModel):
    """Speech recognition using the SenseVoice model via ailia Speech."""

    def __init__(self, env_id = -1, num_thread = 0, memory_mode = 11, task = AILIA_SPEECH_TASK_TRANSCRIBE, flags = AILIA_SPEECH_FLAG_NONE, callback = None):
        # All construction is handled by the base class; see
        # AiliaSpeechModel.__init__ for parameter documentation.
        super().__init__(env_id = env_id, num_thread = num_thread, memory_mode = memory_mode, task = task, flags = flags, callback = callback)

    def initialize_model(self, model_path = "./", model_type = AILIA_SPEECH_MODEL_TYPE_SENSEVOICE_SMALL, vad_type = AILIA_SPEECH_VAD_TYPE_SILERO, vad_version = "4", diarization_type = None, is_fp16 = False):
        """ Initialize and download the model.

        Parameters
        ----------
        model_path : string, optional, default : "./"
            Destination for saving the model file
        model_type : int, optional, default : AILIA_SPEECH_MODEL_TYPE_SENSEVOICE_SMALL
            Type of model. Can be set to AILIA_SPEECH_MODEL_TYPE_SENSEVOICE_SMALL.
        vad_type : int, optional, default : AILIA_SPEECH_VAD_TYPE_SILERO
            Type of VAD. Can be set to None or AILIA_SPEECH_VAD_TYPE_SILERO.
        vad_version : string, optional, default : "4"
            SileroVAD version. One of "4", "5", "6" or "6_2".
        diarization_type : int, optional, default : None
            Type of diarization. Can be set to None or AILIA_SPEECH_DIARIZATION_TYPE_PYANNOTE_AUDIO.
        is_fp16 : bool, optional, default : False
            Whether to use an FP16 model.
        """

        if "time_license" in ailia.get_version():
            ailia.check_and_download_license()
        if model_type == AILIA_SPEECH_MODEL_TYPE_SENSEVOICE_SMALL:
            if is_fp16:
                encoder_path = "sensevoice_small_fp16.onnx"
            else:
                encoder_path = "sensevoice_small.onnx"
            # For SenseVoice the "decoder" slot carries the tokenizer model.
            decoder_path = "sensevoice_small.model"
            encoder_pb_path = None
            decoder_pb_path = None
        else:
            raise Exception("Unknown model type")
        self._download_model(model_path, encoder_path, decoder_path, encoder_pb_path, decoder_pb_path, vad_type, vad_version, diarization_type, model_type)
        self._open_model(model_path + encoder_path, model_path + decoder_path, model_type)
        if vad_type is not None:
            self._open_vad(model_path + self._vad_model_name(vad_version), vad_type)
            # Default silence handling; see set_silent_threshold for tuning.
            self.set_silent_threshold(0.5, 1.0, 1.0)
        if diarization_type is not None:
            self._open_diarization(model_path + "segmentation.onnx", model_path + "speaker-embedding.onnx", diarization_type)
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ailia_speech
|
|
3
|
+
Version: 1.5.0
|
|
4
|
+
Summary: ailia AI Speech
|
|
5
|
+
Home-page: https://ailia.ai/en/
|
|
6
|
+
Author: ailia Inc.
|
|
7
|
+
Author-email: contact@ailia.ai
|
|
8
|
+
License: https://ailia.ai/en/license/
|
|
9
|
+
Requires-Python: >3.6
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
Requires-Dist: ailia
|
|
12
|
+
Requires-Dist: ailia_tokenizer
|
|
13
|
+
Dynamic: author
|
|
14
|
+
Dynamic: author-email
|
|
15
|
+
Dynamic: description
|
|
16
|
+
Dynamic: description-content-type
|
|
17
|
+
Dynamic: home-page
|
|
18
|
+
Dynamic: license
|
|
19
|
+
Dynamic: requires-dist
|
|
20
|
+
Dynamic: requires-python
|
|
21
|
+
Dynamic: summary
|
|
22
|
+
|
|
23
|
+
# ailia AI Speech Python API
|
|
24
|
+
|
|
25
|
+
!! CAUTION !!
|
|
26
|
+
“ailia” IS NOT OPEN SOURCE SOFTWARE (OSS).
|
|
27
|
+
As long as the user complies with the conditions stated in the [License Document](https://ailia.ai/license/), the user may use the Software free of charge; however, the Software is fundamentally paid software.
|
|
28
|
+
|
|
29
|
+
## About ailia AI Speech
|
|
30
|
+
|
|
31
|
+
ailia AI Speech is a library to perform speech recognition using AI. It provides a C API for native applications, as well as a C# API well suited for Unity applications. Using ailia AI Speech, you can easily integrate AI powered speech recognition into your applications.
|
|
32
|
+
|
|
33
|
+
## Install from pip
|
|
34
|
+
|
|
35
|
+
You can install the ailia AI Speech free evaluation package with the following command.
|
|
36
|
+
|
|
37
|
+
```
|
|
38
|
+
pip3 install ailia_speech
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Install from package
|
|
42
|
+
|
|
43
|
+
You can install the ailia AI Speech from Package with the following command.
|
|
44
|
+
|
|
45
|
+
```
|
|
46
|
+
python3 bootstrap.py
|
|
47
|
+
pip3 install ./
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## Usage
|
|
51
|
+
|
|
52
|
+
### Batch mode
|
|
53
|
+
|
|
54
|
+
In batch mode, the entire audio is transcribed at once.
|
|
55
|
+
|
|
56
|
+
```python
|
|
57
|
+
import ailia_speech
|
|
58
|
+
|
|
59
|
+
import librosa
|
|
60
|
+
|
|
61
|
+
import os
|
|
62
|
+
import urllib.request
|
|
63
|
+
|
|
64
|
+
# Load target audio
|
|
65
|
+
input_file_path = "demo.wav"
|
|
66
|
+
if not os.path.exists(input_file_path):
|
|
67
|
+
urllib.request.urlretrieve(
|
|
68
|
+
"https://github.com/axinc-ai/ailia-models/raw/refs/heads/master/audio_processing/whisper/demo.wav",
|
|
69
|
+
"demo.wav"
|
|
70
|
+
)
|
|
71
|
+
audio_waveform, sampling_rate = librosa.load(input_file_path, mono = True)
|
|
72
|
+
|
|
73
|
+
# Model Initialize
|
|
74
|
+
speech = ailia_speech.Whisper()
|
|
75
|
+
model_type = ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO
|
|
76
|
+
|
|
77
|
+
# When using sensevoice
|
|
78
|
+
#speech = ailia_speech.SenseVoice()
|
|
79
|
+
#model_type = ailia_speech.AILIA_SPEECH_MODEL_TYPE_SENSEVOICE_SMALL
|
|
80
|
+
|
|
81
|
+
# Infer
|
|
82
|
+
speech.initialize_model(model_path = "./models/", model_type = model_type)
|
|
83
|
+
recognized_text = speech.transcribe(audio_waveform, sampling_rate)
|
|
84
|
+
for text in recognized_text:
|
|
85
|
+
print(text)
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
### Step mode
|
|
89
|
+
|
|
90
|
+
In step mode, the audio is input in chunks and transcribed sequentially.
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
import ailia_speech
|
|
94
|
+
|
|
95
|
+
import librosa
|
|
96
|
+
|
|
97
|
+
import os
|
|
98
|
+
import urllib.request
|
|
99
|
+
|
|
100
|
+
# Load target audio
|
|
101
|
+
input_file_path = "demo.wav"
|
|
102
|
+
if not os.path.exists(input_file_path):
|
|
103
|
+
urllib.request.urlretrieve(
|
|
104
|
+
"https://github.com/axinc-ai/ailia-models/raw/refs/heads/master/audio_processing/whisper/demo.wav",
|
|
105
|
+
"demo.wav"
|
|
106
|
+
)
|
|
107
|
+
audio_waveform, sampling_rate = librosa.load(input_file_path, mono = True)
|
|
108
|
+
|
|
109
|
+
# Infer
|
|
110
|
+
speech = ailia_speech.Whisper()
|
|
111
|
+
speech.initialize_model(model_path = "./models/", model_type = ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO)
|
|
112
|
+
speech.set_silent_threshold(silent_threshold = 0.5, speech_sec = 1.0, no_speech_sec = 0.5)
|
|
113
|
+
for i in range(0, audio_waveform.shape[0], sampling_rate):
|
|
114
|
+
complete = False
|
|
115
|
+
if i + sampling_rate >= audio_waveform.shape[0]:
|
|
116
|
+
complete = True
|
|
117
|
+
recognized_text = speech.transcribe_step(audio_waveform[i:min(audio_waveform.shape[0], i + sampling_rate)], sampling_rate, complete)
|
|
118
|
+
for text in recognized_text:
|
|
119
|
+
print(text)
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### Diarization mode
|
|
123
|
+
|
|
124
|
+
By specifying diarization_type, speaker diarization can be performed. When speaker diarization is enabled, speaker_id becomes valid.
|
|
125
|
+
|
|
126
|
+
```
|
|
127
|
+
speech.initialize_model(model_path = "./models/", model_type = ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO, diarization_type = ailia_speech.AILIA_SPEECH_DIARIZATION_TYPE_PYANNOTE_AUDIO)
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
### Available model types
|
|
131
|
+
|
|
132
|
+
It is possible to select multiple models according to accuracy and speed. LARGE_V3_TURBO is the most recommended.
|
|
133
|
+
|
|
134
|
+
Whisper
|
|
135
|
+
|
|
136
|
+
```
|
|
137
|
+
ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY
|
|
138
|
+
ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_BASE
|
|
139
|
+
ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_SMALL
|
|
140
|
+
ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_MEDIUM
|
|
141
|
+
ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE
|
|
142
|
+
ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3
|
|
143
|
+
ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
SenseVoice
|
|
147
|
+
|
|
148
|
+
```
|
|
149
|
+
ailia_speech.AILIA_SPEECH_MODEL_TYPE_SENSEVOICE_SMALL
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
### Available vad versions
|
|
153
|
+
|
|
154
|
+
By default, version "4" of SileroVAD is used. The version can be specified from "4", "5", "6", and "6_2".
|
|
155
|
+
|
|
156
|
+
```
|
|
157
|
+
speech.initialize_model(model_path = "./models/", vad_type = AILIA_SPEECH_VAD_TYPE_SILERO, vad_version = "6_2")
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
## API specification
|
|
161
|
+
|
|
162
|
+
https://github.com/axinc-ai/ailia-sdk
|
|
163
|
+
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
setup.py
|
|
3
|
+
ailia_speech/LICENSE_AILIA_EN.pdf
|
|
4
|
+
ailia_speech/LICENSE_AILIA_JA.pdf
|
|
5
|
+
ailia_speech/__init__.py
|
|
6
|
+
ailia_speech.egg-info/PKG-INFO
|
|
7
|
+
ailia_speech.egg-info/SOURCES.txt
|
|
8
|
+
ailia_speech.egg-info/dependency_links.txt
|
|
9
|
+
ailia_speech.egg-info/requires.txt
|
|
10
|
+
ailia_speech.egg-info/top_level.txt
|
|
11
|
+
ailia_speech/linux/arm64-v8a/libailia_speech.so
|
|
12
|
+
ailia_speech/linux/x64/libailia_speech.so
|
|
13
|
+
ailia_speech/mac/libailia_speech.dylib
|
|
14
|
+
ailia_speech/windows/x64/ailia_speech.dll
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
ailia_speech
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import sys
|
|
3
|
+
import platform
|
|
4
|
+
import glob
|
|
5
|
+
import shutil
|
|
6
|
+
import platform
|
|
7
|
+
|
|
8
|
+
from setuptools import setup, Extension
|
|
9
|
+
from setuptools import find_packages
|
|
10
|
+
|
|
11
|
+
# Read the README shipped with the sdist; it becomes the PyPI long description.
with open("README.md", encoding="utf-8") as readme_file:
    long_description = readme_file.read()

# Every Python module under the package directory is exposed as a script entry.
scripts = list(glob.glob("ailia_speech/*.py"))
|
|
17
|
+
|
|
18
|
+
def find_libraries():
    """Collect native shared libraries and license files bundled with the package.

    Scans each per-platform library directory for shared objects and returns
    their paths rewritten to be relative to the ``ailia_speech`` package
    directory, as required by ``package_data``. The license documents are
    always appended regardless of which libraries exist on disk.

    Returns:
        list[str]: Package-relative paths of every bundled binary/license file.
    """
    dll_names = []

    # Explicit platform-tag -> (library subdirectory, shared-library extension)
    # table. NOTE: the original loop variable was named ``platform``, which
    # shadowed the imported ``platform`` module; using tuples avoids that.
    platform_layout = [
        ("win32", "windows/x64", ".dll"),
        ("darwin", "mac", ".dylib"),
        ("linux_armv7l", "linux/armeabi-v7a", ".so"),
        ("linux_aarch64", "linux/arm64-v8a", ".so"),
        ("linux_x86_64", "linux/x64", ".so"),
    ]

    for _tag, dll_platform, dll_type in platform_layout:
        dll_path = "./ailia_speech/" + dll_platform + "/"
        for f in glob.glob(dll_path + "*" + dll_type):
            # Normalize Windows path separators, then strip the package prefix
            # so the path is relative to the ailia_speech package directory.
            f = f.replace("\\", "/")
            f = f.replace("./ailia_speech/", "./")
            dll_names.append(f)

    # License documents are shipped unconditionally.
    dll_names.append("./LICENSE_AILIA_EN.pdf")
    dll_names.append("./LICENSE_AILIA_JA.pdf")
    dll_names.append("./oss/LICENSE_SILERO_VAD.txt")
    dll_names.append("./oss/LICENSE_SRELL.txt")
    dll_names.append("./oss/LICENSE_WHISPER.txt")

    return dll_names
|
|
52
|
+
|
|
53
|
+
if __name__ == "__main__":
    # Assemble the package metadata in one mapping so the setup() call stays flat.
    setup_kwargs = dict(
        name="ailia_speech",
        scripts=scripts,
        version="1.5.0",
        install_requires=["ailia", "ailia_tokenizer"],
        description="ailia AI Speech",
        long_description=long_description,
        long_description_content_type="text/markdown",
        author="ailia Inc.",
        author_email="contact@ailia.ai",
        url="https://ailia.ai/en/",
        license="https://ailia.ai/en/license/",
        packages=find_packages(),
        # Bundle the native libraries and license files discovered on disk.
        package_data={"ailia_speech": find_libraries()},
        include_package_data=True,
        python_requires=">3.6",
    )
    setup(**setup_kwargs)
|