ailia-speech 1.3.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ailia-speech might be problematic. Click here for more details.
- ailia_speech-1.3.0.0/PKG-INFO +71 -0
- ailia_speech-1.3.0.0/README.md +58 -0
- ailia_speech-1.3.0.0/ailia_speech/LICENSE_AILIA_EN.pdf +0 -0
- ailia_speech-1.3.0.0/ailia_speech/LICENSE_AILIA_JA.pdf +0 -0
- ailia_speech-1.3.0.0/ailia_speech/__init__.py +493 -0
- ailia_speech-1.3.0.0/ailia_speech/linux/arm64-v8a/libailia_speech.so +0 -0
- ailia_speech-1.3.0.0/ailia_speech/linux/x64/libailia_speech.so +0 -0
- ailia_speech-1.3.0.0/ailia_speech/mac/libailia_speech.dylib +0 -0
- ailia_speech-1.3.0.0/ailia_speech/windows/x64/ailia_speech.dll +0 -0
- ailia_speech-1.3.0.0/ailia_speech.egg-info/PKG-INFO +71 -0
- ailia_speech-1.3.0.0/ailia_speech.egg-info/SOURCES.txt +14 -0
- ailia_speech-1.3.0.0/ailia_speech.egg-info/dependency_links.txt +1 -0
- ailia_speech-1.3.0.0/ailia_speech.egg-info/requires.txt +2 -0
- ailia_speech-1.3.0.0/ailia_speech.egg-info/top_level.txt +1 -0
- ailia_speech-1.3.0.0/setup.cfg +4 -0
- ailia_speech-1.3.0.0/setup.py +73 -0
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: ailia_speech
|
|
3
|
+
Version: 1.3.0.0
|
|
4
|
+
Summary: ailia AI Speech
|
|
5
|
+
Home-page: https://ailia.jp/
|
|
6
|
+
Author: ax Inc.
|
|
7
|
+
Author-email: contact@axinc.jp
|
|
8
|
+
License: https://ailia.ai/en/license/
|
|
9
|
+
Requires-Python: >3.6
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
Requires-Dist: ailia
|
|
12
|
+
Requires-Dist: ailia_tokenizer
|
|
13
|
+
|
|
14
|
+
# ailia AI Speech Python API
|
|
15
|
+
|
|
16
|
+
!! CAUTION !!
|
|
17
|
+
“ailia” IS NOT OPEN SOURCE SOFTWARE (OSS).
|
|
18
|
+
As long as the user complies with the conditions stated in the [License Document](https://ailia.ai/license/), the user may use the Software free of charge, but the Software is basically paid software.
|
|
19
|
+
|
|
20
|
+
## About ailia AI Speech
|
|
21
|
+
|
|
22
|
+
ailia Speech is a library to perform speech recognition using AI. It provides a C API for native applications, as well as a C# API well suited for Unity applications. Using ailia Speech, you can easily integrate AI powered speech recognition into your applications.
|
|
23
|
+
|
|
24
|
+
## Install from pip
|
|
25
|
+
|
|
26
|
+
You can install the ailia SDK free evaluation package with the following command.
|
|
27
|
+
|
|
28
|
+
```
|
|
29
|
+
pip3 install ailia_speech
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Install from package
|
|
33
|
+
|
|
34
|
+
You can install the ailia SDK from Package with the following command.
|
|
35
|
+
|
|
36
|
+
```
|
|
37
|
+
python3 bootstrap.py
|
|
38
|
+
pip3 install ./
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Usage
|
|
42
|
+
|
|
43
|
+
```python
|
|
44
|
+
import ailia
|
|
45
|
+
import ailia_speech
|
|
46
|
+
|
|
47
|
+
import librosa
|
|
48
|
+
|
|
49
|
+
import os
|
|
50
|
+
import urllib.request
|
|
51
|
+
|
|
52
|
+
# Load target audio
|
|
53
|
+
ref_file_path = "demo.wav"
|
|
54
|
+
if not os.path.exists(ref_file_path):
|
|
55
|
+
urllib.request.urlretrieve(
|
|
56
|
+
"https://github.com/axinc-ai/ailia-models/raw/refs/heads/master/audio_processing/whisper/demo.wav",
|
|
57
|
+
"demo.wav"
|
|
58
|
+
)
|
|
59
|
+
audio_waveform, sampling_rate = librosa.load(ref_file_path, mono=True)
|
|
60
|
+
|
|
61
|
+
# Infer
|
|
62
|
+
speech = ailia_speech.Whisper()
|
|
63
|
+
speech.initialize_model(model_path = "./models/", model_type = ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_SMALL)
|
|
64
|
+
recognized_text = speech.transcribe(audio_waveform, sampling_rate)
|
|
65
|
+
print(recognized_text)
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## API specification
|
|
69
|
+
|
|
70
|
+
https://github.com/axinc-ai/ailia-sdk
|
|
71
|
+
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# ailia AI Speech Python API
|
|
2
|
+
|
|
3
|
+
!! CAUTION !!
|
|
4
|
+
“ailia” IS NOT OPEN SOURCE SOFTWARE (OSS).
|
|
5
|
+
As long as the user complies with the conditions stated in the [License Document](https://ailia.ai/license/), the user may use the Software free of charge, but the Software is basically paid software.
|
|
6
|
+
|
|
7
|
+
## About ailia AI Speech
|
|
8
|
+
|
|
9
|
+
ailia Speech is a library to perform speech recognition using AI. It provides a C API for native applications, as well as a C# API well suited for Unity applications. Using ailia Speech, you can easily integrate AI powered speech recognition into your applications.
|
|
10
|
+
|
|
11
|
+
## Install from pip
|
|
12
|
+
|
|
13
|
+
You can install the ailia SDK free evaluation package with the following command.
|
|
14
|
+
|
|
15
|
+
```
|
|
16
|
+
pip3 install ailia_speech
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## Install from package
|
|
20
|
+
|
|
21
|
+
You can install the ailia SDK from Package with the following command.
|
|
22
|
+
|
|
23
|
+
```
|
|
24
|
+
python3 bootstrap.py
|
|
25
|
+
pip3 install ./
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## Usage
|
|
29
|
+
|
|
30
|
+
```python
|
|
31
|
+
import ailia
|
|
32
|
+
import ailia_speech
|
|
33
|
+
|
|
34
|
+
import librosa
|
|
35
|
+
|
|
36
|
+
import os
|
|
37
|
+
import urllib.request
|
|
38
|
+
|
|
39
|
+
# Load target audio
|
|
40
|
+
ref_file_path = "demo.wav"
|
|
41
|
+
if not os.path.exists(ref_file_path):
|
|
42
|
+
urllib.request.urlretrieve(
|
|
43
|
+
"https://github.com/axinc-ai/ailia-models/raw/refs/heads/master/audio_processing/whisper/demo.wav",
|
|
44
|
+
"demo.wav"
|
|
45
|
+
)
|
|
46
|
+
audio_waveform, sampling_rate = librosa.load(ref_file_path, mono=True)
|
|
47
|
+
|
|
48
|
+
# Infer
|
|
49
|
+
speech = ailia_speech.Whisper()
|
|
50
|
+
speech.initialize_model(model_path = "./models/", model_type = ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_SMALL)
|
|
51
|
+
recognized_text = speech.transcribe(audio_waveform, sampling_rate)
|
|
52
|
+
print(recognized_text)
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## API specification
|
|
56
|
+
|
|
57
|
+
https://github.com/axinc-ai/ailia-sdk
|
|
58
|
+
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,493 @@
|
|
|
1
|
+
import ctypes
|
|
2
|
+
import os
|
|
3
|
+
import sys
|
|
4
|
+
|
|
5
|
+
import numpy
|
|
6
|
+
import ailia
|
|
7
|
+
import ailia.audio
|
|
8
|
+
import ailia_tokenizer
|
|
9
|
+
|
|
10
|
+
import urllib.request
|
|
11
|
+
import ssl
|
|
12
|
+
import shutil
|
|
13
|
+
import platform
|
|
14
|
+
|
|
15
|
+
#### dependency check
if sys.platform == "win32":
    # Probe for the MSVC runtime DLLs up front so that a missing
    # redistributable produces a readable hint. The probe is best-effort:
    # a failure only prints a warning and never aborts the import.
    try:
        for library in ["vcruntime140.dll", "vcruntime140_1.dll", "msvcp140.dll"]:
            ctypes.windll.LoadLibrary(library)
    except OSError:
        # ctypes reports a library that cannot be loaded as OSError; anything
        # else (e.g. KeyboardInterrupt) should not be swallowed here.
        print(" WARNING Please install MSVC 2015-2019 runtime from https://docs.microsoft.com/ja-jp/cpp/windows/latest-supported-vc-redist")
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
#### loading DLL / DYLIB / SO ####
# Pick the per-platform sub-directory, file name and ctypes loader for the
# native ailia_speech library.
if sys.platform == "win32":
    dll_platform = "windows/x64"
    dll_name = "ailia_speech.dll"
    load_fn = ctypes.WinDLL
elif sys.platform == "darwin":
    dll_platform = "mac"
    dll_name = "libailia_speech.dylib"
    load_fn = ctypes.CDLL
else:
    is_arm = "arm" in platform.machine() or platform.machine() == "aarch64"
    if is_arm:
        if platform.architecture()[0] == "32bit":
            dll_platform = "linux/armeabi-v7a"
        else:
            dll_platform = "linux/arm64-v8a"
    else:
        dll_platform = "linux/x64"
    dll_name = "libailia_speech.so"
    load_fn = ctypes.CDLL

dll_found = False
_package_dir = os.path.dirname(os.path.abspath(__file__)) + os.sep
# Search order: default loader path, the package directory, then the
# platform-specific sub-directory of the package.
candidate = ["", _package_dir, _package_dir + dll_platform + os.sep]
_load_error = None
for _candidate_dir in candidate:
    # Every candidate is tried (no early exit), so when several locations
    # load successfully the LAST one ends up bound to `dll`.
    try:
        dll = load_fn(_candidate_dir + dll_name)
        dll_found = True
    except OSError as _error:
        # Remember the most recent failure so it can be chained below.
        _load_error = _error
if not dll_found:
    msg = "DLL load failed : \'" + dll_name + "\' is not found"
    raise ImportError(msg) from _load_error
|
|
57
|
+
|
|
58
|
+
# ==============================================================================
|
|
59
|
+
|
|
60
|
+
from ctypes import *
|
|
61
|
+
|
|
62
|
+
# Status code that _check() treats as success.
AILIA_SPEECH_STATUS_SUCCESS = ( 0 )

# Model identifiers accepted by Whisper.initialize_model().
AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY = (0)
AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_BASE = (1)
AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_SMALL = (2)
AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_MEDIUM = (3)
AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE = (4)
AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3 = (5)

AILIA_SPEECH_TASK_TRANSCRIBE = (0)
AILIA_SPEECH_TASK_TRANSLATE = (1)

AILIA_SPEECH_FLAG_NONE = (0)
AILIA_SPEECH_FLAG_LIVE = (1)

AILIA_SPEECH_VAD_TYPE_SILERO = (0)
AILIA_SPEECH_API_CALLBACK_VERSION = (6)

AILIA_SPEECH_TEXT_VERSION = (2)

# Function-pointer prototypes for the entries of AILIASpeechApiCallback.
# First CFUNCTYPE argument is the return type, the rest are parameter types.

# ailia.audio entry points
# NOTE(review): the return type was previously missing here, which made
# POINTER(c_int) the restype; the signature now follows the same shape as
# AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_RESAMPLE_LEN (status return, output
# pointer first) - confirm against the ailia audio C header.
AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_FRAME_LEN = CFUNCTYPE((c_int), POINTER(c_int), c_int, c_int, c_int, c_int)
AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_MEL_SPECTROGRAM = CFUNCTYPE((c_int), c_void_p, c_void_p, c_int, c_int, c_int, c_int, c_int, c_int, c_int, c_int, c_float, c_int, c_float, c_float, c_int, c_int, c_int)

# ailia_tokenizer entry points
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_CREATE = CFUNCTYPE((c_int), POINTER(c_void_p) , c_int, c_int)
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_OPEN_MODEL_FILE_A = CFUNCTYPE((c_int), c_void_p , c_char_p)
# Wide-character path is passed as a pointer, matching
# AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_FILE_W (was a by-value c_wchar).
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_OPEN_MODEL_FILE_W = CFUNCTYPE((c_int), c_void_p , POINTER(c_wchar))
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_ENCODE = CFUNCTYPE((c_int), c_void_p , c_char_p)
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TOKEN_COUNT = CFUNCTYPE((c_int), c_void_p , POINTER(c_uint))
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TOKENS = CFUNCTYPE((c_int), c_void_p , POINTER(c_int) , c_uint)
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_DECODE = CFUNCTYPE((c_int), c_void_p , POINTER(c_int), c_uint)
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TEXT_LENGTH = CFUNCTYPE((c_int), c_void_p , POINTER(c_uint))
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TEXT = CFUNCTYPE((c_int), c_void_p , c_char_p , c_uint)
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_DESTROY = CFUNCTYPE((c_int), c_void_p)
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_UTF8_TO_UTF32 = CFUNCTYPE((c_int), POINTER(c_uint) , POINTER(c_uint) , c_char_p , c_uint)
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_UTF32_TO_UTF8 = CFUNCTYPE((c_int), c_char_p, POINTER(c_uint) , c_uint)

# resampling (ailia.audio) and ailia core entry points
AILIA_SPEECH_USER_API_AILIA_AUDIO_RESAMPLE = CFUNCTYPE((c_int), POINTER(c_float), POINTER(c_float), c_int, c_int, c_int, c_int)
AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_RESAMPLE_LEN = CFUNCTYPE((c_int), POINTER(c_int), c_int, c_int, c_int)
AILIA_SPEECH_USER_API_AILIA_CREATE = CFUNCTYPE((c_int), POINTER(c_void_p), c_int, c_int)
AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_FILE_A = CFUNCTYPE((c_int), c_void_p, c_char_p)
AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_FILE_W = CFUNCTYPE((c_int), c_void_p, POINTER(c_wchar))
AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_MEM = CFUNCTYPE((c_int), c_void_p, POINTER(c_byte), c_uint)
AILIA_SPEECH_USER_API_AILIA_SET_MEMORY_MODE = CFUNCTYPE((c_int), c_void_p, c_uint)
AILIA_SPEECH_USER_API_AILIA_DESTROY = CFUNCTYPE((None), c_void_p)
AILIA_SPEECH_USER_API_AILIA_UPDATE = CFUNCTYPE((c_int), c_void_p)
AILIA_SPEECH_USER_API_AILIA_GET_BLOB_INDEX_BY_INPUT_INDEX = CFUNCTYPE((c_int), c_void_p, POINTER(c_uint), c_uint)
AILIA_SPEECH_USER_API_AILIA_GET_BLOB_INDEX_BY_OUTPUT_INDEX = CFUNCTYPE((c_int), c_void_p, POINTER(c_uint), c_uint)
AILIA_SPEECH_USER_API_AILIA_GET_BLOB_DATA = CFUNCTYPE((c_int), c_void_p, POINTER(c_float), c_uint, c_uint)
AILIA_SPEECH_USER_API_AILIA_SET_INPUT_BLOB_DATA = CFUNCTYPE((c_int), c_void_p, POINTER(c_float), c_uint, c_uint)
AILIA_SPEECH_USER_API_AILIA_SET_INPUT_BLOB_SHAPE = CFUNCTYPE((c_int), c_void_p, c_void_p, c_uint, c_uint)
AILIA_SPEECH_USER_API_AILIA_GET_BLOB_SHAPE = CFUNCTYPE((c_int), c_void_p, c_void_p, c_uint, c_uint)
AILIA_SPEECH_USER_API_AILIA_GET_ERROR_DETAIL = CFUNCTYPE((c_char_p), c_void_p)

AILIA_SPEECH_USER_API_AILIA_COPY_BLOB_DATA = CFUNCTYPE((c_int), c_void_p, c_uint, c_void_p, c_uint)
AILIA_SPEECH_USER_API_AILIA_GET_ENVIRONMENT = CFUNCTYPE((c_int), POINTER(c_void_p), c_uint, c_uint)
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
class struct__AILIASpeechApiCallback(Structure):
    """Table of function pointers that is passed to ailiaSpeechCreate.

    The field layout is attached after the class statement, from
    _API_CALLBACK_LAYOUT below.
    """


# (field name, function-pointer prototype) pairs; the order here is the
# field order of the structure.
_API_CALLBACK_LAYOUT = (
    # audio
    ('ailiaAudioGetFrameLen', AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_FRAME_LEN),
    ('ailiaAudioGetMelSpectrogram', AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_MEL_SPECTROGRAM),
    ('ailiaAudioResample', AILIA_SPEECH_USER_API_AILIA_AUDIO_RESAMPLE),
    ('ailiaAudioGetResampleLen', AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_RESAMPLE_LEN),
    # tokenizer
    ('ailiaTokenizerCreate', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_CREATE),
    ('ailiaTokenizerOpenModelFileA', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_OPEN_MODEL_FILE_A),
    ('ailiaTokenizerOpenModelFileW', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_OPEN_MODEL_FILE_W),
    ('ailiaTokenizerEncode', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_ENCODE),
    ('ailiaTokenizerGetTokenCount', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TOKEN_COUNT),
    ('ailiaTokenizerGetTokens', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TOKENS),
    ('ailiaTokenizerDecode', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_DECODE),
    ('ailiaTokenizerGetTextLength', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TEXT_LENGTH),
    ('ailiaTokenizerGetText', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TEXT),
    ('ailiaTokenizerDestroy', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_DESTROY),
    ('ailiaTokenizerUtf8ToUtf32', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_UTF8_TO_UTF32),
    ('ailiaTokenizerUtf32ToUtf8', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_UTF32_TO_UTF8),
    # core
    ('ailiaCreate', AILIA_SPEECH_USER_API_AILIA_CREATE),
    ('ailiaOpenWeightFileA', AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_FILE_A),
    ('ailiaOpenWeightFileW', AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_FILE_W),
    ('ailiaOpenWeightMem', AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_MEM),
    ('ailiaSetMemoryMode', AILIA_SPEECH_USER_API_AILIA_SET_MEMORY_MODE),
    ('ailiaDestroy', AILIA_SPEECH_USER_API_AILIA_DESTROY),
    ('ailiaUpdate', AILIA_SPEECH_USER_API_AILIA_UPDATE),
    ('ailiaGetBlobIndexByInputIndex', AILIA_SPEECH_USER_API_AILIA_GET_BLOB_INDEX_BY_INPUT_INDEX),
    ('ailiaGetBlobIndexByOutputIndex', AILIA_SPEECH_USER_API_AILIA_GET_BLOB_INDEX_BY_OUTPUT_INDEX),
    ('ailiaGetBlobData', AILIA_SPEECH_USER_API_AILIA_GET_BLOB_DATA),
    ('ailiaSetInputBlobData', AILIA_SPEECH_USER_API_AILIA_SET_INPUT_BLOB_DATA),
    ('ailiaSetInputBlobShape', AILIA_SPEECH_USER_API_AILIA_SET_INPUT_BLOB_SHAPE),
    ('ailiaGetBlobShape', AILIA_SPEECH_USER_API_AILIA_GET_BLOB_SHAPE),
    ('ailiaGetErrorDetail', AILIA_SPEECH_USER_API_AILIA_GET_ERROR_DETAIL),
    ('ailiaCopyBlobData', AILIA_SPEECH_USER_API_AILIA_COPY_BLOB_DATA),
    ('ailiaGetEnvironment', AILIA_SPEECH_USER_API_AILIA_GET_ENVIRONMENT),
)

# Both attributes are assigned after class creation, as before.
struct__AILIASpeechApiCallback.__slots__ = [field_name for field_name, _ in _API_CALLBACK_LAYOUT]
struct__AILIASpeechApiCallback._fields_ = list(_API_CALLBACK_LAYOUT)

AILIASpeechApiCallback = struct__AILIASpeechApiCallback
|
|
196
|
+
|
|
197
|
+
# ==============================================================================
|
|
198
|
+
|
|
199
|
+
# Declare return and argument types for the ailia_speech entry points used by
# this module. Each row is (symbol, restype, argtypes).
_float32_samples = numpy.ctypeslib.ndpointer(
    dtype=numpy.float32, flags='CONTIGUOUS'
)  # src

for _symbol, _restype, _argtypes in (
    ("ailiaSpeechCreate", c_int,
     (POINTER(c_void_p), c_int32, c_int32, c_int32, c_int32, c_int32, AILIASpeechApiCallback, c_int32)),
    ("ailiaSpeechDestroy", None, (c_void_p, )),
    ("ailiaSpeechOpenModelFileA", c_int, (c_void_p, c_char_p, c_char_p, c_int32)),
    ("ailiaSpeechOpenModelFileW", c_int, (c_void_p, c_wchar_p, c_wchar_p, c_int32)),
    ("ailiaSpeechOpenVadFileA", c_int, (c_void_p, c_char_p, c_int32)),
    ("ailiaSpeechOpenVadFileW", c_int, (c_void_p, c_wchar_p, c_int32)),
    ("ailiaSpeechPushInputData", c_int,
     (c_void_p, _float32_samples, ctypes.c_uint, ctypes.c_uint, ctypes.c_uint)),
    ("ailiaSpeechFinalizeInputData", c_int, (c_void_p, )),
    ("ailiaSpeechBuffered", c_int, (c_void_p, POINTER(ctypes.c_uint))),
    ("ailiaSpeechComplete", c_int, (c_void_p, POINTER(ctypes.c_uint))),
    ("ailiaSpeechTranscribe", c_int, (c_void_p, )),
    ("ailiaSpeechGetTextCount", c_int, (c_void_p, POINTER(ctypes.c_uint))),
):
    _entry_point = getattr(dll, _symbol)
    _entry_point.restype = _restype
    _entry_point.argtypes = _argtypes
|
|
239
|
+
|
|
240
|
+
class AILIASpeechText(ctypes.Structure):
    """ctypes record filled in by ailiaSpeechGetText for one recognised text entry."""

    _fields_ = [
        ("text", ctypes.c_char_p),
        ("time_stamp_begin", ctypes.c_float),
        ("time_stamp_end", ctypes.c_float),
        ("person_id", ctypes.c_uint),
        ("language", ctypes.c_char_p),
        ("confidence", ctypes.c_float),
    ]
|
|
248
|
+
|
|
249
|
+
# Python-side prototype of the intermediate-result callback: receives the
# registered 64-bit handle and a byte string, returns an int.
AILIA_SPEECH_USER_API_INTERMEDIATE_CALLBACK = CFUNCTYPE((c_int), c_int64, c_char_p)

# Remaining ailia_speech entry points; all of them return an int status.
for _text_symbol, _text_argtypes in (
    ("ailiaSpeechGetText", (c_void_p, POINTER(AILIASpeechText), ctypes.c_uint, ctypes.c_uint)),
    ("ailiaSpeechResetTranscribeState", (c_void_p, )),
    ("ailiaSpeechSetIntermediateCallback", (c_void_p, AILIA_SPEECH_USER_API_INTERMEDIATE_CALLBACK, c_int64)),
    ("ailiaSpeechSetLanguage", (c_void_p, c_char_p)),
):
    _text_entry_point = getattr(dll, _text_symbol)
    _text_entry_point.restype = c_int
    _text_entry_point.argtypes = _text_argtypes
|
|
262
|
+
|
|
263
|
+
# ==============================================================================
|
|
264
|
+
# model download
|
|
265
|
+
# ==============================================================================
|
|
266
|
+
|
|
267
|
+
def progress_print(block_count, block_size, total_size):
    """Print a one-line download progress bar (urlretrieve report hook).

    Args:
        block_count: number of blocks transferred so far.
        block_size: size of one block in bytes.
        total_size: total size in bytes; zero or negative means the size is
            unknown, in which case only the transferred amount is shown.
    """
    if total_size <= 0:
        # Without a usable total a percentage cannot be computed
        # (0 would divide by zero, -1 would give a negative bar).
        received_kb = block_count * block_size / 1024
        print(f'[ {received_kb:.0f}KB ]', end='\r')
        return

    # Bigger than 100 does not look good, so clamp.
    percentage = min(100.0 * block_count * block_size / total_size, 100.0)
    max_bar = 50
    bar_num = int(percentage / (100 / max_bar))
    progress_element = '=' * bar_num
    if bar_num != max_bar:
        progress_element += '>'
    bar = progress_element.ljust(max_bar, ' ')  # pad to a fixed width
    total_size_kb = total_size / 1024
    print(f'[{bar} {percentage:.2f}% ( {total_size_kb:.0f}KB )]', end='\r')
|
|
281
|
+
|
|
282
|
+
def urlretrieve(remote_path, weight_path, progress_print):
    """Download remote_path to weight_path via a temporary ".tmp" file.

    The data is written to ``weight_path + ".tmp"`` first and moved into
    place only after the transfer finished, so an interrupted download never
    leaves a truncated file at weight_path.

    Args:
        remote_path: URL to fetch.
        weight_path: final destination path.
        progress_print: report hook forwarded to urllib.request.urlretrieve.

    Raises:
        Whatever urllib.request.urlretrieve or shutil.move raise; the
        temporary file is removed before the exception propagates.
    """
    temp_path = weight_path + ".tmp"
    try:
        try:
            #raise ssl.SSLError # test
            urllib.request.urlretrieve(
                remote_path,
                temp_path,
                progress_print,
            )
        except ssl.SSLError:
            # SECURITY NOTE(review): this retries over plain HTTP, so the
            # downloaded file is no longer protected against tampering in
            # transit. Kept as a deliberate best-effort fallback.
            print(f'SSLError detected, so try to download without ssl')
            # Only rewrite the scheme; a blanket str.replace would also
            # mangle "https" occurring later in the URL.
            if remote_path.startswith("https://"):
                remote_path = "http://" + remote_path[len("https://"):]
            urllib.request.urlretrieve(
                remote_path,
                temp_path,
                progress_print,
            )
        shutil.move(temp_path, weight_path)
    finally:
        # After a successful move the temp file is gone; after a failure
        # this removes the partial download.
        if os.path.exists(temp_path):
            os.remove(temp_path)
|
|
300
|
+
|
|
301
|
+
def check_and_download_file(file_path, remote_path):
    """Fetch file_path from remote_path unless it already exists locally.

    The download URL is remote_path followed by the base name of file_path.
    """
    if os.path.exists(file_path):
        return
    print('Downloading %s...' % file_path)
    source_url = remote_path + os.path.basename(file_path)
    urlretrieve(source_url, file_path, progress_print)
|
|
305
|
+
|
|
306
|
+
# ==============================================================================
|
|
307
|
+
# base model class
|
|
308
|
+
# ==============================================================================
|
|
309
|
+
|
|
310
|
+
class AiliaSpeechError(RuntimeError):
    """Error raised by this module; carries the numeric status in ``code``."""

    def __init__(self, message, code):
        text = f"{message} code:{code}"
        super().__init__(text)
        self.code = code
|
|
314
|
+
|
|
315
|
+
class AiliaSpeechModel:
    """Base class with helpers shared by the ailia Speech model wrappers."""

    _api_callback = None
    _instance = None

    def _check(self, status):
        """Raise AiliaSpeechError unless status is AILIA_SPEECH_STATUS_SUCCESS."""
        if status == AILIA_SPEECH_STATUS_SUCCESS:
            return
        raise AiliaSpeechError(f"ailia speech error", status)

    def _string_buffer(self, path):
        """Return path as a ctypes buffer: unicode on Windows, UTF-8 bytes elsewhere."""
        if sys.platform != "win32":
            return ctypes.create_string_buffer(path.encode("utf-8"))
        return ctypes.create_unicode_buffer(path)

    def _create_callback(self):
        """Build the AILIASpeechApiCallback table and store it on the instance.

        Every field is bound to the export of the same name: "ailiaAudio*"
        from the ailia audio library, "ailiaTokenizer*" from the
        ailia_tokenizer library and everything else from the ailia core
        library.
        """
        table = AILIASpeechApiCallback()
        for field_name, prototype in AILIASpeechApiCallback._fields_:
            if field_name.startswith("ailiaAudio"):
                library = ailia.audio.audio_core.dll
            elif field_name.startswith("ailiaTokenizer"):
                library = ailia_tokenizer.dll
            else:
                library = ailia.core.dll
            setattr(table, field_name, prototype((field_name, library)))

        self._api_callback = table # prevent GC
|
|
367
|
+
|
|
368
|
+
# ==============================================================================
|
|
369
|
+
# Public class
|
|
370
|
+
# ==============================================================================
|
|
371
|
+
|
|
372
|
+
# Next handle to hand out, and the handle -> Python callable registry used
# by intermediate_callback().
intermediate_callback_cnt = 0
intermediate_callback_map = {}

def intermediate_callback(handle, text):
    """Forward decoded text to the callable registered under handle.

    Returns 0 after the callable has been invoked.
    """
    user_callback = intermediate_callback_map[handle]
    decoded = text.decode()
    user_callback(decoded)
    return 0
|
|
378
|
+
|
|
379
|
+
class Whisper(AiliaSpeechModel):
|
|
380
|
+
_c_callback = None
|
|
381
|
+
|
|
382
|
+
def __init__(self, env_id = -1, num_thread = 0, memory_mode = 11, task = AILIA_SPEECH_TASK_TRANSCRIBE, flags = AILIA_SPEECH_FLAG_NONE, callback = None):
    """Create the native speech instance and optionally register a callback.

    env_id, num_thread, memory_mode, task and flags are forwarded to
    ailiaSpeechCreate as 32-bit integers. When callback is given, it is
    stored in the module-level registry and receives the decoded
    intermediate text.

    Raises:
        AiliaSpeechError: if a native call reports a non-success status.
    """
    global intermediate_callback_cnt
    global intermediate_callback_map

    self._instance = ctypes.c_void_p(None)
    self._create_callback()
    instance_ref = cast(pointer(self._instance), POINTER(c_void_p))
    status = dll.ailiaSpeechCreate(
        instance_ref,
        ctypes.c_int32(env_id),
        ctypes.c_int32(num_thread),
        ctypes.c_int32(memory_mode),
        ctypes.c_int32(task),
        ctypes.c_int32(flags),
        self._api_callback,
        ctypes.c_int32(AILIA_SPEECH_API_CALLBACK_VERSION),
    )
    self._check(status)

    if callback is None:
        return

    # Keep the ctypes thunk on the instance so it is not garbage collected.
    self._c_callback = AILIA_SPEECH_USER_API_INTERMEDIATE_CALLBACK(intermediate_callback)
    intermediate_callback_map[intermediate_callback_cnt] = callback
    status = dll.ailiaSpeechSetIntermediateCallback(self._instance, self._c_callback, intermediate_callback_cnt)
    self._check(status)
    intermediate_callback_cnt += 1
|
|
393
|
+
|
|
394
|
+
|
|
395
|
+
def initialize_model(self, model_path = "./", model_type = AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY):
    """Download (if missing) and open the Whisper and VAD model files.

    Args:
        model_path: directory for the model files. A missing trailing
            path separator is added; an empty string is used as is.
        model_type: one of the AILIA_SPEECH_MODEL_TYPE_WHISPER_* constants.

    Raises:
        AiliaSpeechError: if model_type is not a known model type, or a
            native open call reports a non-success status.
    """
    # model type -> (encoder, decoder, encoder weights, decoder weights);
    # only the large models ship separate weight files.
    model_files = {
        AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY:
            ("encoder_tiny.opt.onnx", "decoder_tiny_fix_kv_cache.opt2.onnx", None, None),
        AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_BASE:
            ("encoder_base.opt.onnx", "decoder_base_fix_kv_cache.opt2.onnx", None, None),
        AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_SMALL:
            ("encoder_small.opt.onnx", "decoder_small_fix_kv_cache.opt2.onnx", None, None),
        AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_MEDIUM:
            ("encoder_medium.opt.onnx", "decoder_medium_fix_kv_cache.opt2.onnx", None, None),
        AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE:
            ("encoder_large.onnx", "decoder_large_fix_kv_cache.onnx",
             "encoder_large_weights.pb", "decoder_large_fix_kv_cache_weights.pb"),
        AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3:
            ("encoder_large_v3.onnx", "decoder_large_v3_fix_kv_cache.onnx",
             "encoder_large_v3_weights.pb", "decoder_large_v3_fix_kv_cache_weights.pb"),
    }
    files = model_files.get(model_type)
    if files is None:
        # Previously an unknown type surfaced as an UnboundLocalError.
        raise AiliaSpeechError(f"unsupported model_type {model_type}", -1)
    encoder_path, decoder_path, encoder_pb_path, decoder_pb_path = files

    # File names are appended by plain concatenation, so make sure the
    # directory ends with a separator ("./models" -> "./models/").
    if model_path and not model_path.endswith(("/", os.sep)):
        model_path = model_path + os.sep

    self._download_model(model_path, encoder_path, decoder_path, encoder_pb_path, decoder_pb_path)
    self._open_model(model_path + encoder_path, model_path + decoder_path, model_type)
    self._open_vad(model_path + "silero_vad.onnx", AILIA_SPEECH_VAD_TYPE_SILERO)
|
|
429
|
+
|
|
430
|
+
def _download_model(self, model_path, encoder_path, decoder_path, encoder_pb_path, decoder_pb_path):
    """Fetch the Whisper model files and the Silero VAD model into ``model_path``.

    ``model_path`` is created when missing and is then used as a plain string
    prefix (``model_path + file_name``), so it must end with a path separator.

    Args:
        model_path: Destination directory prefix.
        encoder_path: File name of the encoder model.
        decoder_path: File name of the decoder model.
        encoder_pb_path: File name of the encoder ``.pb`` weights file, or
            ``None`` when there is none to fetch.
        decoder_pb_path: File name of the decoder ``.pb`` weights file, or
            ``None`` when there is none to fetch.
    """
    whisper_url = "https://storage.googleapis.com/ailia-models/whisper/"
    vad_url = "https://storage.googleapis.com/ailia-models/silero-vad/"

    os.makedirs(model_path, exist_ok=True)

    # NOTE(review): check_and_download_file is defined elsewhere in this
    # module; presumably it only downloads files that are missing - confirm.
    for required_name in (encoder_path, decoder_path):
        check_and_download_file(model_path + required_name, whisper_url)

    # The weights files are optional and skipped when not specified.
    for optional_name in (encoder_pb_path, decoder_pb_path):
        if optional_name is not None:
            check_and_download_file(model_path + optional_name, whisper_url)

    check_and_download_file(model_path + "silero_vad.onnx", vad_url)
|
|
442
|
+
|
|
443
|
+
def _open_model(self, encoder, decoder, model_type):
    """Open the encoder and decoder model files on the native instance.

    Args:
        encoder: Path of the encoder model file.
        decoder: Path of the decoder model file.
        model_type: One of the ``AILIA_SPEECH_MODEL_TYPE_*`` constants.
    """
    # NOTE(review): _string_buffer and _check are defined elsewhere in the
    # class; presumably they build the native string argument and raise on a
    # non-success status respectively - confirm.
    encoder_buf = self._string_buffer(encoder)
    decoder_buf = self._string_buffer(decoder)

    # Windows goes through the "W" entry point, every other platform
    # through the "A" entry point.
    if sys.platform == "win32":
        open_model_file = dll.ailiaSpeechOpenModelFileW
    else:
        open_model_file = dll.ailiaSpeechOpenModelFileA

    status = open_model_file(self._instance, encoder_buf, decoder_buf, model_type)
    self._check(status)
|
|
451
|
+
|
|
452
|
+
def _open_vad(self, vad, vad_type):
    """Open the voice-activity-detection model file on the native instance.

    Args:
        vad: Path of the VAD model file.
        vad_type: One of the ``AILIA_SPEECH_VAD_TYPE_*`` constants.
    """
    # NOTE(review): _string_buffer and _check are defined elsewhere in the
    # class; presumably they build the native string argument and raise on a
    # non-success status respectively - confirm.
    vad_buf = self._string_buffer(vad)

    # Windows goes through the "W" entry point, every other platform
    # through the "A" entry point.
    if sys.platform == "win32":
        open_vad_file = dll.ailiaSpeechOpenVadFileW
    else:
        open_vad_file = dll.ailiaSpeechOpenVadFileA

    status = open_vad_file(self._instance, vad_buf, vad_type)
    self._check(status)
|
|
459
|
+
|
|
460
|
+
def transcribe(self, audio_waveform, sampling_rate, lang = None):
    """Transcribe a complete waveform and return the recognized segments.

    Args:
        audio_waveform: numpy array, either 1-D ``(samples,)`` or 2-D
            ``(channels, samples)``. It is converted to float32 before being
            handed to the native library.
        sampling_rate: Sampling rate of ``audio_waveform``, passed through to
            ``ailiaSpeechPushInputData``.
        lang: Optional language string passed to ``ailiaSpeechSetLanguage``.
            When ``None`` the call is skipped.
            NOTE(review): a language set by an earlier call presumably stays
            active on the instance - confirm against the native API.

    Returns:
        A list with one dict per recognized segment, with the keys ``text``,
        ``time_stamp_begin``, ``time_stamp_end``, ``person_id``, ``language``
        and ``confidence``.

    Raises:
        AiliaSpeechError: If ``audio_waveform`` is neither 1-D nor 2-D.
            Native call statuses are routed through ``self._check``, which is
            defined elsewhere (presumably raising on failure).
    """
    dimensions = len(audio_waveform.shape)
    if dimensions == 1:
        channels = 1
    elif dimensions == 2:
        channels = audio_waveform.shape[0]
        # (channels, samples) -> (samples, channels) -> flat, i.e. the samples
        # of all channels are interleaved frame by frame.
        audio_waveform = numpy.transpose(audio_waveform, (1, 0)).flatten()
    else:
        # The check is on array dimensionality, not on the channel count.
        raise AiliaSpeechError("audio_waveform must be a 1-D (samples) or 2-D (channels, samples) array", -1)

    audio_waveform = numpy.ascontiguousarray(audio_waveform.astype(numpy.float32))

    if lang is not None:
        self._check(dll.ailiaSpeechSetLanguage(self._instance, self._string_buffer(lang)))

    try:
        samples_per_channel = audio_waveform.shape[0] // channels
        self._check(dll.ailiaSpeechPushInputData(self._instance, audio_waveform, channels, samples_per_channel, sampling_rate))
        self._check(dll.ailiaSpeechFinalizeInputData(self._instance))
        self._check(dll.ailiaSpeechTranscribe(self._instance))

        count = ctypes.c_uint(0)
        self._check(dll.ailiaSpeechGetTextCount(self._instance, ctypes.byref(count)))

        results = []
        for index in range(count.value):
            text = AILIASpeechText()
            self._check(dll.ailiaSpeechGetText(self._instance, ctypes.byref(text), AILIA_SPEECH_TEXT_VERSION, index))
            results.append({
                "text" : text.text.decode(),
                "time_stamp_begin" : text.time_stamp_begin,
                "time_stamp_end" : text.time_stamp_end,
                "person_id" : text.person_id,
                "language" : text.language.decode(),
                "confidence" : text.confidence,
            })
    except Exception:
        # Best effort: reset the native state so a later call does not start
        # from whatever this failed call left behind. The status is ignored
        # on purpose so the original error is the one the caller sees.
        dll.ailiaSpeechResetTranscribeState(self._instance)
        raise

    self._check(dll.ailiaSpeechResetTranscribeState(self._instance))

    return results
|
|
489
|
+
|
|
490
|
+
def __del__(self):
    """Destroy the native ailia Speech instance owned by this object.

    Safe to run on a partially constructed object (no ``_instance``
    attribute yet) and safe to run more than once.
    """
    # __del__ also runs when __init__ raised before _instance was assigned.
    instance = getattr(self, "_instance", None)
    if instance:
        # Forget the handle first so a repeated call cannot destroy it twice.
        self._instance = None
        dll.ailiaSpeechDestroy(ctypes.cast(instance, ctypes.c_void_p))
|
|
493
|
+
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: ailia_speech
|
|
3
|
+
Version: 1.3.0.0
|
|
4
|
+
Summary: ailia AI Speech
|
|
5
|
+
Home-page: https://ailia.jp/
|
|
6
|
+
Author: ax Inc.
|
|
7
|
+
Author-email: contact@axinc.jp
|
|
8
|
+
License: https://ailia.ai/en/license/
|
|
9
|
+
Requires-Python: >3.6
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
Requires-Dist: ailia
|
|
12
|
+
Requires-Dist: ailia_tokenizer
|
|
13
|
+
|
|
14
|
+
# ailia AI Speech Python API
|
|
15
|
+
|
|
16
|
+
!! CAUTION !!
|
|
17
|
+
“ailia” IS NOT OPEN SOURCE SOFTWARE (OSS).
|
|
18
|
+
As long as the user complies with the conditions stated in the [License Document](https://ailia.ai/license/), the user may use the Software free of charge, but the Software is basically paid software.
|
|
19
|
+
|
|
20
|
+
## About ailia AI Speech
|
|
21
|
+
|
|
22
|
+
ailia AI Speech is a library for performing speech recognition using AI. It provides a C API for native applications, as well as a C# API well suited for Unity applications. Using ailia AI Speech, you can easily integrate AI-powered speech recognition into your applications.
|
|
23
|
+
|
|
24
|
+
## Install from pip
|
|
25
|
+
|
|
26
|
+
You can install the ailia SDK free evaluation package with the following command.
|
|
27
|
+
|
|
28
|
+
```
|
|
29
|
+
pip3 install ailia_speech
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Install from package
|
|
33
|
+
|
|
34
|
+
You can install the ailia SDK from the package with the following commands.
|
|
35
|
+
|
|
36
|
+
```
|
|
37
|
+
python3 bootstrap.py
|
|
38
|
+
pip3 install ./
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Usage
|
|
42
|
+
|
|
43
|
+
```python
|
|
44
|
+
import ailia
|
|
45
|
+
import ailia_speech
|
|
46
|
+
|
|
47
|
+
import librosa
|
|
48
|
+
|
|
49
|
+
import os
|
|
50
|
+
import urllib.request
|
|
51
|
+
|
|
52
|
+
# Load target audio
|
|
53
|
+
ref_file_path = "demo.wav"
|
|
54
|
+
if not os.path.exists(ref_file_path):
|
|
55
|
+
urllib.request.urlretrieve(
|
|
56
|
+
"https://github.com/axinc-ai/ailia-models/raw/refs/heads/master/audio_processing/whisper/demo.wav",
|
|
57
|
+
"demo.wav"
|
|
58
|
+
)
|
|
59
|
+
audio_waveform, sampling_rate = librosa.load(ref_file_path, mono=True)
|
|
60
|
+
|
|
61
|
+
# Infer
|
|
62
|
+
speech = ailia_speech.Whisper()
|
|
63
|
+
speech.initialize_model(model_path = "./models/", model_type = ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_SMALL)
|
|
64
|
+
recognized_text = speech.transcribe(audio_waveform, sampling_rate)
|
|
65
|
+
print(recognized_text)
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## API specification
|
|
69
|
+
|
|
70
|
+
https://github.com/axinc-ai/ailia-sdk
|
|
71
|
+
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
setup.py
|
|
3
|
+
ailia_speech/LICENSE_AILIA_EN.pdf
|
|
4
|
+
ailia_speech/LICENSE_AILIA_JA.pdf
|
|
5
|
+
ailia_speech/__init__.py
|
|
6
|
+
ailia_speech.egg-info/PKG-INFO
|
|
7
|
+
ailia_speech.egg-info/SOURCES.txt
|
|
8
|
+
ailia_speech.egg-info/dependency_links.txt
|
|
9
|
+
ailia_speech.egg-info/requires.txt
|
|
10
|
+
ailia_speech.egg-info/top_level.txt
|
|
11
|
+
ailia_speech/linux/arm64-v8a/libailia_speech.so
|
|
12
|
+
ailia_speech/linux/x64/libailia_speech.so
|
|
13
|
+
ailia_speech/mac/libailia_speech.dylib
|
|
14
|
+
ailia_speech/windows/x64/ailia_speech.dll
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
ailia_speech
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import sys
|
|
3
|
+
import platform
|
|
4
|
+
import glob
|
|
5
|
+
import shutil
|
|
6
|
+
import platform
|
|
7
|
+
|
|
8
|
+
from setuptools import setup, Extension
|
|
9
|
+
from setuptools import find_packages
|
|
10
|
+
|
|
11
|
+
# The README doubles as the long description shown on the package index.
with open("README.md", encoding="utf-8") as readme_file:
    long_description = readme_file.read()

# Every Python file directly under ailia_speech/ is handed to setup(scripts=...).
scripts = list(glob.glob("ailia_speech/*.py"))
|
|
17
|
+
|
|
18
|
+
def find_libraries():
    """Collect the ``package_data`` entries for the ``ailia_speech`` package.

    Looks for the native libraries of every supported platform under
    ``./ailia_speech/<platform dir>/`` (relative to the current working
    directory) and appends the bundled license files.

    Returns:
        A list of paths relative to the package directory (``./...``, always
        with forward slashes): the libraries found, platform by platform,
        followed by the license files.
    """
    # (directory below ailia_speech/, library file suffix), in the order
    # windows, mac, linux armv7l, linux aarch64, linux x86_64.
    library_locations = [
        ("windows/x64", ".dll"),
        ("mac", ".dylib"),
        ("linux/armeabi-v7a", ".so"),
        ("linux/arm64-v8a", ".so"),
        ("linux/x64", ".so"),
    ]

    dll_names = []
    for dll_platform, dll_type in library_locations:
        dll_path = "./ailia_speech/" + dll_platform + "/"
        # glob order is unspecified; sort so the result is reproducible.
        for library in sorted(glob.glob(dll_path + "*" + dll_type)):
            # Normalize Windows separators, then make the path relative to
            # the package directory as package_data expects.
            library = library.replace("\\", "/")
            dll_names.append(library.replace("./ailia_speech/", "./"))

    dll_names.extend([
        "./LICENSE_AILIA_EN.pdf",
        "./LICENSE_AILIA_JA.pdf",
        "./oss/LICENSE_SILERO_VAD.txt",
        "./oss/LICENSE_SRELL.txt",
        "./oss/LICENSE_WHISPER.txt",
    ])

    return dll_names
|
|
52
|
+
|
|
53
|
+
if __name__ == "__main__":
    # Gather the distribution metadata first, then hand it to setuptools.
    setup_options = dict(
        name="ailia_speech",
        scripts=scripts,
        version="1.3.0.0",
        install_requires=[
            "ailia",
            "ailia_tokenizer",
        ],
        description="ailia AI Speech",
        long_description=long_description,
        long_description_content_type="text/markdown",
        author="ax Inc.",
        author_email="contact@axinc.jp",
        url="https://ailia.jp/",
        license="https://ailia.ai/en/license/",
        packages=find_packages(),
        # Native libraries and license files shipped inside the package.
        package_data={"ailia_speech": find_libraries()},
        include_package_data=True,
        python_requires=">3.6",
    )
    setup(**setup_options)
|