ailia-speech 1.3.0.0 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ailia-speech might be problematic.
- ailia_speech/LICENSE_AILIA_EN.pdf +0 -0
- ailia_speech/LICENSE_AILIA_JA.pdf +0 -0
- ailia_speech/__init__.py +493 -0
- ailia_speech/linux/arm64-v8a/libailia_speech.so +0 -0
- ailia_speech/linux/x64/libailia_speech.so +0 -0
- ailia_speech/mac/libailia_speech.dylib +0 -0
- ailia_speech/windows/x64/ailia_speech.dll +0 -0
- ailia_speech-1.3.0.0.data/scripts/__init__.py +493 -0
- ailia_speech-1.3.0.0.dist-info/METADATA +71 -0
- ailia_speech-1.3.0.0.dist-info/RECORD +12 -0
- ailia_speech-1.3.0.0.dist-info/WHEEL +5 -0
- ailia_speech-1.3.0.0.dist-info/top_level.txt +1 -0
ailia_speech/LICENSE_AILIA_EN.pdf
Binary file
ailia_speech/LICENSE_AILIA_JA.pdf
Binary file
ailia_speech/__init__.py
ADDED
@@ -0,0 +1,493 @@
import ctypes
import os
import sys

import numpy
import ailia
import ailia.audio
import ailia_tokenizer

import urllib.request
import ssl
import shutil
import platform

#### dependency check
if sys.platform == "win32":
    import ctypes
    try:
        for library in ["vcruntime140.dll", "vcruntime140_1.dll", "msvcp140.dll"]:
            ctypes.windll.LoadLibrary(library)
    except:
        print(" WARNING Please install MSVC 2015-2019 runtime from https://docs.microsoft.com/ja-jp/cpp/windows/latest-supported-vc-redist")


#### loading DLL / DYLIB / SO ####
if sys.platform == "win32":
    dll_platform = "windows/x64"
    dll_name = "ailia_speech.dll"
    load_fn = ctypes.WinDLL
elif sys.platform == "darwin":
    dll_platform = "mac"
    dll_name = "libailia_speech.dylib"
    load_fn = ctypes.CDLL
else:
    is_arm = "arm" in platform.machine() or platform.machine() == "aarch64"
    if is_arm:
        if platform.architecture()[0] == "32bit":
            dll_platform = "linux/armeabi-v7a"
        else:
            dll_platform = "linux/arm64-v8a"
    else:
        dll_platform = "linux/x64"
    dll_name = "libailia_speech.so"
    load_fn = ctypes.CDLL

dll_found = False
candidate = ["", str(os.path.dirname(os.path.abspath(__file__))) + str(os.sep), str(os.path.dirname(os.path.abspath(__file__))) + str(os.sep) + dll_platform + str(os.sep)]
for dir in candidate:
    try:
        dll = load_fn(dir + dll_name)
        dll_found = True
    except:
        pass
if not dll_found:
    msg = "DLL load failed : '" + dll_name + "' is not found"
    raise ImportError(msg)

# ==============================================================================

from ctypes import *

AILIA_SPEECH_STATUS_SUCCESS = (0)

AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY = (0)
AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_BASE = (1)
AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_SMALL = (2)
AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_MEDIUM = (3)
AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE = (4)
AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3 = (5)

AILIA_SPEECH_TASK_TRANSCRIBE = (0)
AILIA_SPEECH_TASK_TRANSLATE = (1)

AILIA_SPEECH_FLAG_NONE = (0)
AILIA_SPEECH_FLAG_LIVE = (1)

AILIA_SPEECH_VAD_TYPE_SILERO = (0)
AILIA_SPEECH_API_CALLBACK_VERSION = (6)

AILIA_SPEECH_TEXT_VERSION = (2)

AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_FRAME_LEN = CFUNCTYPE(POINTER(c_int), c_int, c_int, c_int, c_int)
AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_MEL_SPECTROGRAM = CFUNCTYPE((c_int), c_void_p, c_void_p, c_int, c_int, c_int, c_int, c_int, c_int, c_int, c_int, c_float, c_int, c_float, c_float, c_int, c_int, c_int)

AILIA_SPEECH_USER_API_AILIA_TOKENIZER_CREATE = CFUNCTYPE((c_int), POINTER(c_void_p), c_int, c_int)
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_OPEN_MODEL_FILE_A = CFUNCTYPE((c_int), c_void_p, c_char_p)
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_OPEN_MODEL_FILE_W = CFUNCTYPE((c_int), c_void_p, c_wchar)
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_ENCODE = CFUNCTYPE((c_int), c_void_p, c_char_p)
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TOKEN_COUNT = CFUNCTYPE((c_int), c_void_p, POINTER(c_uint))
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TOKENS = CFUNCTYPE((c_int), c_void_p, POINTER(c_int), c_uint)
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_DECODE = CFUNCTYPE((c_int), c_void_p, POINTER(c_int), c_uint)
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TEXT_LENGTH = CFUNCTYPE((c_int), c_void_p, POINTER(c_uint))
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TEXT = CFUNCTYPE((c_int), c_void_p, c_char_p, c_uint)
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_DESTROY = CFUNCTYPE((c_int), c_void_p)
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_UTF8_TO_UTF32 = CFUNCTYPE((c_int), POINTER(c_uint), POINTER(c_uint), c_char_p, c_uint)
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_UTF32_TO_UTF8 = CFUNCTYPE((c_int), c_char_p, POINTER(c_uint), c_uint)

AILIA_SPEECH_USER_API_AILIA_AUDIO_RESAMPLE = CFUNCTYPE((c_int), POINTER(c_float), POINTER(c_float), c_int, c_int, c_int, c_int)
AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_RESAMPLE_LEN = CFUNCTYPE((c_int), POINTER(c_int), c_int, c_int, c_int)
AILIA_SPEECH_USER_API_AILIA_CREATE = CFUNCTYPE((c_int), POINTER(c_void_p), c_int, c_int)
AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_FILE_A = CFUNCTYPE((c_int), c_void_p, c_char_p)
AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_FILE_W = CFUNCTYPE((c_int), c_void_p, POINTER(c_wchar))
AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_MEM = CFUNCTYPE((c_int), c_void_p, POINTER(c_byte), c_uint)
AILIA_SPEECH_USER_API_AILIA_SET_MEMORY_MODE = CFUNCTYPE((c_int), c_void_p, c_uint)
AILIA_SPEECH_USER_API_AILIA_DESTROY = CFUNCTYPE((None), c_void_p)
AILIA_SPEECH_USER_API_AILIA_UPDATE = CFUNCTYPE((c_int), c_void_p)
AILIA_SPEECH_USER_API_AILIA_GET_BLOB_INDEX_BY_INPUT_INDEX = CFUNCTYPE((c_int), c_void_p, POINTER(c_uint), c_uint)
AILIA_SPEECH_USER_API_AILIA_GET_BLOB_INDEX_BY_OUTPUT_INDEX = CFUNCTYPE((c_int), c_void_p, POINTER(c_uint), c_uint)
AILIA_SPEECH_USER_API_AILIA_GET_BLOB_DATA = CFUNCTYPE((c_int), c_void_p, POINTER(c_float), c_uint, c_uint)
AILIA_SPEECH_USER_API_AILIA_SET_INPUT_BLOB_DATA = CFUNCTYPE((c_int), c_void_p, POINTER(c_float), c_uint, c_uint)
AILIA_SPEECH_USER_API_AILIA_SET_INPUT_BLOB_SHAPE = CFUNCTYPE((c_int), c_void_p, c_void_p, c_uint, c_uint)
AILIA_SPEECH_USER_API_AILIA_GET_BLOB_SHAPE = CFUNCTYPE((c_int), c_void_p, c_void_p, c_uint, c_uint)
AILIA_SPEECH_USER_API_AILIA_GET_ERROR_DETAIL = CFUNCTYPE((c_char_p), c_void_p)

AILIA_SPEECH_USER_API_AILIA_COPY_BLOB_DATA = CFUNCTYPE((c_int), c_void_p, c_uint, c_void_p, c_uint)
AILIA_SPEECH_USER_API_AILIA_GET_ENVIRONMENT = CFUNCTYPE((c_int), POINTER(c_void_p), c_uint, c_uint)


class struct__AILIASpeechApiCallback(Structure):
    pass

struct__AILIASpeechApiCallback.__slots__ = [
    'ailiaAudioGetFrameLen',
    'ailiaAudioGetMelSpectrogram',
    'ailiaAudioResample',
    'ailiaAudioGetResampleLen',

    'ailiaTokenizerCreate',
    'ailiaTokenizerOpenModelFileA',
    'ailiaTokenizerOpenModelFileW',
    'ailiaTokenizerEncode',
    'ailiaTokenizerGetTokenCount',
    'ailiaTokenizerGetTokens',
    'ailiaTokenizerDecode',
    'ailiaTokenizerGetTextLength',
    'ailiaTokenizerGetText',
    'ailiaTokenizerDestroy',
    'ailiaTokenizerUtf8ToUtf32',
    'ailiaTokenizerUtf32ToUtf8',

    'ailiaCreate',
    'ailiaOpenWeightFileA',
    'ailiaOpenWeightFileW',
    'ailiaOpenWeightMem',
    'ailiaSetMemoryMode',
    'ailiaDestroy',
    'ailiaUpdate',
    'ailiaGetBlobIndexByInputIndex',
    'ailiaGetBlobIndexByOutputIndex',
    'ailiaGetBlobData',
    'ailiaSetInputBlobData',
    'ailiaSetInputBlobShape',
    'ailiaGetBlobShape',
    'ailiaGetErrorDetail',
    'ailiaCopyBlobData',
    'ailiaGetEnvironment',
]
struct__AILIASpeechApiCallback._fields_ = [
    ('ailiaAudioGetFrameLen', AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_FRAME_LEN),
    ('ailiaAudioGetMelSpectrogram', AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_MEL_SPECTROGRAM),
    ('ailiaAudioResample', AILIA_SPEECH_USER_API_AILIA_AUDIO_RESAMPLE),
    ('ailiaAudioGetResampleLen', AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_RESAMPLE_LEN),

    ('ailiaTokenizerCreate', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_CREATE),
    ('ailiaTokenizerOpenModelFileA', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_OPEN_MODEL_FILE_A),
    ('ailiaTokenizerOpenModelFileW', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_OPEN_MODEL_FILE_W),
    ('ailiaTokenizerEncode', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_ENCODE),
    ('ailiaTokenizerGetTokenCount', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TOKEN_COUNT),
    ('ailiaTokenizerGetTokens', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TOKENS),
    ('ailiaTokenizerDecode', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_DECODE),
    ('ailiaTokenizerGetTextLength', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TEXT_LENGTH),
    ('ailiaTokenizerGetText', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TEXT),
    ('ailiaTokenizerDestroy', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_DESTROY),
    ('ailiaTokenizerUtf8ToUtf32', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_UTF8_TO_UTF32),
    ('ailiaTokenizerUtf32ToUtf8', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_UTF32_TO_UTF8),

    ('ailiaCreate', AILIA_SPEECH_USER_API_AILIA_CREATE),
    ('ailiaOpenWeightFileA', AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_FILE_A),
    ('ailiaOpenWeightFileW', AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_FILE_W),
    ('ailiaOpenWeightMem', AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_MEM),
    ('ailiaSetMemoryMode', AILIA_SPEECH_USER_API_AILIA_SET_MEMORY_MODE),
    ('ailiaDestroy', AILIA_SPEECH_USER_API_AILIA_DESTROY),
    ('ailiaUpdate', AILIA_SPEECH_USER_API_AILIA_UPDATE),
    ('ailiaGetBlobIndexByInputIndex', AILIA_SPEECH_USER_API_AILIA_GET_BLOB_INDEX_BY_INPUT_INDEX),
    ('ailiaGetBlobIndexByOutputIndex', AILIA_SPEECH_USER_API_AILIA_GET_BLOB_INDEX_BY_OUTPUT_INDEX),
    ('ailiaGetBlobData', AILIA_SPEECH_USER_API_AILIA_GET_BLOB_DATA),
    ('ailiaSetInputBlobData', AILIA_SPEECH_USER_API_AILIA_SET_INPUT_BLOB_DATA),
    ('ailiaSetInputBlobShape', AILIA_SPEECH_USER_API_AILIA_SET_INPUT_BLOB_SHAPE),
    ('ailiaGetBlobShape', AILIA_SPEECH_USER_API_AILIA_GET_BLOB_SHAPE),
    ('ailiaGetErrorDetail', AILIA_SPEECH_USER_API_AILIA_GET_ERROR_DETAIL),
    ('ailiaCopyBlobData', AILIA_SPEECH_USER_API_AILIA_COPY_BLOB_DATA),
    ('ailiaGetEnvironment', AILIA_SPEECH_USER_API_AILIA_GET_ENVIRONMENT),
]

AILIASpeechApiCallback = struct__AILIASpeechApiCallback

# ==============================================================================

dll.ailiaSpeechCreate.restype = c_int
dll.ailiaSpeechCreate.argtypes = (POINTER(c_void_p), c_int32, c_int32, c_int32, c_int32, c_int32, AILIASpeechApiCallback, c_int32)

dll.ailiaSpeechDestroy.restype = None
dll.ailiaSpeechDestroy.argtypes = (c_void_p, )

dll.ailiaSpeechOpenModelFileA.restype = c_int
dll.ailiaSpeechOpenModelFileA.argtypes = (c_void_p, c_char_p, c_char_p, c_int32)

dll.ailiaSpeechOpenModelFileW.restype = c_int
dll.ailiaSpeechOpenModelFileW.argtypes = (c_void_p, c_wchar_p, c_wchar_p, c_int32)

dll.ailiaSpeechOpenVadFileA.restype = c_int
dll.ailiaSpeechOpenVadFileA.argtypes = (c_void_p, c_char_p, c_int32)

dll.ailiaSpeechOpenVadFileW.restype = c_int
dll.ailiaSpeechOpenVadFileW.argtypes = (c_void_p, c_wchar_p, c_int32)

dll.ailiaSpeechPushInputData.restype = c_int
dll.ailiaSpeechPushInputData.argtypes = (c_void_p, numpy.ctypeslib.ndpointer(
        dtype=numpy.float32, flags='CONTIGUOUS'
    ),  # src
    ctypes.c_uint,
    ctypes.c_uint,
    ctypes.c_uint)

dll.ailiaSpeechFinalizeInputData.restype = c_int
dll.ailiaSpeechFinalizeInputData.argtypes = (c_void_p, )

dll.ailiaSpeechBuffered.restype = c_int
dll.ailiaSpeechBuffered.argtypes = (c_void_p, POINTER(ctypes.c_uint))

dll.ailiaSpeechComplete.restype = c_int
dll.ailiaSpeechComplete.argtypes = (c_void_p, POINTER(ctypes.c_uint))

dll.ailiaSpeechTranscribe.restype = c_int
dll.ailiaSpeechTranscribe.argtypes = (c_void_p, )

dll.ailiaSpeechGetTextCount.restype = c_int
dll.ailiaSpeechGetTextCount.argtypes = (c_void_p, POINTER(ctypes.c_uint))

class AILIASpeechText(ctypes.Structure):
    _fields_ = [
        ("text", ctypes.c_char_p),
        ("time_stamp_begin", ctypes.c_float),
        ("time_stamp_end", ctypes.c_float),
        ("person_id", ctypes.c_uint),
        ("language", ctypes.c_char_p),
        ("confidence", ctypes.c_float)]

dll.ailiaSpeechGetText.restype = c_int
dll.ailiaSpeechGetText.argtypes = (c_void_p, POINTER(AILIASpeechText), ctypes.c_uint, ctypes.c_uint)

dll.ailiaSpeechResetTranscribeState.restype = c_int
dll.ailiaSpeechResetTranscribeState.argtypes = (c_void_p, )

AILIA_SPEECH_USER_API_INTERMEDIATE_CALLBACK = CFUNCTYPE((c_int), c_int64, c_char_p)

dll.ailiaSpeechSetIntermediateCallback.restype = c_int
dll.ailiaSpeechSetIntermediateCallback.argtypes = (c_void_p, AILIA_SPEECH_USER_API_INTERMEDIATE_CALLBACK, c_int64)

dll.ailiaSpeechSetLanguage.restype = c_int
dll.ailiaSpeechSetLanguage.argtypes = (c_void_p, c_char_p)

# ==============================================================================
# model download
# ==============================================================================

def progress_print(block_count, block_size, total_size):
    percentage = 100.0 * block_count * block_size / total_size
    if percentage > 100:
        # Bigger than 100 does not look good, so...
        percentage = 100
    max_bar = 50
    bar_num = int(percentage / (100 / max_bar))
    progress_element = '=' * bar_num
    if bar_num != max_bar:
        progress_element += '>'
    bar_fill = ' '  # fill the blanks
    bar = progress_element.ljust(max_bar, bar_fill)
    total_size_kb = total_size / 1024
    print(f'[{bar} {percentage:.2f}% ( {total_size_kb:.0f}KB )]', end='\r')

def urlretrieve(remote_path, weight_path, progress_print):
    temp_path = weight_path + ".tmp"
    try:
        #raise ssl.SSLError # test
        urllib.request.urlretrieve(
            remote_path,
            temp_path,
            progress_print,
        )
    except ssl.SSLError as e:
        print(f'SSLError detected, so try to download without ssl')
        remote_path = remote_path.replace("https","http")
        urllib.request.urlretrieve(
            remote_path,
            temp_path,
            progress_print,
        )
    shutil.move(temp_path, weight_path)

def check_and_download_file(file_path, remote_path):
    if not os.path.exists(file_path):
        print('Downloading %s...' % file_path)
        urlretrieve(remote_path + os.path.basename(file_path), file_path, progress_print)

# ==============================================================================
# base model class
# ==============================================================================

class AiliaSpeechError(RuntimeError):
    def __init__(self, message, code):
        super().__init__(f"{message} code:{code}")
        self.code = code

class AiliaSpeechModel:
    _api_callback = None
    _instance = None

    def _check(self, status):
        if status != AILIA_SPEECH_STATUS_SUCCESS:
            raise AiliaSpeechError(f"ailia speech error", status)

    def _string_buffer(self, path):
        if sys.platform == "win32":
            return ctypes.create_unicode_buffer(path)
        else:
            return ctypes.create_string_buffer(path.encode("utf-8"))

    def _create_callback(self):
        callback = AILIASpeechApiCallback()
        callback.ailiaAudioGetFrameLen = AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_FRAME_LEN(("ailiaAudioGetFrameLen", ailia.audio.audio_core.dll))
        callback.ailiaAudioGetMelSpectrogram = AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_MEL_SPECTROGRAM(("ailiaAudioGetMelSpectrogram", ailia.audio.audio_core.dll))
        callback.ailiaAudioResample = AILIA_SPEECH_USER_API_AILIA_AUDIO_RESAMPLE(("ailiaAudioResample", ailia.audio.audio_core.dll))
        callback.ailiaAudioGetResampleLen = AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_RESAMPLE_LEN(("ailiaAudioGetResampleLen", ailia.audio.audio_core.dll))

        callback.ailiaTokenizerCreate = AILIA_SPEECH_USER_API_AILIA_TOKENIZER_CREATE(("ailiaTokenizerCreate", ailia_tokenizer.dll))
        callback.ailiaTokenizerOpenModelFileA = AILIA_SPEECH_USER_API_AILIA_TOKENIZER_OPEN_MODEL_FILE_A(("ailiaTokenizerOpenModelFileA", ailia_tokenizer.dll))
        callback.ailiaTokenizerOpenModelFileW = AILIA_SPEECH_USER_API_AILIA_TOKENIZER_OPEN_MODEL_FILE_W(("ailiaTokenizerOpenModelFileW", ailia_tokenizer.dll))
        callback.ailiaTokenizerEncode = AILIA_SPEECH_USER_API_AILIA_TOKENIZER_ENCODE(("ailiaTokenizerEncode", ailia_tokenizer.dll))
        callback.ailiaTokenizerGetTokenCount = AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TOKEN_COUNT(("ailiaTokenizerGetTokenCount", ailia_tokenizer.dll))
        callback.ailiaTokenizerGetTokens = AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TOKENS(("ailiaTokenizerGetTokens", ailia_tokenizer.dll))
        callback.ailiaTokenizerDecode = AILIA_SPEECH_USER_API_AILIA_TOKENIZER_DECODE(("ailiaTokenizerDecode", ailia_tokenizer.dll))
        callback.ailiaTokenizerGetTextLength = AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TEXT_LENGTH(("ailiaTokenizerGetTextLength", ailia_tokenizer.dll))
        callback.ailiaTokenizerGetText = AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TEXT(("ailiaTokenizerGetText", ailia_tokenizer.dll))
        callback.ailiaTokenizerDestroy = AILIA_SPEECH_USER_API_AILIA_TOKENIZER_DESTROY(("ailiaTokenizerDestroy", ailia_tokenizer.dll))
        callback.ailiaTokenizerUtf8ToUtf32 = AILIA_SPEECH_USER_API_AILIA_TOKENIZER_UTF8_TO_UTF32(("ailiaTokenizerUtf8ToUtf32", ailia_tokenizer.dll))
        callback.ailiaTokenizerUtf32ToUtf8 = AILIA_SPEECH_USER_API_AILIA_TOKENIZER_UTF32_TO_UTF8(("ailiaTokenizerUtf32ToUtf8", ailia_tokenizer.dll))

        callback.ailiaCreate = AILIA_SPEECH_USER_API_AILIA_CREATE(("ailiaCreate", ailia.core.dll))
        callback.ailiaOpenWeightFileA = AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_FILE_A(("ailiaOpenWeightFileA", ailia.core.dll))
        callback.ailiaOpenWeightFileW = AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_FILE_W(("ailiaOpenWeightFileW", ailia.core.dll))
        callback.ailiaOpenWeightMem = AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_MEM(("ailiaOpenWeightMem", ailia.core.dll))
        callback.ailiaSetMemoryMode = AILIA_SPEECH_USER_API_AILIA_SET_MEMORY_MODE(("ailiaSetMemoryMode", ailia.core.dll))
        callback.ailiaDestroy = AILIA_SPEECH_USER_API_AILIA_DESTROY(("ailiaDestroy", ailia.core.dll))
        callback.ailiaUpdate = AILIA_SPEECH_USER_API_AILIA_UPDATE(("ailiaUpdate", ailia.core.dll))
        callback.ailiaGetBlobIndexByInputIndex = AILIA_SPEECH_USER_API_AILIA_GET_BLOB_INDEX_BY_INPUT_INDEX(("ailiaGetBlobIndexByInputIndex", ailia.core.dll))
        callback.ailiaGetBlobIndexByOutputIndex = AILIA_SPEECH_USER_API_AILIA_GET_BLOB_INDEX_BY_OUTPUT_INDEX(("ailiaGetBlobIndexByOutputIndex", ailia.core.dll))
        callback.ailiaGetBlobData = AILIA_SPEECH_USER_API_AILIA_GET_BLOB_DATA(("ailiaGetBlobData", ailia.core.dll))
        callback.ailiaSetInputBlobData = AILIA_SPEECH_USER_API_AILIA_SET_INPUT_BLOB_DATA(("ailiaSetInputBlobData", ailia.core.dll))
        callback.ailiaSetInputBlobShape = AILIA_SPEECH_USER_API_AILIA_SET_INPUT_BLOB_SHAPE(("ailiaSetInputBlobShape", ailia.core.dll))
        callback.ailiaGetBlobShape = AILIA_SPEECH_USER_API_AILIA_GET_BLOB_SHAPE(("ailiaGetBlobShape", ailia.core.dll))
        callback.ailiaGetErrorDetail = AILIA_SPEECH_USER_API_AILIA_GET_ERROR_DETAIL(("ailiaGetErrorDetail", ailia.core.dll))
        callback.ailiaCopyBlobData = AILIA_SPEECH_USER_API_AILIA_COPY_BLOB_DATA(("ailiaCopyBlobData", ailia.core.dll))
        callback.ailiaGetEnvironment = AILIA_SPEECH_USER_API_AILIA_GET_ENVIRONMENT(("ailiaGetEnvironment", ailia.core.dll))

        self._api_callback = callback  # prevent GC

# ==============================================================================
# Public class
# ==============================================================================

intermediate_callback_cnt = 0
intermediate_callback_map = {}

def intermediate_callback(handle, text):
    intermediate_callback_map[handle](text.decode())
    return 0

class Whisper(AiliaSpeechModel):
    _c_callback = None

    def __init__(self, env_id = -1, num_thread = 0, memory_mode = 11, task = AILIA_SPEECH_TASK_TRANSCRIBE, flags = AILIA_SPEECH_FLAG_NONE, callback = None):
        self._instance = ctypes.c_void_p(None)
        self._create_callback()
        self._check(dll.ailiaSpeechCreate(cast(pointer(self._instance), POINTER(c_void_p)), ctypes.c_int32(env_id), ctypes.c_int32(num_thread), ctypes.c_int32(memory_mode), ctypes.c_int32(task), ctypes.c_int32(flags), self._api_callback, ctypes.c_int32(AILIA_SPEECH_API_CALLBACK_VERSION)))
        if callback is not None:
            self._c_callback = AILIA_SPEECH_USER_API_INTERMEDIATE_CALLBACK(intermediate_callback)
            global intermediate_callback_cnt
            global intermediate_callback_map
            intermediate_callback_map[intermediate_callback_cnt] = callback
            self._check(dll.ailiaSpeechSetIntermediateCallback(self._instance, self._c_callback, intermediate_callback_cnt))
            intermediate_callback_cnt = intermediate_callback_cnt + 1

    def initialize_model(self, model_path = "./", model_type = AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY):
        if model_type == AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY:
            encoder_path = "encoder_tiny.opt.onnx"
            decoder_path = "decoder_tiny_fix_kv_cache.opt2.onnx"
            encoder_pb_path = None
            decoder_pb_path = None
        elif model_type == AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_BASE:
            encoder_path = "encoder_base.opt.onnx"
            decoder_path = "decoder_base_fix_kv_cache.opt2.onnx"
            encoder_pb_path = None
            decoder_pb_path = None
        elif model_type == AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_SMALL:
            encoder_path = "encoder_small.opt.onnx"
            decoder_path = "decoder_small_fix_kv_cache.opt2.onnx"
            encoder_pb_path = None
            decoder_pb_path = None
        elif model_type == AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_MEDIUM:
            encoder_path = "encoder_medium.opt.onnx"
            decoder_path = "decoder_medium_fix_kv_cache.opt2.onnx"
            encoder_pb_path = None
            decoder_pb_path = None
        elif model_type == AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE:
            encoder_path = "encoder_large.onnx"
            decoder_path = "decoder_large_fix_kv_cache.onnx"
            encoder_pb_path = "encoder_large_weights.pb"
            decoder_pb_path = "decoder_large_fix_kv_cache_weights.pb"
        elif model_type == AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3:
            encoder_path = "encoder_large_v3.onnx"
            decoder_path = "decoder_large_v3_fix_kv_cache.onnx"
            encoder_pb_path = "encoder_large_v3_weights.pb"
            decoder_pb_path = "decoder_large_v3_fix_kv_cache_weights.pb"
        self._download_model(model_path, encoder_path, decoder_path, encoder_pb_path, decoder_pb_path)
        self._open_model(model_path + encoder_path, model_path + decoder_path, model_type)
        self._open_vad(model_path + "silero_vad.onnx", AILIA_SPEECH_VAD_TYPE_SILERO)

    def _download_model(self, model_path, encoder_path, decoder_path, encoder_pb_path, decoder_pb_path):
        REMOTE_PATH = "https://storage.googleapis.com/ailia-models/whisper/"
        os.makedirs(model_path, exist_ok = True)
        check_and_download_file(model_path + encoder_path, REMOTE_PATH)
        check_and_download_file(model_path + decoder_path, REMOTE_PATH)
        if encoder_pb_path is not None:
            check_and_download_file(model_path + encoder_pb_path, REMOTE_PATH)
        if decoder_pb_path is not None:
            check_and_download_file(model_path + decoder_pb_path, REMOTE_PATH)

        REMOTE_PATH = "https://storage.googleapis.com/ailia-models/silero-vad/"
        check_and_download_file(model_path + "silero_vad.onnx", REMOTE_PATH)

    def _open_model(self, encoder, decoder, model_type):
        p1 = self._string_buffer(encoder)
        p2 = self._string_buffer(decoder)

        if sys.platform == "win32":
            self._check(dll.ailiaSpeechOpenModelFileW(self._instance, p1, p2, model_type))
        else:
            self._check(dll.ailiaSpeechOpenModelFileA(self._instance, p1, p2, model_type))

    def _open_vad(self, vad, vad_type):
        p1 = self._string_buffer(vad)

        if sys.platform == "win32":
            self._check(dll.ailiaSpeechOpenVadFileW(self._instance, p1, vad_type))
        else:
            self._check(dll.ailiaSpeechOpenVadFileA(self._instance, p1, vad_type))

    def transcribe(self, audio_waveform, sampling_rate, lang = None):
        if len(audio_waveform.shape) == 1:
            channels = 1
        elif len(audio_waveform.shape) == 2:
            channels = audio_waveform.shape[0]
            audio_waveform = numpy.transpose(audio_waveform, (1, 0)).flatten()
        else:
            raise AiliaSpeechError(f"audio_waveform must be 1 channel or 2 channel", -1)

        audio_waveform = numpy.ascontiguousarray(audio_waveform.astype(numpy.float32))

        if lang is not None:
            self._check(dll.ailiaSpeechSetLanguage(self._instance, self._string_buffer(lang)))

        self._check(dll.ailiaSpeechPushInputData(self._instance, audio_waveform, channels, audio_waveform.shape[0] // channels, sampling_rate))
        self._check(dll.ailiaSpeechFinalizeInputData(self._instance))
        self._check(dll.ailiaSpeechTranscribe(self._instance))

        count = ctypes.c_uint(0)
        self._check(dll.ailiaSpeechGetTextCount(self._instance, ctypes.byref(count)))
        results = []
        for i in range(count.value):
            text = AILIASpeechText()
            self._check(dll.ailiaSpeechGetText(self._instance, ctypes.byref(text), AILIA_SPEECH_TEXT_VERSION, i))
            results.append({"text" : text.text.decode(), "time_stamp_begin" : text.time_stamp_begin, "time_stamp_end" : text.time_stamp_end, "person_id" : text.person_id, "language" : text.language.decode(), "confidence" : text.confidence})

        self._check(dll.ailiaSpeechResetTranscribeState(self._instance))

        return results

    def __del__(self):
        if self._instance:
            dll.ailiaSpeechDestroy(cast(self._instance, c_void_p))
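The module above also wires Python functions into the native intermediate-text callback (via ailiaSpeechSetIntermediateCallback) and lets the recognition language be pinned through the optional `lang` argument of `transcribe()`. A minimal sketch of both, not part of the package itself: "speech_sample.wav" and "./models/" are placeholder paths, librosa is only assumed as a convenient loader, and "en" is assumed to be one of the language codes accepted by ailiaSpeechSetLanguage.

```python
import librosa
import ailia_speech

# Called with partial recognition text (already decoded to str by the
# module's intermediate_callback wrapper) while transcription is running.
def on_intermediate(text):
    print("intermediate:", text)

speech = ailia_speech.Whisper(task=ailia_speech.AILIA_SPEECH_TASK_TRANSCRIBE,
                              callback=on_intermediate)
speech.initialize_model(model_path="./models/",
                        model_type=ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY)

# Placeholder input file; mono or stereo both work because transcribe()
# interleaves 2-D (channels, samples) input itself before pushing it.
audio_waveform, sampling_rate = librosa.load("speech_sample.wav", mono=True)
for segment in speech.transcribe(audio_waveform, sampling_rate, lang="en"):
    print(segment["time_stamp_begin"], segment["time_stamp_end"], segment["text"])
```

Each returned segment is a dict with the keys built in `transcribe()`: text, time_stamp_begin, time_stamp_end, person_id, language, and confidence.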
ailia_speech/linux/arm64-v8a/libailia_speech.so
Binary file
ailia_speech/linux/x64/libailia_speech.so
Binary file
ailia_speech/mac/libailia_speech.dylib
Binary file
ailia_speech/windows/x64/ailia_speech.dll
Binary file
ailia_speech-1.3.0.0.data/scripts/__init__.py
ADDED
@@ -0,0 +1,493 @@
(Content identical to ailia_speech/__init__.py shown above; RECORD lists the same sha256 for both files.)
ailia_speech-1.3.0.0.dist-info/METADATA
ADDED
@@ -0,0 +1,71 @@
Metadata-Version: 2.1
Name: ailia_speech
Version: 1.3.0.0
Summary: ailia AI Speech
Home-page: https://ailia.jp/
Author: ax Inc.
Author-email: contact@axinc.jp
License: https://ailia.ai/en/license/
Requires-Python: >3.6
Description-Content-Type: text/markdown
Requires-Dist: ailia
Requires-Dist: ailia-tokenizer

# ailia AI Speech Python API

!! CAUTION !!
“ailia” IS NOT OPEN SOURCE SOFTWARE (OSS).
As long as the user complies with the conditions stated in the [License Document](https://ailia.ai/license/), the Software may be used free of charge, but it is fundamentally paid software.

## About ailia AI Speech

ailia Speech is a library for performing speech recognition using AI. It provides a C API for native applications, as well as a C# API well suited for Unity applications. Using ailia Speech, you can easily integrate AI-powered speech recognition into your applications.

## Install from pip

You can install the ailia SDK free evaluation package with the following command.

```
pip3 install ailia_speech
```

## Install from package

You can install the ailia SDK from the package with the following command.

```
python3 bootstrap.py
pip3 install ./
```

## Usage

```python
import ailia
import ailia_speech

import librosa

import os
import urllib.request

# Load target audio
ref_file_path = "demo.wav"
if not os.path.exists(ref_file_path):
    urllib.request.urlretrieve(
        "https://github.com/axinc-ai/ailia-models/raw/refs/heads/master/audio_processing/whisper/demo.wav",
        "demo.wav"
    )
audio_waveform, sampling_rate = librosa.load(ref_file_path, mono=True)

# Infer
speech = ailia_speech.Whisper()
speech.initialize_model(model_path = "./models/", model_type = ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_SMALL)
recognized_text = speech.transcribe(audio_waveform, sampling_rate)
print(recognized_text)
```

## API specification

https://github.com/axinc-ai/ailia-sdk
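The README usage above runs plain transcription; the same class can also run Whisper's translate task by passing the exported task constant to the constructor. A minimal sketch building on the README example, assuming demo.wav has already been downloaded and the small multilingual model is acceptable:

```python
import librosa
import ailia_speech

# Reuse the demo.wav downloaded in the README example above.
audio_waveform, sampling_rate = librosa.load("demo.wav", mono=True)

# Translate task: non-English speech is returned as English text.
speech = ailia_speech.Whisper(task=ailia_speech.AILIA_SPEECH_TASK_TRANSLATE)
speech.initialize_model(model_path="./models/",
                        model_type=ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_SMALL)

for segment in speech.transcribe(audio_waveform, sampling_rate):
    # Each segment dict also carries the detected source language and a confidence score.
    print(f'{segment["language"]}: {segment["text"]} ({segment["confidence"]:.2f})')
```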
ailia_speech-1.3.0.0.dist-info/RECORD
ADDED
@@ -0,0 +1,12 @@
ailia_speech/LICENSE_AILIA_EN.pdf,sha256=1DzVViPnw1uAS8gJ5a8uN3iZNNR5I1ItIXmezHfUpeM,70149
ailia_speech/LICENSE_AILIA_JA.pdf,sha256=s628QN47S2bNqIfuSjm2LBf0vIluv2df6MSemn6Ksmw,174134
ailia_speech/__init__.py,sha256=7XiloklOFiXHRHs_wDWyDW0HHD6obPDklxY327Lmwmc,25333
ailia_speech/linux/arm64-v8a/libailia_speech.so,sha256=JAOwnBr7lbiMZmPCM99pd4vJQ08ZuXDPpq-FurrXSnE,166096
ailia_speech/linux/x64/libailia_speech.so,sha256=WbFvA5wKTgS_Zx8ErT7WBKJbzOUexavr4nP4EkLNawQ,171360
ailia_speech/mac/libailia_speech.dylib,sha256=-JAC40yLslAVMvfh6LhDvP3Zyt3hIT3WZc7wa9-07zU,317112
ailia_speech/windows/x64/ailia_speech.dll,sha256=WJCOHi0Na4tdMG1RT7dA7yAoWumiGSWeW1vxUtiXDS8,126464
ailia_speech-1.3.0.0.data/scripts/__init__.py,sha256=7XiloklOFiXHRHs_wDWyDW0HHD6obPDklxY327Lmwmc,25333
ailia_speech-1.3.0.0.dist-info/METADATA,sha256=xj7S7gD2gsqVQbXiBDaHh1s_Cron4mr5SN80GcPTBFc,1902
ailia_speech-1.3.0.0.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
ailia_speech-1.3.0.0.dist-info/top_level.txt,sha256=Ou9XeJ9AvdK8eutw07oosCthftD1tRYzAgNY2BrYhDc,13
ailia_speech-1.3.0.0.dist-info/RECORD,,
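Each RECORD row is `path,hash,size`, where the hash is the urlsafe-base64 SHA-256 digest of the file with the trailing padding stripped, per the wheel spec. A minimal sketch for checking an installed file against its RECORD entry; the local path is an assumption:

```python
import base64
import hashlib

def record_hash(path):
    # Reproduce the "sha256=<urlsafe-base64 digest, no padding>" format used in RECORD.
    digest = hashlib.sha256(open(path, "rb").read()).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode()

# An unmodified copy of ailia_speech/__init__.py should print
# sha256=7XiloklOFiXHRHs_wDWyDW0HHD6obPDklxY327Lmwmc (see the RECORD row above).
print(record_hash("ailia_speech/__init__.py"))
```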
ailia_speech-1.3.0.0.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@
ailia_speech