ailia-speech 1.3.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ailia-speech might be problematic. Click here for more details.

Binary file
Binary file
@@ -0,0 +1,493 @@
1
+ import ctypes
2
+ import os
3
+ import sys
4
+
5
+ import numpy
6
+ import ailia
7
+ import ailia.audio
8
+ import ailia_tokenizer
9
+
10
+ import urllib.request
11
+ import ssl
12
+ import shutil
13
+ import platform
14
+
15
#### dependency check
# On Windows, probe for the MSVC runtime DLLs up front so that a missing
# redistributable produces a readable hint (presumably the native library
# loaded below depends on them -- confirm against the build configuration).
if sys.platform == "win32":
    try:
        for library in ["vcruntime140.dll", "vcruntime140_1.dll", "msvcp140.dll"]:
            ctypes.windll.LoadLibrary(library)
    except OSError:
        # Only a failed library load is expected here; anything else
        # (KeyboardInterrupt, programming errors) must propagate.
        print(" WARNING Please install MSVC 2015-2019 runtime from https://docs.microsoft.com/ja-jp/cpp/windows/latest-supported-vc-redist")
23
+
24
+
25
#### loading DLL / DYLIB / SO ####
# Pick the shared-library file name, the package sub-directory holding the
# build for this platform, and the ctypes loader class to use.
if sys.platform == "win32":
    dll_platform = "windows/x64"
    dll_name = "ailia_speech.dll"
    load_fn = ctypes.WinDLL
elif sys.platform == "darwin":
    dll_platform = "mac"
    dll_name = "libailia_speech.dylib"
    load_fn = ctypes.CDLL
else:
    is_arm = "arm" in platform.machine() or platform.machine() == "aarch64"
    if is_arm:
        if platform.architecture()[0] == "32bit":
            dll_platform = "linux/armeabi-v7a"
        else:
            dll_platform = "linux/arm64-v8a"
    else:
        dll_platform = "linux/x64"
    dll_name = "libailia_speech.so"
    load_fn = ctypes.CDLL

# Locations are tried in this order: the loader's default search path, the
# package directory, then the platform-specific sub-directory.  Every location
# is attempted and the last one that loads is the one kept in ``dll``.
dll_found = False
_package_dir = os.path.dirname(os.path.abspath(__file__)) + os.sep
candidate = ["", _package_dir, _package_dir + dll_platform + os.sep]
_load_error = None
for candidate_dir in candidate:
    try:
        dll = load_fn(candidate_dir + dll_name)
        dll_found = True
    except OSError as error:
        # Remember the most recent failure so it can be reported if no
        # candidate loads (e.g. a missing dependency rather than a missing file).
        _load_error = error
if not dll_found:
    msg = "DLL load failed : \'" + dll_name + "\' is not found"
    raise ImportError(msg) from _load_error
57
+
58
+ # ==============================================================================
59
+
60
+ from ctypes import *
61
+
62
# Status code that every library call is compared against.
AILIA_SPEECH_STATUS_SUCCESS = 0

# Model type identifiers accepted by the model-opening calls.
AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY = 0
AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_BASE = 1
AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_SMALL = 2
AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_MEDIUM = 3
AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE = 4
AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3 = 5

# Task identifiers.
AILIA_SPEECH_TASK_TRANSCRIBE = 0
AILIA_SPEECH_TASK_TRANSLATE = 1

# Creation flags.
AILIA_SPEECH_FLAG_NONE = 0
AILIA_SPEECH_FLAG_LIVE = 1

AILIA_SPEECH_VAD_TYPE_SILERO = 0
AILIA_SPEECH_API_CALLBACK_VERSION = 6

AILIA_SPEECH_TEXT_VERSION = 2

# Function-pointer types for the callback table.  The first CFUNCTYPE argument
# is the return type, the remaining ones are the parameter types.

# --- ailia audio ---
# Return type is c_int and the output pointer is the first parameter, matching
# the sibling GET_RESAMPLE_LEN prototype (the return type used to be missing,
# which put POINTER(c_int) in the restype slot).
AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_FRAME_LEN = CFUNCTYPE(c_int, POINTER(c_int), c_int, c_int, c_int, c_int)
AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_MEL_SPECTROGRAM = CFUNCTYPE(c_int, c_void_p, c_void_p, c_int, c_int, c_int, c_int, c_int, c_int, c_int, c_int, c_float, c_int, c_float, c_float, c_int, c_int, c_int)

# --- ailia tokenizer ---
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_CREATE = CFUNCTYPE(c_int, POINTER(c_void_p), c_int, c_int)
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_OPEN_MODEL_FILE_A = CFUNCTYPE(c_int, c_void_p, c_char_p)
# The wide-character variant takes a wide string pointer, not a single wchar.
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_OPEN_MODEL_FILE_W = CFUNCTYPE(c_int, c_void_p, c_wchar_p)
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_ENCODE = CFUNCTYPE(c_int, c_void_p, c_char_p)
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TOKEN_COUNT = CFUNCTYPE(c_int, c_void_p, POINTER(c_uint))
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TOKENS = CFUNCTYPE(c_int, c_void_p, POINTER(c_int), c_uint)
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_DECODE = CFUNCTYPE(c_int, c_void_p, POINTER(c_int), c_uint)
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TEXT_LENGTH = CFUNCTYPE(c_int, c_void_p, POINTER(c_uint))
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TEXT = CFUNCTYPE(c_int, c_void_p, c_char_p, c_uint)
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_DESTROY = CFUNCTYPE(c_int, c_void_p)
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_UTF8_TO_UTF32 = CFUNCTYPE(c_int, POINTER(c_uint), POINTER(c_uint), c_char_p, c_uint)
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_UTF32_TO_UTF8 = CFUNCTYPE(c_int, c_char_p, POINTER(c_uint), c_uint)

# --- ailia audio (resampling) and ailia core ---
AILIA_SPEECH_USER_API_AILIA_AUDIO_RESAMPLE = CFUNCTYPE(c_int, POINTER(c_float), POINTER(c_float), c_int, c_int, c_int, c_int)
AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_RESAMPLE_LEN = CFUNCTYPE(c_int, POINTER(c_int), c_int, c_int, c_int)
AILIA_SPEECH_USER_API_AILIA_CREATE = CFUNCTYPE(c_int, POINTER(c_void_p), c_int, c_int)
AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_FILE_A = CFUNCTYPE(c_int, c_void_p, c_char_p)
AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_FILE_W = CFUNCTYPE(c_int, c_void_p, POINTER(c_wchar))
AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_MEM = CFUNCTYPE(c_int, c_void_p, POINTER(c_byte), c_uint)
AILIA_SPEECH_USER_API_AILIA_SET_MEMORY_MODE = CFUNCTYPE(c_int, c_void_p, c_uint)
AILIA_SPEECH_USER_API_AILIA_DESTROY = CFUNCTYPE(None, c_void_p)
AILIA_SPEECH_USER_API_AILIA_UPDATE = CFUNCTYPE(c_int, c_void_p)
AILIA_SPEECH_USER_API_AILIA_GET_BLOB_INDEX_BY_INPUT_INDEX = CFUNCTYPE(c_int, c_void_p, POINTER(c_uint), c_uint)
AILIA_SPEECH_USER_API_AILIA_GET_BLOB_INDEX_BY_OUTPUT_INDEX = CFUNCTYPE(c_int, c_void_p, POINTER(c_uint), c_uint)
AILIA_SPEECH_USER_API_AILIA_GET_BLOB_DATA = CFUNCTYPE(c_int, c_void_p, POINTER(c_float), c_uint, c_uint)
AILIA_SPEECH_USER_API_AILIA_SET_INPUT_BLOB_DATA = CFUNCTYPE(c_int, c_void_p, POINTER(c_float), c_uint, c_uint)
AILIA_SPEECH_USER_API_AILIA_SET_INPUT_BLOB_SHAPE = CFUNCTYPE(c_int, c_void_p, c_void_p, c_uint, c_uint)
AILIA_SPEECH_USER_API_AILIA_GET_BLOB_SHAPE = CFUNCTYPE(c_int, c_void_p, c_void_p, c_uint, c_uint)
AILIA_SPEECH_USER_API_AILIA_GET_ERROR_DETAIL = CFUNCTYPE(c_char_p, c_void_p)

AILIA_SPEECH_USER_API_AILIA_COPY_BLOB_DATA = CFUNCTYPE(c_int, c_void_p, c_uint, c_void_p, c_uint)
AILIA_SPEECH_USER_API_AILIA_GET_ENVIRONMENT = CFUNCTYPE(c_int, POINTER(c_void_p), c_uint, c_uint)
117
+
118
+
119
class struct__AILIASpeechApiCallback(Structure):
    """Table of function pointers (audio, tokenizer and core entry points)
    that is handed to ``ailiaSpeechCreate``."""


# (field name, function-pointer type) pairs, in structure order.
_AILIA_SPEECH_API_CALLBACK_FIELDS = [
    # audio
    ('ailiaAudioGetFrameLen', AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_FRAME_LEN),
    ('ailiaAudioGetMelSpectrogram', AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_MEL_SPECTROGRAM),
    ('ailiaAudioResample', AILIA_SPEECH_USER_API_AILIA_AUDIO_RESAMPLE),
    ('ailiaAudioGetResampleLen', AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_RESAMPLE_LEN),

    # tokenizer
    ('ailiaTokenizerCreate', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_CREATE),
    ('ailiaTokenizerOpenModelFileA', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_OPEN_MODEL_FILE_A),
    ('ailiaTokenizerOpenModelFileW', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_OPEN_MODEL_FILE_W),
    ('ailiaTokenizerEncode', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_ENCODE),
    ('ailiaTokenizerGetTokenCount', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TOKEN_COUNT),
    ('ailiaTokenizerGetTokens', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TOKENS),
    ('ailiaTokenizerDecode', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_DECODE),
    ('ailiaTokenizerGetTextLength', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TEXT_LENGTH),
    ('ailiaTokenizerGetText', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TEXT),
    ('ailiaTokenizerDestroy', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_DESTROY),
    ('ailiaTokenizerUtf8ToUtf32', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_UTF8_TO_UTF32),
    ('ailiaTokenizerUtf32ToUtf8', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_UTF32_TO_UTF8),

    # core
    ('ailiaCreate', AILIA_SPEECH_USER_API_AILIA_CREATE),
    ('ailiaOpenWeightFileA', AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_FILE_A),
    ('ailiaOpenWeightFileW', AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_FILE_W),
    ('ailiaOpenWeightMem', AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_MEM),
    ('ailiaSetMemoryMode', AILIA_SPEECH_USER_API_AILIA_SET_MEMORY_MODE),
    ('ailiaDestroy', AILIA_SPEECH_USER_API_AILIA_DESTROY),
    ('ailiaUpdate', AILIA_SPEECH_USER_API_AILIA_UPDATE),
    ('ailiaGetBlobIndexByInputIndex', AILIA_SPEECH_USER_API_AILIA_GET_BLOB_INDEX_BY_INPUT_INDEX),
    ('ailiaGetBlobIndexByOutputIndex', AILIA_SPEECH_USER_API_AILIA_GET_BLOB_INDEX_BY_OUTPUT_INDEX),
    ('ailiaGetBlobData', AILIA_SPEECH_USER_API_AILIA_GET_BLOB_DATA),
    ('ailiaSetInputBlobData', AILIA_SPEECH_USER_API_AILIA_SET_INPUT_BLOB_DATA),
    ('ailiaSetInputBlobShape', AILIA_SPEECH_USER_API_AILIA_SET_INPUT_BLOB_SHAPE),
    ('ailiaGetBlobShape', AILIA_SPEECH_USER_API_AILIA_GET_BLOB_SHAPE),
    ('ailiaGetErrorDetail', AILIA_SPEECH_USER_API_AILIA_GET_ERROR_DETAIL),
    ('ailiaCopyBlobData', AILIA_SPEECH_USER_API_AILIA_COPY_BLOB_DATA),
    ('ailiaGetEnvironment', AILIA_SPEECH_USER_API_AILIA_GET_ENVIRONMENT),
]

# Both attributes are attached after the class object exists; the slot names
# are the field names, in the same order.
struct__AILIASpeechApiCallback.__slots__ = [
    field_name for field_name, _ in _AILIA_SPEECH_API_CALLBACK_FIELDS
]
struct__AILIASpeechApiCallback._fields_ = _AILIA_SPEECH_API_CALLBACK_FIELDS

# Public alias used by the rest of the module.
AILIASpeechApiCallback = struct__AILIASpeechApiCallback
196
+
197
+ # ==============================================================================
198
+
199
# Return type and argument types of the library exports, declared from one
# table: (export name, restype, argtypes).
for _export_name, _restype, _argtypes in (
    ("ailiaSpeechCreate", c_int,
     (POINTER(c_void_p), c_int32, c_int32, c_int32, c_int32, c_int32, AILIASpeechApiCallback, c_int32)),
    ("ailiaSpeechDestroy", None, (c_void_p, )),
    ("ailiaSpeechOpenModelFileA", c_int, (c_void_p, c_char_p, c_char_p, c_int32)),
    ("ailiaSpeechOpenModelFileW", c_int, (c_void_p, c_wchar_p, c_wchar_p, c_int32)),
    ("ailiaSpeechOpenVadFileA", c_int, (c_void_p, c_char_p, c_int32)),
    ("ailiaSpeechOpenVadFileW", c_int, (c_void_p, c_wchar_p, c_int32)),
    ("ailiaSpeechPushInputData", c_int,
     (c_void_p,
      numpy.ctypeslib.ndpointer(dtype=numpy.float32, flags='CONTIGUOUS'),  # src
      ctypes.c_uint,
      ctypes.c_uint,
      ctypes.c_uint)),
    ("ailiaSpeechFinalizeInputData", c_int, (c_void_p, )),
    ("ailiaSpeechBuffered", c_int, (c_void_p, POINTER(ctypes.c_uint))),
    ("ailiaSpeechComplete", c_int, (c_void_p, POINTER(ctypes.c_uint))),
    ("ailiaSpeechTranscribe", c_int, (c_void_p, )),
    ("ailiaSpeechGetTextCount", c_int, (c_void_p, POINTER(ctypes.c_uint))),
):
    _export = getattr(dll, _export_name)
    _export.restype = _restype
    _export.argtypes = _argtypes
239
+
240
class AILIASpeechText(ctypes.Structure):
    """One transcription result record, as filled in by ``ailiaSpeechGetText``."""


# The field layout is attached after class creation; names, order and types
# are exactly those of the record.
AILIASpeechText._fields_ = [
    ("text", ctypes.c_char_p),
    ("time_stamp_begin", ctypes.c_float),
    ("time_stamp_end", ctypes.c_float),
    ("person_id", ctypes.c_uint),
    ("language", ctypes.c_char_p),
    ("confidence", ctypes.c_float),
]
248
+
249
# Python-side type of the intermediate-result callback: it receives the
# integer handle given at registration time and a byte string.
# NOTE(review): the handle is declared as a 64-bit integer both here and in
# ailiaSpeechSetIntermediateCallback; if the native side uses a pointer-sized
# handle this would not match on 32-bit builds -- confirm against the C header.
AILIA_SPEECH_USER_API_INTERMEDIATE_CALLBACK = CFUNCTYPE((c_int), c_int64, c_char_p)

# (export name, restype, argtypes) for the remaining library exports.
for _export_name, _restype, _argtypes in (
    ("ailiaSpeechGetText", c_int,
     (c_void_p, POINTER(AILIASpeechText), ctypes.c_uint, ctypes.c_uint)),
    ("ailiaSpeechResetTranscribeState", c_int, (c_void_p, )),
    ("ailiaSpeechSetIntermediateCallback", c_int,
     (c_void_p, AILIA_SPEECH_USER_API_INTERMEDIATE_CALLBACK, c_int64)),
    ("ailiaSpeechSetLanguage", c_int, (c_void_p, c_char_p)),
):
    _export = getattr(dll, _export_name)
    _export.restype = _restype
    _export.argtypes = _argtypes
262
+
263
+ # ==============================================================================
264
+ # model download
265
+ # ==============================================================================
266
+
267
def progress_print(block_count, block_size, total_size):
    """Print a one-line text progress bar for a download in progress.

    The parameters are those of the ``reporthook`` callback of
    ``urllib.request.urlretrieve``. The line ends with a carriage return so
    that successive calls overwrite each other on a terminal.

    Args:
        block_count: Number of blocks transferred so far.
        block_size: Size of one block in bytes.
        total_size: Total size in bytes. A value of zero or less means the
            size is unknown or empty; in that case only the amount received
            so far is shown, because no percentage can be computed.
    """
    if total_size <= 0:
        # Avoids a ZeroDivisionError for 0 and a meaningless negative bar
        # for -1 (which urlretrieve reports when the size is not known).
        received_kb = block_count * block_size / 1024
        print(f'[ {received_kb:.0f}KB received ]', end='\r')
        return

    # The last block usually overshoots the total, so cap at 100 %.
    percentage = min(100.0 * block_count * block_size / total_size, 100)
    max_bar = 50
    bar_num = int(percentage / (100 / max_bar))
    progress_element = '=' * bar_num
    if bar_num != max_bar:
        progress_element += '>'
    bar_fill = ' '  # fill the blanks
    bar = progress_element.ljust(max_bar, bar_fill)
    total_size_kb = total_size / 1024
    print(f'[{bar} {percentage:.2f}% ( {total_size_kb:.0f}KB )]', end='\r')
281
+
282
def urlretrieve(remote_path, weight_path, progress_print):
    """Download *remote_path* to *weight_path* through a temporary file.

    The data is first written to ``weight_path + ".tmp"`` and only moved to
    *weight_path* once the transfer has finished, so an interrupted download
    never leaves a truncated file under the final name.

    Args:
        remote_path: URL to fetch.
        weight_path: Destination file path.
        progress_print: Progress callback passed to
            ``urllib.request.urlretrieve`` as its report hook.

    Raises:
        Whatever ``urllib.request.urlretrieve`` or ``shutil.move`` raises;
        a partially written temporary file is removed before propagating.
    """
    temp_path = weight_path + ".tmp"
    try:
        try:
            urllib.request.urlretrieve(remote_path, temp_path, progress_print)
        except ssl.SSLError:
            # NOTE(security): this retries over plain HTTP, which gives up
            # transport integrity for the downloaded model file. Kept because
            # it is the existing behaviour; review whether it is acceptable.
            print('SSLError detected, so try to download without ssl')
            # Rewrite only the URL scheme, not every "https" in the URL.
            if remote_path.startswith("https://"):
                remote_path = "http://" + remote_path[len("https://"):]
            urllib.request.urlretrieve(remote_path, temp_path, progress_print)
        shutil.move(temp_path, weight_path)
    finally:
        # After a successful move the temporary file no longer exists; if it
        # still does, the download or the move failed and it is a leftover.
        if os.path.exists(temp_path):
            os.remove(temp_path)
300
+
301
def check_and_download_file(file_path, remote_path):
    """Download *file_path* unless it already exists.

    Args:
        file_path: Local destination path; its base name is also the name
            of the remote file.
        remote_path: URL prefix to which the base name is appended.
    """
    if os.path.exists(file_path):
        return
    print('Downloading %s...' % file_path)
    file_name = os.path.basename(file_path)
    urlretrieve(remote_path + file_name, file_path, progress_print)
305
+
306
+ # ==============================================================================
307
+ # base model class
308
+ # ==============================================================================
309
+
310
class AiliaSpeechError(RuntimeError):
    """Error raised by this module; carries a numeric code.

    Attributes:
        code: The code given at construction time (a library status value,
            or -1 for errors detected on the Python side).
    """

    def __init__(self, message, code):
        text = "{} code:{}".format(message, code)
        super().__init__(text)
        self.code = code
314
+
315
class AiliaSpeechModel:
    """Base class with the helpers shared by the speech models: status
    checking, path-buffer creation and construction of the API callback table."""

    _api_callback = None  # AILIASpeechApiCallback instance, kept alive here
    _instance = None      # native instance handle, set by subclasses

    def _check(self, status):
        """Raise AiliaSpeechError unless *status* is the success code."""
        if status == AILIA_SPEECH_STATUS_SUCCESS:
            return
        raise AiliaSpeechError("ailia speech error", status)

    def _string_buffer(self, path):
        """Return *path* as a ctypes buffer: wide characters on Windows,
        UTF-8 encoded bytes on every other platform."""
        if sys.platform != "win32":
            return ctypes.create_string_buffer(path.encode("utf-8"))
        return ctypes.create_unicode_buffer(path)

    def _create_callback(self):
        """Build the callback table and store it in ``self._api_callback``.

        Every field of the table is bound to the export of the same name in
        the ailia audio, ailia tokenizer or ailia core library.
        """
        # Function-pointer type of each field, looked up by field name.
        prototypes = dict(AILIASpeechApiCallback._fields_)
        exports_by_library = (
            (ailia.audio.audio_core.dll, (
                "ailiaAudioGetFrameLen",
                "ailiaAudioGetMelSpectrogram",
                "ailiaAudioResample",
                "ailiaAudioGetResampleLen",
            )),
            (ailia_tokenizer.dll, (
                "ailiaTokenizerCreate",
                "ailiaTokenizerOpenModelFileA",
                "ailiaTokenizerOpenModelFileW",
                "ailiaTokenizerEncode",
                "ailiaTokenizerGetTokenCount",
                "ailiaTokenizerGetTokens",
                "ailiaTokenizerDecode",
                "ailiaTokenizerGetTextLength",
                "ailiaTokenizerGetText",
                "ailiaTokenizerDestroy",
                "ailiaTokenizerUtf8ToUtf32",
                "ailiaTokenizerUtf32ToUtf8",
            )),
            (ailia.core.dll, (
                "ailiaCreate",
                "ailiaOpenWeightFileA",
                "ailiaOpenWeightFileW",
                "ailiaOpenWeightMem",
                "ailiaSetMemoryMode",
                "ailiaDestroy",
                "ailiaUpdate",
                "ailiaGetBlobIndexByInputIndex",
                "ailiaGetBlobIndexByOutputIndex",
                "ailiaGetBlobData",
                "ailiaSetInputBlobData",
                "ailiaSetInputBlobShape",
                "ailiaGetBlobShape",
                "ailiaGetErrorDetail",
                "ailiaCopyBlobData",
                "ailiaGetEnvironment",
            )),
        )

        table = AILIASpeechApiCallback()
        for library, export_names in exports_by_library:
            for export_name in export_names:
                bound = prototypes[export_name]((export_name, library))
                setattr(table, export_name, bound)

        self._api_callback = table  # prevent GC
367
+
368
+ # ==============================================================================
369
+ # Public class
370
+ # ==============================================================================
371
+
372
# Registry that routes intermediate-text callbacks coming from the library
# back to the Python callable registered for each handle.
intermediate_callback_cnt = 0   # next handle value to hand out
intermediate_callback_map = {}  # handle -> Python callable


def intermediate_callback(handle, text):
    """Forward one intermediate result to the callable registered for *handle*.

    Args:
        handle: Key into ``intermediate_callback_map``.
        text: Byte string; it is decoded before being passed on.

    Returns:
        Always 0.
    """
    listener = intermediate_callback_map[handle]
    decoded = text.decode()
    listener(decoded)
    return 0
378
+
379
class Whisper(AiliaSpeechModel):
    """Whisper speech recognition backed by the ailia Speech native library.

    Typical use: construct, call ``initialize_model`` once, then call
    ``transcribe`` for each waveform.
    """

    _c_callback = None       # ctypes wrapper of intermediate_callback, kept alive here
    _callback_handle = None  # key of this instance's entry in intermediate_callback_map

    # model_type -> (encoder file, decoder file, encoder weights, decoder weights).
    # The weight entries are None for models that have no separate weight file.
    _MODEL_FILES = {
        AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY: (
            "encoder_tiny.opt.onnx", "decoder_tiny_fix_kv_cache.opt2.onnx", None, None),
        AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_BASE: (
            "encoder_base.opt.onnx", "decoder_base_fix_kv_cache.opt2.onnx", None, None),
        AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_SMALL: (
            "encoder_small.opt.onnx", "decoder_small_fix_kv_cache.opt2.onnx", None, None),
        AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_MEDIUM: (
            "encoder_medium.opt.onnx", "decoder_medium_fix_kv_cache.opt2.onnx", None, None),
        AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE: (
            "encoder_large.onnx", "decoder_large_fix_kv_cache.onnx",
            "encoder_large_weights.pb", "decoder_large_fix_kv_cache_weights.pb"),
        AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3: (
            "encoder_large_v3.onnx", "decoder_large_v3_fix_kv_cache.onnx",
            "encoder_large_v3_weights.pb", "decoder_large_v3_fix_kv_cache_weights.pb"),
    }

    def __init__(self, env_id = -1, num_thread = 0, memory_mode = 11, task = AILIA_SPEECH_TASK_TRANSCRIBE, flags = AILIA_SPEECH_FLAG_NONE, callback = None):
        """Create the native speech instance.

        Args:
            env_id, num_thread, memory_mode, task, flags: Passed unchanged
                (as 32-bit integers) to ``ailiaSpeechCreate``.
            callback: Optional callable that receives intermediate text as a
                ``str``; registered via ``ailiaSpeechSetIntermediateCallback``.

        Raises:
            AiliaSpeechError: If a library call does not report success.
        """
        global intermediate_callback_cnt

        self._instance = ctypes.c_void_p(None)
        self._create_callback()
        self._check(dll.ailiaSpeechCreate(cast(pointer(self._instance), POINTER(c_void_p)), ctypes.c_int32(env_id), ctypes.c_int32(num_thread), ctypes.c_int32(memory_mode), ctypes.c_int32(task), ctypes.c_int32(flags), self._api_callback, ctypes.c_int32(AILIA_SPEECH_API_CALLBACK_VERSION)))
        if callback is not None:
            # Reserve a handle and remember it so __del__ can drop the
            # registry entry again.
            handle = intermediate_callback_cnt
            intermediate_callback_cnt = handle + 1
            self._c_callback = AILIA_SPEECH_USER_API_INTERMEDIATE_CALLBACK(intermediate_callback)
            intermediate_callback_map[handle] = callback
            self._callback_handle = handle
            self._check(dll.ailiaSpeechSetIntermediateCallback(self._instance, self._c_callback, handle))

    def initialize_model(self, model_path = "./", model_type = AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY):
        """Download (if missing) and open the model and VAD files.

        Args:
            model_path: Directory holding the model files; it is created when
                absent. A trailing path separator is optional.
            model_type: One of the ``AILIA_SPEECH_MODEL_TYPE_WHISPER_*`` values.

        Raises:
            AiliaSpeechError: For an unknown *model_type* (code -1) or when a
                library call does not report success.
        """
        try:
            encoder_path, decoder_path, encoder_pb_path, decoder_pb_path = self._MODEL_FILES[model_type]
        except KeyError:
            raise AiliaSpeechError(f"unsupported model_type {model_type}", -1) from None
        self._download_model(model_path, encoder_path, decoder_path, encoder_pb_path, decoder_pb_path)
        self._open_model(os.path.join(model_path, encoder_path), os.path.join(model_path, decoder_path), model_type)
        self._open_vad(os.path.join(model_path, "silero_vad.onnx"), AILIA_SPEECH_VAD_TYPE_SILERO)

    def _download_model(self, model_path, encoder_path, decoder_path, encoder_pb_path, decoder_pb_path):
        """Fetch every file that is not yet present in *model_path*."""
        REMOTE_PATH = "https://storage.googleapis.com/ailia-models/whisper/"
        if model_path:
            # An empty path means the current directory; makedirs("") would fail.
            os.makedirs(model_path, exist_ok = True)
        for file_name in (encoder_path, decoder_path, encoder_pb_path, decoder_pb_path):
            if file_name is not None:
                check_and_download_file(os.path.join(model_path, file_name), REMOTE_PATH)

        REMOTE_PATH = "https://storage.googleapis.com/ailia-models/silero-vad/"
        check_and_download_file(os.path.join(model_path, "silero_vad.onnx"), REMOTE_PATH)

    def _open_model(self, encoder, decoder, model_type):
        """Open the encoder/decoder pair (wide-character API on Windows)."""
        p1 = self._string_buffer(encoder)
        p2 = self._string_buffer(decoder)

        if sys.platform == "win32":
            self._check(dll.ailiaSpeechOpenModelFileW(self._instance, p1, p2, model_type))
        else:
            self._check(dll.ailiaSpeechOpenModelFileA(self._instance, p1, p2, model_type))

    def _open_vad(self, vad, vad_type):
        """Open the VAD model file (wide-character API on Windows)."""
        p1 = self._string_buffer(vad)

        if sys.platform == "win32":
            self._check(dll.ailiaSpeechOpenVadFileW(self._instance, p1, vad_type))
        else:
            self._check(dll.ailiaSpeechOpenVadFileA(self._instance, p1, vad_type))

    def transcribe(self, audio_waveform, sampling_rate, lang = None):
        """Transcribe one waveform and return the recognised segments.

        Args:
            audio_waveform: numpy array, either 1-D (a single channel) or 2-D
                with the channel as the first axis. It is converted to
                float32; 2-D input is transposed and flattened so that the
                samples of all channels are interleaved.
            sampling_rate: Passed to ``ailiaSpeechPushInputData``.
            lang: Optional language string passed to
                ``ailiaSpeechSetLanguage``; ``None`` skips that call.

        Returns:
            A list of dicts with the keys ``text``, ``time_stamp_begin``,
            ``time_stamp_end``, ``person_id``, ``language`` and ``confidence``.

        Raises:
            AiliaSpeechError: For input that is neither 1-D nor 2-D (code -1)
                or when a library call does not report success.
        """
        if len(audio_waveform.shape) == 1:
            channels = 1
        elif len(audio_waveform.shape) == 2:
            channels = audio_waveform.shape[0]
            audio_waveform = numpy.transpose(audio_waveform, (1, 0)).flatten()
        else:
            raise AiliaSpeechError(f"audio_waveform must be 1 channel or 2 channel", -1)

        audio_waveform = numpy.ascontiguousarray(audio_waveform.astype(numpy.float32))

        if lang is not None:
            # ailiaSpeechSetLanguage is declared with a c_char_p parameter on
            # every platform, so always pass bytes. The wide-character buffer
            # that _string_buffer returns on Windows would be rejected here.
            self._check(dll.ailiaSpeechSetLanguage(self._instance, lang.encode("utf-8")))

        self._check(dll.ailiaSpeechPushInputData(self._instance, audio_waveform, channels, audio_waveform.shape[0] // channels, sampling_rate))
        self._check(dll.ailiaSpeechFinalizeInputData(self._instance))
        self._check(dll.ailiaSpeechTranscribe(self._instance))

        count = ctypes.c_uint(0)
        self._check(dll.ailiaSpeechGetTextCount(self._instance, ctypes.byref(count)))
        results = []
        for i in range(count.value):
            text = AILIASpeechText()
            self._check(dll.ailiaSpeechGetText(self._instance, ctypes.byref(text), AILIA_SPEECH_TEXT_VERSION, i))
            results.append({"text" : text.text.decode(), "time_stamp_begin" : text.time_stamp_begin, "time_stamp_end" : text.time_stamp_end, "person_id" : text.person_id, "language" : text.language.decode(), "confidence" : text.confidence})

        self._check(dll.ailiaSpeechResetTranscribeState(self._instance))

        return results

    def __del__(self):
        """Destroy the native instance and drop the callback registration."""
        if self._instance:
            dll.ailiaSpeechDestroy(cast(self._instance, c_void_p))
            self._instance = None
        if self._callback_handle is not None:
            # Removed only after the native instance is gone, so the library
            # can no longer call back with this handle.
            intermediate_callback_map.pop(self._callback_handle, None)
            self._callback_handle = None
493
+
Binary file
@@ -0,0 +1,493 @@
1
+ import ctypes
2
+ import os
3
+ import sys
4
+
5
+ import numpy
6
+ import ailia
7
+ import ailia.audio
8
+ import ailia_tokenizer
9
+
10
+ import urllib.request
11
+ import ssl
12
+ import shutil
13
+ import platform
14
+
15
+ #### dependency check
16
if sys.platform == "win32":
    # Probe for the MSVC runtime DLLs up front so that a missing runtime
    # produces a readable hint. This is best-effort: a failure only prints a
    # warning and does not stop the import.
    try:
        for library in ["vcruntime140.dll", "vcruntime140_1.dll", "msvcp140.dll"]:
            ctypes.windll.LoadLibrary(library)
    except OSError:
        # ctypes reports a library that cannot be loaded as OSError; anything
        # else (e.g. KeyboardInterrupt) is no longer swallowed.
        print(" WARNING Please install MSVC 2015-2019 runtime from https://docs.microsoft.com/ja-jp/cpp/windows/latest-supported-vc-redist")
23
+
24
+
25
+ #### loading DLL / DYLIB / SO ####
26
# Select the bundled library directory, file name and ctypes loader for the
# current platform.
if sys.platform == "win32":
    dll_platform = "windows/x64"
    dll_name = "ailia_speech.dll"
    load_fn = ctypes.WinDLL
elif sys.platform == "darwin":
    dll_platform = "mac"
    dll_name = "libailia_speech.dylib"
    load_fn = ctypes.CDLL
else:
    is_arm = "arm" in platform.machine() or platform.machine() == "aarch64"
    if is_arm:
        if platform.architecture()[0] == "32bit":
            dll_platform = "linux/armeabi-v7a"
        else:
            dll_platform = "linux/arm64-v8a"
    else:
        dll_platform = "linux/x64"
    dll_name = "libailia_speech.so"
    load_fn = ctypes.CDLL

dll_found = False
_package_dir = os.path.dirname(os.path.abspath(__file__)) + os.sep
# Search order: loader default path, the package directory, then the
# platform-specific sub-directory of the package.
candidate = ["", _package_dir, _package_dir + dll_platform + os.sep]
_dll_load_error = None
# Every candidate is tried and the last successful load is kept, so a library
# found in a later candidate takes precedence over an earlier one.
for dll_dir in candidate:
    try:
        dll = load_fn(dll_dir + dll_name)
        dll_found = True
    except OSError as load_error:
        # A missing or unloadable library is expected for some candidates;
        # remember the reason so it can be attached to the final ImportError.
        _dll_load_error = load_error
if not dll_found:
    msg = "DLL load failed : \'" + dll_name + "\' is not found"
    raise ImportError(msg) from _dll_load_error
57
+
58
+ # ==============================================================================
59
+
60
+ from ctypes import *
61
+
62
# Status value that the native calls return on success.
AILIA_SPEECH_STATUS_SUCCESS = 0

# Model type identifiers.
AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY = 0
AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_BASE = 1
AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_SMALL = 2
AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_MEDIUM = 3
AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE = 4
AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3 = 5

# Task identifiers.
AILIA_SPEECH_TASK_TRANSCRIBE = 0
AILIA_SPEECH_TASK_TRANSLATE = 1

# Flag values.
AILIA_SPEECH_FLAG_NONE = 0
AILIA_SPEECH_FLAG_LIVE = 1

# Voice-activity-detection model type and the version numbers handed to the
# native library alongside the callback table and the text structure.
AILIA_SPEECH_VAD_TYPE_SILERO = 0
AILIA_SPEECH_API_CALLBACK_VERSION = 6

AILIA_SPEECH_TEXT_VERSION = 2
81
+
82
# Function-pointer prototypes for the entries of the API callback structure.
# Each is CFUNCTYPE(restype, *argtypes).

# Audio helpers.
# Fixed: the int return type was missing, which left POINTER(c_int) in the
# restype slot; the shape now parallels AUDIO_GET_RESAMPLE_LEN below.
AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_FRAME_LEN = CFUNCTYPE(c_int, POINTER(c_int), c_int, c_int, c_int, c_int)
AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_MEL_SPECTROGRAM = CFUNCTYPE(c_int, c_void_p, c_void_p, c_int, c_int, c_int, c_int, c_int, c_int, c_int, c_int, c_float, c_int, c_float, c_float, c_int, c_int, c_int)

# Tokenizer.
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_CREATE = CFUNCTYPE(c_int, POINTER(c_void_p), c_int, c_int)
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_OPEN_MODEL_FILE_A = CFUNCTYPE(c_int, c_void_p, c_char_p)
# Fixed: the path argument is a wide-character string pointer, not a single
# c_wchar; POINTER(c_wchar) matches OPEN_WEIGHT_FILE_W below.
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_OPEN_MODEL_FILE_W = CFUNCTYPE(c_int, c_void_p, POINTER(c_wchar))
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_ENCODE = CFUNCTYPE(c_int, c_void_p, c_char_p)
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TOKEN_COUNT = CFUNCTYPE(c_int, c_void_p, POINTER(c_uint))
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TOKENS = CFUNCTYPE(c_int, c_void_p, POINTER(c_int), c_uint)
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_DECODE = CFUNCTYPE(c_int, c_void_p, POINTER(c_int), c_uint)
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TEXT_LENGTH = CFUNCTYPE(c_int, c_void_p, POINTER(c_uint))
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TEXT = CFUNCTYPE(c_int, c_void_p, c_char_p, c_uint)
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_DESTROY = CFUNCTYPE(c_int, c_void_p)
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_UTF8_TO_UTF32 = CFUNCTYPE(c_int, POINTER(c_uint), POINTER(c_uint), c_char_p, c_uint)
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_UTF32_TO_UTF8 = CFUNCTYPE(c_int, c_char_p, POINTER(c_uint), c_uint)

# Resampling and the inference core.
AILIA_SPEECH_USER_API_AILIA_AUDIO_RESAMPLE = CFUNCTYPE(c_int, POINTER(c_float), POINTER(c_float), c_int, c_int, c_int, c_int)
AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_RESAMPLE_LEN = CFUNCTYPE(c_int, POINTER(c_int), c_int, c_int, c_int)
AILIA_SPEECH_USER_API_AILIA_CREATE = CFUNCTYPE(c_int, POINTER(c_void_p), c_int, c_int)
AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_FILE_A = CFUNCTYPE(c_int, c_void_p, c_char_p)
AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_FILE_W = CFUNCTYPE(c_int, c_void_p, POINTER(c_wchar))
AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_MEM = CFUNCTYPE(c_int, c_void_p, POINTER(c_byte), c_uint)
AILIA_SPEECH_USER_API_AILIA_SET_MEMORY_MODE = CFUNCTYPE(c_int, c_void_p, c_uint)
AILIA_SPEECH_USER_API_AILIA_DESTROY = CFUNCTYPE(None, c_void_p)
AILIA_SPEECH_USER_API_AILIA_UPDATE = CFUNCTYPE(c_int, c_void_p)
AILIA_SPEECH_USER_API_AILIA_GET_BLOB_INDEX_BY_INPUT_INDEX = CFUNCTYPE(c_int, c_void_p, POINTER(c_uint), c_uint)
AILIA_SPEECH_USER_API_AILIA_GET_BLOB_INDEX_BY_OUTPUT_INDEX = CFUNCTYPE(c_int, c_void_p, POINTER(c_uint), c_uint)
AILIA_SPEECH_USER_API_AILIA_GET_BLOB_DATA = CFUNCTYPE(c_int, c_void_p, POINTER(c_float), c_uint, c_uint)
AILIA_SPEECH_USER_API_AILIA_SET_INPUT_BLOB_DATA = CFUNCTYPE(c_int, c_void_p, POINTER(c_float), c_uint, c_uint)
AILIA_SPEECH_USER_API_AILIA_SET_INPUT_BLOB_SHAPE = CFUNCTYPE(c_int, c_void_p, c_void_p, c_uint, c_uint)
AILIA_SPEECH_USER_API_AILIA_GET_BLOB_SHAPE = CFUNCTYPE(c_int, c_void_p, c_void_p, c_uint, c_uint)
AILIA_SPEECH_USER_API_AILIA_GET_ERROR_DETAIL = CFUNCTYPE(c_char_p, c_void_p)

AILIA_SPEECH_USER_API_AILIA_COPY_BLOB_DATA = CFUNCTYPE(c_int, c_void_p, c_uint, c_void_p, c_uint)
AILIA_SPEECH_USER_API_AILIA_GET_ENVIRONMENT = CFUNCTYPE(c_int, POINTER(c_void_p), c_uint, c_uint)
117
+
118
+
119
class struct__AILIASpeechApiCallback(Structure):
    pass

# Table of function pointers, one per API entry point, in the order expected
# by the native library. The field order is part of the binary layout and
# must not be changed.
struct__AILIASpeechApiCallback._fields_ = [
    # audio
    ('ailiaAudioGetFrameLen', AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_FRAME_LEN),
    ('ailiaAudioGetMelSpectrogram', AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_MEL_SPECTROGRAM),
    ('ailiaAudioResample', AILIA_SPEECH_USER_API_AILIA_AUDIO_RESAMPLE),
    ('ailiaAudioGetResampleLen', AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_RESAMPLE_LEN),

    # tokenizer
    ('ailiaTokenizerCreate', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_CREATE),
    ('ailiaTokenizerOpenModelFileA', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_OPEN_MODEL_FILE_A),
    ('ailiaTokenizerOpenModelFileW', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_OPEN_MODEL_FILE_W),
    ('ailiaTokenizerEncode', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_ENCODE),
    ('ailiaTokenizerGetTokenCount', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TOKEN_COUNT),
    ('ailiaTokenizerGetTokens', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TOKENS),
    ('ailiaTokenizerDecode', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_DECODE),
    ('ailiaTokenizerGetTextLength', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TEXT_LENGTH),
    ('ailiaTokenizerGetText', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TEXT),
    ('ailiaTokenizerDestroy', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_DESTROY),
    ('ailiaTokenizerUtf8ToUtf32', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_UTF8_TO_UTF32),
    ('ailiaTokenizerUtf32ToUtf8', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_UTF32_TO_UTF8),

    # inference core
    ('ailiaCreate', AILIA_SPEECH_USER_API_AILIA_CREATE),
    ('ailiaOpenWeightFileA', AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_FILE_A),
    ('ailiaOpenWeightFileW', AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_FILE_W),
    ('ailiaOpenWeightMem', AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_MEM),
    ('ailiaSetMemoryMode', AILIA_SPEECH_USER_API_AILIA_SET_MEMORY_MODE),
    ('ailiaDestroy', AILIA_SPEECH_USER_API_AILIA_DESTROY),
    ('ailiaUpdate', AILIA_SPEECH_USER_API_AILIA_UPDATE),
    ('ailiaGetBlobIndexByInputIndex', AILIA_SPEECH_USER_API_AILIA_GET_BLOB_INDEX_BY_INPUT_INDEX),
    ('ailiaGetBlobIndexByOutputIndex', AILIA_SPEECH_USER_API_AILIA_GET_BLOB_INDEX_BY_OUTPUT_INDEX),
    ('ailiaGetBlobData', AILIA_SPEECH_USER_API_AILIA_GET_BLOB_DATA),
    ('ailiaSetInputBlobData', AILIA_SPEECH_USER_API_AILIA_SET_INPUT_BLOB_DATA),
    ('ailiaSetInputBlobShape', AILIA_SPEECH_USER_API_AILIA_SET_INPUT_BLOB_SHAPE),
    ('ailiaGetBlobShape', AILIA_SPEECH_USER_API_AILIA_GET_BLOB_SHAPE),
    ('ailiaGetErrorDetail', AILIA_SPEECH_USER_API_AILIA_GET_ERROR_DETAIL),
    ('ailiaCopyBlobData', AILIA_SPEECH_USER_API_AILIA_COPY_BLOB_DATA),
    ('ailiaGetEnvironment', AILIA_SPEECH_USER_API_AILIA_GET_ENVIRONMENT),
]

# The slot list is exactly the field names, in field order.
struct__AILIASpeechApiCallback.__slots__ = [
    field_name for field_name, _ in struct__AILIASpeechApiCallback._fields_
]

AILIASpeechApiCallback = struct__AILIASpeechApiCallback
196
+
197
+ # ==============================================================================
198
+
199
# Declared signatures of the native entry points. All of them return an int
# status except ailiaSpeechDestroy, which returns nothing.

dll.ailiaSpeechCreate.restype = c_int
dll.ailiaSpeechCreate.argtypes = (
    POINTER(c_void_p),          # receives the new instance handle
    c_int32,                    # env_id
    c_int32,                    # num_thread
    c_int32,                    # memory_mode
    c_int32,                    # task
    c_int32,                    # flags
    AILIASpeechApiCallback,     # callback table, passed by value
    c_int32,                    # callback table version
)

dll.ailiaSpeechDestroy.restype = None
dll.ailiaSpeechDestroy.argtypes = (c_void_p, )

# Narrow-character (A) and wide-character (W) variants of the file openers.
dll.ailiaSpeechOpenModelFileA.restype = c_int
dll.ailiaSpeechOpenModelFileA.argtypes = (c_void_p, c_char_p, c_char_p, c_int32)

dll.ailiaSpeechOpenModelFileW.restype = c_int
dll.ailiaSpeechOpenModelFileW.argtypes = (c_void_p, c_wchar_p, c_wchar_p, c_int32)

dll.ailiaSpeechOpenVadFileA.restype = c_int
dll.ailiaSpeechOpenVadFileA.argtypes = (c_void_p, c_char_p, c_int32)

dll.ailiaSpeechOpenVadFileW.restype = c_int
dll.ailiaSpeechOpenVadFileW.argtypes = (c_void_p, c_wchar_p, c_int32)

dll.ailiaSpeechPushInputData.restype = c_int
dll.ailiaSpeechPushInputData.argtypes = (
    c_void_p,
    numpy.ctypeslib.ndpointer(dtype=numpy.float32, flags='CONTIGUOUS'),  # src
    c_uint,
    c_uint,
    c_uint,
)

dll.ailiaSpeechFinalizeInputData.restype = c_int
dll.ailiaSpeechFinalizeInputData.argtypes = (c_void_p, )

dll.ailiaSpeechBuffered.restype = c_int
dll.ailiaSpeechBuffered.argtypes = (c_void_p, POINTER(c_uint))

dll.ailiaSpeechComplete.restype = c_int
dll.ailiaSpeechComplete.argtypes = (c_void_p, POINTER(c_uint))

dll.ailiaSpeechTranscribe.restype = c_int
dll.ailiaSpeechTranscribe.argtypes = (c_void_p, )

dll.ailiaSpeechGetTextCount.restype = c_int
dll.ailiaSpeechGetTextCount.argtypes = (c_void_p, POINTER(c_uint))
239
+
240
class AILIASpeechText(ctypes.Structure):
    # One recognized text segment as filled in by ailiaSpeechGetText.
    # The field order defines the binary layout and must not be changed.
    _fields_ = [
        ("text", ctypes.c_char_p),
        ("time_stamp_begin", ctypes.c_float),
        ("time_stamp_end", ctypes.c_float),
        ("person_id", ctypes.c_uint),
        ("language", ctypes.c_char_p),
        ("confidence", ctypes.c_float),
    ]
248
+
249
# Signatures of the result, reset, callback and language entry points.

dll.ailiaSpeechGetText.restype = c_int
dll.ailiaSpeechGetText.argtypes = (
    c_void_p,
    POINTER(AILIASpeechText),   # structure to fill
    c_uint,                     # structure version
    c_uint,                     # index of the text segment
)

dll.ailiaSpeechResetTranscribeState.restype = c_int
dll.ailiaSpeechResetTranscribeState.argtypes = (c_void_p, )

# Prototype of the intermediate-result callback: (user handle, text) -> int.
AILIA_SPEECH_USER_API_INTERMEDIATE_CALLBACK = CFUNCTYPE(c_int, c_int64, c_char_p)

dll.ailiaSpeechSetIntermediateCallback.restype = c_int
dll.ailiaSpeechSetIntermediateCallback.argtypes = (
    c_void_p,
    AILIA_SPEECH_USER_API_INTERMEDIATE_CALLBACK,
    c_int64,                    # user handle passed back to the callback
)

dll.ailiaSpeechSetLanguage.restype = c_int
dll.ailiaSpeechSetLanguage.argtypes = (c_void_p, c_char_p)
262
+
263
+ # ==============================================================================
264
+ # model download
265
+ # ==============================================================================
266
+
267
def progress_print(block_count, block_size, total_size):
    """Print a one-line download progress bar (urlretrieve report hook).

    Parameters:
        block_count: number of blocks transferred so far.
        block_size: size of one block in bytes.
        total_size: total size in bytes. ``urllib.request.urlretrieve``
            passes a non-positive value when the size is unknown; in that
            case only the downloaded amount is printed, because a percentage
            cannot be computed.

    The line ends with a carriage return so successive calls overwrite it.
    """
    downloaded = block_count * block_size
    if total_size <= 0:
        # Unknown size: avoid dividing by zero / printing a negative percentage.
        print(f'[ {downloaded / 1024:.0f}KB downloaded ]', end='\r')
        return

    # The last block usually overshoots, so cap the value at 100.
    percentage = min(100.0, 100.0 * downloaded / total_size)
    max_bar = 50
    bar_num = int(percentage / (100 / max_bar))
    progress_element = '=' * bar_num
    if bar_num != max_bar:
        progress_element += '>'
    bar = progress_element.ljust(max_bar, ' ')  # fill the blanks
    total_size_kb = total_size / 1024
    print(f'[{bar} {percentage:.2f}% ( {total_size_kb:.0f}KB )]', end='\r')
281
+
282
def urlretrieve(remote_path, weight_path, progress_print):
    """Download ``remote_path`` to ``weight_path`` via a temporary file.

    The data is first written to ``weight_path + ".tmp"`` and moved into place
    only after the transfer finished, so an interrupted download never leaves
    a truncated file at ``weight_path``. If the transfer fails, the temporary
    file is removed and the exception is re-raised.

    Parameters:
        remote_path: URL to fetch.
        weight_path: destination file path.
        progress_print: report hook passed to ``urllib.request.urlretrieve``.
    """
    temp_path = weight_path + ".tmp"
    try:
        try:
            urllib.request.urlretrieve(
                remote_path,
                temp_path,
                progress_print,
            )
        except ssl.SSLError:
            # NOTE(security): this retry downgrades to unencrypted HTTP and is
            # kept only for backward compatibility.
            print(f'SSLError detected, so try to download without ssl')
            # Rewrite only the scheme; the previous str.replace also altered
            # any later "https" occurring inside the URL.
            if remote_path.startswith("https"):
                remote_path = "http" + remote_path[len("https"):]
            urllib.request.urlretrieve(
                remote_path,
                temp_path,
                progress_print,
            )
    except BaseException:
        # Do not leave a partial download behind.
        try:
            os.remove(temp_path)
        except OSError:
            pass
        raise
    shutil.move(temp_path, weight_path)
300
+
301
def check_and_download_file(file_path, remote_path):
    """Fetch ``file_path`` from ``remote_path`` unless it already exists.

    The download URL is ``remote_path`` followed by the base name of
    ``file_path``.
    """
    if os.path.exists(file_path):
        return
    print('Downloading %s...' % file_path)
    source_url = remote_path + os.path.basename(file_path)
    urlretrieve(source_url, file_path, progress_print)
305
+
306
+ # ==============================================================================
307
+ # base model class
308
+ # ==============================================================================
309
+
310
class AiliaSpeechError(RuntimeError):
    """Error raised by this module; ``code`` carries the status value."""

    def __init__(self, message, code):
        RuntimeError.__init__(self, "{} code:{}".format(message, code))
        self.code = code
314
+
315
class AiliaSpeechModel:
    """Shared plumbing for model classes: status checks, path buffers and the
    API callback table."""

    _api_callback = None
    _instance = None

    def _check(self, status):
        """Raise AiliaSpeechError unless ``status`` is the success value."""
        if status == AILIA_SPEECH_STATUS_SUCCESS:
            return
        raise AiliaSpeechError("ailia speech error", status)

    def _string_buffer(self, path):
        """Return ``path`` as a ctypes buffer: wide characters on Windows,
        UTF-8 encoded bytes on every other platform."""
        if sys.platform != "win32":
            return ctypes.create_string_buffer(path.encode("utf-8"))
        return ctypes.create_unicode_buffer(path)

    def _create_callback(self):
        """Build the callback table and store it on ``self._api_callback``.

        Each field of the table is bound to the exported function of the same
        name; the field-name prefix decides which library provides it.
        """
        callback = AILIASpeechApiCallback()
        for field_name, prototype in AILIASpeechApiCallback._fields_:
            if field_name.startswith("ailiaAudio"):
                library = ailia.audio.audio_core.dll
            elif field_name.startswith("ailiaTokenizer"):
                library = ailia_tokenizer.dll
            else:
                library = ailia.core.dll
            setattr(callback, field_name, prototype((field_name, library)))

        self._api_callback = callback # prevent GC
367
+
368
+ # ==============================================================================
369
+ # Public class
370
+ # ==============================================================================
371
+
372
# Registry connecting the integer handle given to the native library with the
# Python callable that should receive intermediate text.
intermediate_callback_cnt = 0
intermediate_callback_map = {}

def intermediate_callback(handle, text):
    """Forward decoded intermediate ``text`` to the callable registered under
    ``handle`` and return 0."""
    user_callback = intermediate_callback_map[handle]
    user_callback(text.decode())
    return 0
378
+
379
class Whisper(AiliaSpeechModel):
    """Speech recognition with a Whisper model through the native library.

    Typical use: construct, call ``initialize_model`` once, then call
    ``transcribe`` for each waveform.
    """

    _c_callback = None
    # Key of this instance's entry in ``intermediate_callback_map`` (if any).
    _callback_handle = None

    # model_type -> (encoder file, decoder file, encoder weights, decoder weights)
    _MODEL_FILES = {
        AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY: (
            "encoder_tiny.opt.onnx", "decoder_tiny_fix_kv_cache.opt2.onnx", None, None),
        AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_BASE: (
            "encoder_base.opt.onnx", "decoder_base_fix_kv_cache.opt2.onnx", None, None),
        AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_SMALL: (
            "encoder_small.opt.onnx", "decoder_small_fix_kv_cache.opt2.onnx", None, None),
        AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_MEDIUM: (
            "encoder_medium.opt.onnx", "decoder_medium_fix_kv_cache.opt2.onnx", None, None),
        AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE: (
            "encoder_large.onnx", "decoder_large_fix_kv_cache.onnx",
            "encoder_large_weights.pb", "decoder_large_fix_kv_cache_weights.pb"),
        AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3: (
            "encoder_large_v3.onnx", "decoder_large_v3_fix_kv_cache.onnx",
            "encoder_large_v3_weights.pb", "decoder_large_v3_fix_kv_cache_weights.pb"),
    }

    def __init__(self, env_id = -1, num_thread = 0, memory_mode = 11, task = AILIA_SPEECH_TASK_TRANSCRIBE, flags = AILIA_SPEECH_FLAG_NONE, callback = None):
        """Create the native instance.

        Parameters:
            env_id, num_thread, memory_mode, task, flags: forwarded unchanged
                to ``ailiaSpeechCreate``.
            callback: optional callable receiving intermediate text (str).

        Raises:
            AiliaSpeechError: if a native call reports a non-success status.
        """
        global intermediate_callback_cnt

        self._instance = ctypes.c_void_p(None)
        self._create_callback()
        self._check(dll.ailiaSpeechCreate(cast(pointer(self._instance), POINTER(c_void_p)), ctypes.c_int32(env_id), ctypes.c_int32(num_thread), ctypes.c_int32(memory_mode), ctypes.c_int32(task), ctypes.c_int32(flags), self._api_callback, ctypes.c_int32(AILIA_SPEECH_API_CALLBACK_VERSION)))
        if callback is not None:
            # Keep the ctypes thunk referenced for as long as this object lives.
            self._c_callback = AILIA_SPEECH_USER_API_INTERMEDIATE_CALLBACK(intermediate_callback)
            handle = intermediate_callback_cnt
            intermediate_callback_map[handle] = callback
            # Remember the key so __del__ can release the registry entry.
            self._callback_handle = handle
            self._check(dll.ailiaSpeechSetIntermediateCallback(self._instance, self._c_callback, handle))
            intermediate_callback_cnt = handle + 1

    def initialize_model(self, model_path = "./", model_type = AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY):
        """Download (if missing) and open the model files for ``model_type``.

        Parameters:
            model_path: directory holding the model files; created if needed.
                A trailing path separator is optional.
            model_type: one of the AILIA_SPEECH_MODEL_TYPE_* constants.

        Raises:
            AiliaSpeechError: for an unknown ``model_type`` or a failing
                native call.
        """
        try:
            encoder_path, decoder_path, encoder_pb_path, decoder_pb_path = self._MODEL_FILES[model_type]
        except (KeyError, TypeError):
            # Previously an unknown value surfaced as UnboundLocalError.
            raise AiliaSpeechError(f"unsupported model_type {model_type!r}", -1) from None
        self._download_model(model_path, encoder_path, decoder_path, encoder_pb_path, decoder_pb_path)
        self._open_model(os.path.join(model_path, encoder_path), os.path.join(model_path, decoder_path), model_type)
        self._open_vad(os.path.join(model_path, "silero_vad.onnx"), AILIA_SPEECH_VAD_TYPE_SILERO)

    def _download_model(self, model_path, encoder_path, decoder_path, encoder_pb_path, decoder_pb_path):
        """Fetch every model file that is not yet present in ``model_path``."""
        whisper_url = "https://storage.googleapis.com/ailia-models/whisper/"
        vad_url = "https://storage.googleapis.com/ailia-models/silero-vad/"
        if model_path:
            # os.makedirs rejects an empty path; "" simply means the
            # current directory.
            os.makedirs(model_path, exist_ok = True)
        for file_name in (encoder_path, decoder_path, encoder_pb_path, decoder_pb_path):
            if file_name is not None:
                check_and_download_file(os.path.join(model_path, file_name), whisper_url)
        check_and_download_file(os.path.join(model_path, "silero_vad.onnx"), vad_url)

    def _open_model(self, encoder, decoder, model_type):
        """Open the encoder/decoder pair through the platform's entry point."""
        p1 = self._string_buffer(encoder)
        p2 = self._string_buffer(decoder)

        # _string_buffer yields wide characters on Windows, UTF-8 bytes elsewhere.
        if sys.platform == "win32":
            self._check(dll.ailiaSpeechOpenModelFileW(self._instance, p1, p2, model_type))
        else:
            self._check(dll.ailiaSpeechOpenModelFileA(self._instance, p1, p2, model_type))

    def _open_vad(self, vad, vad_type):
        """Open the voice-activity-detection model file."""
        p1 = self._string_buffer(vad)

        if sys.platform == "win32":
            self._check(dll.ailiaSpeechOpenVadFileW(self._instance, p1, vad_type))
        else:
            self._check(dll.ailiaSpeechOpenVadFileA(self._instance, p1, vad_type))

    @staticmethod
    def _decode_c_string(raw):
        """Decode a ``c_char_p`` field; a NULL pointer (None) becomes ""."""
        return raw.decode() if raw is not None else ""

    def transcribe(self, audio_waveform, sampling_rate, lang = None):
        """Transcribe a complete waveform and return the recognized segments.

        Parameters:
            audio_waveform: 1-D array of samples, or 2-D array indexed as
                (channel, sample). Any array-like accepted by
                ``numpy.asarray`` works; data is converted to float32.
            sampling_rate: sampling rate of ``audio_waveform``.
            lang: optional language string set before the audio is pushed.

        Returns:
            A list of dicts with the keys ``text``, ``time_stamp_begin``,
            ``time_stamp_end``, ``person_id``, ``language``, ``confidence``.

        Raises:
            AiliaSpeechError: if the waveform is neither 1-D nor 2-D, or if a
                native call reports a non-success status.
        """
        audio_waveform = numpy.asarray(audio_waveform)

        if len(audio_waveform.shape) == 1:
            channels = 1
        elif len(audio_waveform.shape) == 2:
            channels = audio_waveform.shape[0]
            # (channel, sample) -> interleaved samples in one flat buffer.
            audio_waveform = numpy.transpose(audio_waveform, (1, 0)).flatten()
        else:
            raise AiliaSpeechError(f"audio_waveform must be 1 channel or 2 channel", -1)

        audio_waveform = numpy.ascontiguousarray(audio_waveform.astype(numpy.float32))

        if lang is not None:
            self._check(dll.ailiaSpeechSetLanguage(self._instance, self._string_buffer(lang)))

        self._check(dll.ailiaSpeechPushInputData(self._instance, audio_waveform, channels, audio_waveform.shape[0] // channels, sampling_rate))
        self._check(dll.ailiaSpeechFinalizeInputData(self._instance))
        self._check(dll.ailiaSpeechTranscribe(self._instance))

        count = ctypes.c_uint(0)
        self._check(dll.ailiaSpeechGetTextCount(self._instance, ctypes.byref(count)))
        results = []
        for i in range(count.value):
            text = AILIASpeechText()
            self._check(dll.ailiaSpeechGetText(self._instance, ctypes.byref(text), AILIA_SPEECH_TEXT_VERSION, i))
            results.append({
                "text" : self._decode_c_string(text.text),
                "time_stamp_begin" : text.time_stamp_begin,
                "time_stamp_end" : text.time_stamp_end,
                "person_id" : text.person_id,
                "language" : self._decode_c_string(text.language),
                "confidence" : text.confidence,
            })

        self._check(dll.ailiaSpeechResetTranscribeState(self._instance))

        return results

    def __del__(self):
        """Destroy the native instance and release the callback registration."""
        instance = self._instance
        if instance:
            # Clear first so a repeated call cannot destroy the handle twice.
            self._instance = None
            dll.ailiaSpeechDestroy(instance)
        handle = self._callback_handle
        if handle is not None:
            self._callback_handle = None
            # Drop the registry entry so the user callback can be collected.
            intermediate_callback_map.pop(handle, None)
493
+
@@ -0,0 +1,71 @@
1
+ Metadata-Version: 2.1
2
+ Name: ailia_speech
3
+ Version: 1.3.0.0
4
+ Summary: ailia AI Speech
5
+ Home-page: https://ailia.jp/
6
+ Author: ax Inc.
7
+ Author-email: contact@axinc.jp
8
+ License: https://ailia.ai/en/license/
9
+ Requires-Python: >3.6
10
+ Description-Content-Type: text/markdown
11
+ Requires-Dist: ailia
12
+ Requires-Dist: ailia-tokenizer
13
+
14
+ # ailia AI Speech Python API
15
+
16
+ !! CAUTION !!
17
+ “ailia” IS NOT OPEN SOURCE SOFTWARE (OSS).
18
+ As long as the user complies with the conditions stated in the [License Document](https://ailia.ai/license/), the user may use the Software free of charge, but the Software is basically paid software.
19
+
20
+ ## About ailia AI Speech
21
+
22
+ ailia Speech is a library to perform speech recognition using AI. It provides a C API for native applications, as well as a C# API well suited for Unity applications. Using ailia Speech, you can easily integrate AI powered speech recognition into your applications.
23
+
24
+ ## Install from pip
25
+
26
+ You can install the ailia SDK free evaluation package with the following command.
27
+
28
+ ```
29
+ pip3 install ailia_speech
30
+ ```
31
+
32
+ ## Install from package
33
+
34
+ You can install the ailia SDK from the package with the following commands.
35
+
36
+ ```
37
+ python3 bootstrap.py
38
+ pip3 install ./
39
+ ```
40
+
41
+ ## Usage
42
+
43
+ ```python
44
+ import ailia
45
+ import ailia_speech
46
+
47
+ import librosa
48
+
49
+ import os
50
+ import urllib.request
51
+
52
+ # Load target audio
53
+ ref_file_path = "demo.wav"
54
+ if not os.path.exists(ref_file_path):
55
+ urllib.request.urlretrieve(
56
+ "https://github.com/axinc-ai/ailia-models/raw/refs/heads/master/audio_processing/whisper/demo.wav",
57
+ "demo.wav"
58
+ )
59
+ audio_waveform, sampling_rate = librosa.load(ref_file_path, mono=True)
60
+
61
+ # Infer
62
+ speech = ailia_speech.Whisper()
63
+ speech.initialize_model(model_path = "./models/", model_type = ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_SMALL)
64
+ recognized_text = speech.transcribe(audio_waveform, sampling_rate)
65
+ print(recognized_text)
66
+ ```
67
+
68
+ ## API specification
69
+
70
+ https://github.com/axinc-ai/ailia-sdk
71
+
@@ -0,0 +1,12 @@
1
+ ailia_speech/LICENSE_AILIA_EN.pdf,sha256=1DzVViPnw1uAS8gJ5a8uN3iZNNR5I1ItIXmezHfUpeM,70149
2
+ ailia_speech/LICENSE_AILIA_JA.pdf,sha256=s628QN47S2bNqIfuSjm2LBf0vIluv2df6MSemn6Ksmw,174134
3
+ ailia_speech/__init__.py,sha256=7XiloklOFiXHRHs_wDWyDW0HHD6obPDklxY327Lmwmc,25333
4
+ ailia_speech/linux/arm64-v8a/libailia_speech.so,sha256=JAOwnBr7lbiMZmPCM99pd4vJQ08ZuXDPpq-FurrXSnE,166096
5
+ ailia_speech/linux/x64/libailia_speech.so,sha256=WbFvA5wKTgS_Zx8ErT7WBKJbzOUexavr4nP4EkLNawQ,171360
6
+ ailia_speech/mac/libailia_speech.dylib,sha256=-JAC40yLslAVMvfh6LhDvP3Zyt3hIT3WZc7wa9-07zU,317112
7
+ ailia_speech/windows/x64/ailia_speech.dll,sha256=WJCOHi0Na4tdMG1RT7dA7yAoWumiGSWeW1vxUtiXDS8,126464
8
+ ailia_speech-1.3.0.0.data/scripts/__init__.py,sha256=7XiloklOFiXHRHs_wDWyDW0HHD6obPDklxY327Lmwmc,25333
9
+ ailia_speech-1.3.0.0.dist-info/METADATA,sha256=xj7S7gD2gsqVQbXiBDaHh1s_Cron4mr5SN80GcPTBFc,1902
10
+ ailia_speech-1.3.0.0.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
11
+ ailia_speech-1.3.0.0.dist-info/top_level.txt,sha256=Ou9XeJ9AvdK8eutw07oosCthftD1tRYzAgNY2BrYhDc,13
12
+ ailia_speech-1.3.0.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (75.1.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ ailia_speech