ailia-speech 1.3.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ailia-speech might be problematic. Click here for more details.

@@ -0,0 +1,71 @@
1
+ Metadata-Version: 2.1
2
+ Name: ailia_speech
3
+ Version: 1.3.0.0
4
+ Summary: ailia AI Speech
5
+ Home-page: https://ailia.jp/
6
+ Author: ax Inc.
7
+ Author-email: contact@axinc.jp
8
+ License: https://ailia.ai/en/license/
9
+ Requires-Python: >3.6
10
+ Description-Content-Type: text/markdown
11
+ Requires-Dist: ailia
12
+ Requires-Dist: ailia_tokenizer
13
+
14
+ # ailia AI Speech Python API
15
+
16
+ !! CAUTION !!
17
+ “ailia” IS NOT OPEN SOURCE SOFTWARE (OSS).
18
+ As long as the user complies with the conditions stated in the [License Document](https://ailia.ai/license/), the user may use the Software free of charge, but the Software is basically paid software.
19
+
20
+ ## About ailia AI Speech
21
+
22
+ ailia Speech is a library to perform speech recognition using AI. It provides a C API for native applications, as well as a C# API well suited for Unity applications. Using ailia Speech, you can easily integrate AI powered speech recognition into your applications.
23
+
24
+ ## Install from pip
25
+
26
+ You can install the ailia SDK free evaluation package with the following command.
27
+
28
+ ```
29
+ pip3 install ailia_speech
30
+ ```
31
+
32
+ ## Install from package
33
+
34
+ You can install the ailia SDK from Package with the following command.
35
+
36
+ ```
37
+ python3 bootstrap.py
38
+ pip3 install ./
39
+ ```
40
+
41
+ ## Usage
42
+
43
+ ```python
44
+ import ailia
45
+ import ailia_speech
46
+
47
+ import librosa
48
+
49
+ import os
50
+ import urllib.request
51
+
52
+ # Load target audio
53
+ ref_file_path = "demo.wav"
54
+ if not os.path.exists(ref_file_path):
55
+ urllib.request.urlretrieve(
56
+ "https://github.com/axinc-ai/ailia-models/raw/refs/heads/master/audio_processing/whisper/demo.wav",
57
+ "demo.wav"
58
+ )
59
+ audio_waveform, sampling_rate = librosa.load(ref_file_path, mono=True)
60
+
61
+ # Infer
62
+ speech = ailia_speech.Whisper()
63
+ speech.initialize_model(model_path = "./models/", model_type = ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_SMALL)
64
+ recognized_text = speech.transcribe(audio_waveform, sampling_rate)
65
+ print(recognized_text)
66
+ ```
67
+
68
+ ## API specification
69
+
70
+ https://github.com/axinc-ai/ailia-sdk
71
+
@@ -0,0 +1,58 @@
1
+ # ailia AI Speech Python API
2
+
3
+ !! CAUTION !!
4
+ “ailia” IS NOT OPEN SOURCE SOFTWARE (OSS).
5
+ As long as the user complies with the conditions stated in the [License Document](https://ailia.ai/license/), the user may use the Software free of charge, but the Software is basically paid software.
6
+
7
+ ## About ailia AI Speech
8
+
9
+ ailia Speech is a library to perform speech recognition using AI. It provides a C API for native applications, as well as a C# API well suited for Unity applications. Using ailia Speech, you can easily integrate AI powered speech recognition into your applications.
10
+
11
+ ## Install from pip
12
+
13
+ You can install the ailia SDK free evaluation package with the following command.
14
+
15
+ ```
16
+ pip3 install ailia_speech
17
+ ```
18
+
19
+ ## Install from package
20
+
21
+ You can install the ailia SDK from Package with the following command.
22
+
23
+ ```
24
+ python3 bootstrap.py
25
+ pip3 install ./
26
+ ```
27
+
28
+ ## Usage
29
+
30
+ ```python
31
+ import ailia
32
+ import ailia_speech
33
+
34
+ import librosa
35
+
36
+ import os
37
+ import urllib.request
38
+
39
+ # Load target audio
40
+ ref_file_path = "demo.wav"
41
+ if not os.path.exists(ref_file_path):
42
+ urllib.request.urlretrieve(
43
+ "https://github.com/axinc-ai/ailia-models/raw/refs/heads/master/audio_processing/whisper/demo.wav",
44
+ "demo.wav"
45
+ )
46
+ audio_waveform, sampling_rate = librosa.load(ref_file_path, mono=True)
47
+
48
+ # Infer
49
+ speech = ailia_speech.Whisper()
50
+ speech.initialize_model(model_path = "./models/", model_type = ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_SMALL)
51
+ recognized_text = speech.transcribe(audio_waveform, sampling_rate)
52
+ print(recognized_text)
53
+ ```
54
+
55
+ ## API specification
56
+
57
+ https://github.com/axinc-ai/ailia-sdk
58
+
@@ -0,0 +1,493 @@
1
+ import ctypes
2
+ import os
3
+ import sys
4
+
5
+ import numpy
6
+ import ailia
7
+ import ailia.audio
8
+ import ailia_tokenizer
9
+
10
+ import urllib.request
11
+ import ssl
12
+ import shutil
13
+ import platform
14
+
15
#### dependency check
# On Windows, probe for the MSVC runtime DLLs up front so that a missing
# runtime produces a readable hint. The probe is best-effort: a failure only
# prints a warning and the import continues.
if sys.platform == "win32":
    try:
        for library in ["vcruntime140.dll", "vcruntime140_1.dll", "msvcp140.dll"]:
            ctypes.windll.LoadLibrary(library)
    except OSError:
        # ctypes reports a library that cannot be loaded as OSError; anything
        # else (e.g. KeyboardInterrupt) should not be swallowed here.
        print(" WARNING Please install MSVC 2015-2019 runtime from https://docs.microsoft.com/ja-jp/cpp/windows/latest-supported-vc-redist")
23
+
24
+
25
#### loading DLL / DYLIB / SO ####
# Select the bundled sub-directory, the library file name and the ctypes
# loader that match the running platform.
if sys.platform == "win32":
    dll_platform = "windows/x64"
    dll_name = "ailia_speech.dll"
    load_fn = ctypes.WinDLL
elif sys.platform == "darwin":
    dll_platform = "mac"
    dll_name = "libailia_speech.dylib"
    load_fn = ctypes.CDLL
else:
    is_arm = "arm" in platform.machine() or platform.machine() == "aarch64"
    if is_arm:
        if platform.architecture()[0] == "32bit":
            dll_platform = "linux/armeabi-v7a"
        else:
            dll_platform = "linux/arm64-v8a"
    else:
        dll_platform = "linux/x64"
    dll_name = "libailia_speech.so"
    load_fn = ctypes.CDLL

# Search locations, in increasing priority: the loader's default search path,
# the package directory, and the platform-specific directory inside the package.
_package_dir = str(os.path.dirname(os.path.abspath(__file__))) + str(os.sep)
dll_found = False
candidate = ["", _package_dir, _package_dir + dll_platform + str(os.sep)]

# The last loadable candidate has always been the one that ends up in ``dll``.
# Walking the list backwards and stopping at the first success selects the
# same library without also loading the lower-priority copies.
for candidate_dir in reversed(candidate):
    try:
        dll = load_fn(candidate_dir + dll_name)
    except OSError:
        # ctypes raises OSError when the file is missing or cannot be loaded.
        continue
    dll_found = True
    break

if not dll_found:
    msg = "DLL load failed : \'" + dll_name + "\' is not found"
    raise ImportError(msg)
57
+
58
+ # ==============================================================================
59
+
60
from ctypes import *

# Status code that every native call returns on success.
AILIA_SPEECH_STATUS_SUCCESS = 0

# Whisper model variants accepted as ``model_type``.
AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY = 0
AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_BASE = 1
AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_SMALL = 2
AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_MEDIUM = 3
AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE = 4
AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3 = 5

# Task selector passed to ailiaSpeechCreate.
AILIA_SPEECH_TASK_TRANSCRIBE = 0
AILIA_SPEECH_TASK_TRANSLATE = 1

# Flags passed to ailiaSpeechCreate.
AILIA_SPEECH_FLAG_NONE = 0
AILIA_SPEECH_FLAG_LIVE = 1

AILIA_SPEECH_VAD_TYPE_SILERO = 0
# Version tags passed to the native library alongside the callback table
# (ailiaSpeechCreate) and the text struct (ailiaSpeechGetText).
AILIA_SPEECH_API_CALLBACK_VERSION = 6

AILIA_SPEECH_TEXT_VERSION = 2

# ------------------------------------------------------------------------------
# Function-pointer prototypes for the entries of the callback table.
# CFUNCTYPE takes the return type first, followed by the argument types.
# ------------------------------------------------------------------------------

# ailia audio
# The frame length is returned through the leading int* argument, exactly like
# ..._AUDIO_GET_RESAMPLE_LEN below; the previous declaration had that pointer
# in the return-type slot and dropped the status return value.
AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_FRAME_LEN = CFUNCTYPE(c_int, POINTER(c_int), c_int, c_int, c_int, c_int)
AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_MEL_SPECTROGRAM = CFUNCTYPE(c_int, c_void_p, c_void_p, c_int, c_int, c_int, c_int, c_int, c_int, c_int, c_int, c_float, c_int, c_float, c_float, c_int, c_int, c_int)

# ailia tokenizer
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_CREATE = CFUNCTYPE(c_int, POINTER(c_void_p), c_int, c_int)
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_OPEN_MODEL_FILE_A = CFUNCTYPE(c_int, c_void_p, c_char_p)
# The wide-character path is a pointer, as in ..._AILIA_OPEN_WEIGHT_FILE_W;
# the previous declaration passed a single c_wchar by value.
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_OPEN_MODEL_FILE_W = CFUNCTYPE(c_int, c_void_p, POINTER(c_wchar))
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_ENCODE = CFUNCTYPE(c_int, c_void_p, c_char_p)
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TOKEN_COUNT = CFUNCTYPE(c_int, c_void_p, POINTER(c_uint))
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TOKENS = CFUNCTYPE(c_int, c_void_p, POINTER(c_int), c_uint)
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_DECODE = CFUNCTYPE(c_int, c_void_p, POINTER(c_int), c_uint)
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TEXT_LENGTH = CFUNCTYPE(c_int, c_void_p, POINTER(c_uint))
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TEXT = CFUNCTYPE(c_int, c_void_p, c_char_p, c_uint)
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_DESTROY = CFUNCTYPE(c_int, c_void_p)
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_UTF8_TO_UTF32 = CFUNCTYPE(c_int, POINTER(c_uint), POINTER(c_uint), c_char_p, c_uint)
AILIA_SPEECH_USER_API_AILIA_TOKENIZER_UTF32_TO_UTF8 = CFUNCTYPE(c_int, c_char_p, POINTER(c_uint), c_uint)

# ailia audio (resampling) and ailia core
AILIA_SPEECH_USER_API_AILIA_AUDIO_RESAMPLE = CFUNCTYPE(c_int, POINTER(c_float), POINTER(c_float), c_int, c_int, c_int, c_int)
AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_RESAMPLE_LEN = CFUNCTYPE(c_int, POINTER(c_int), c_int, c_int, c_int)
AILIA_SPEECH_USER_API_AILIA_CREATE = CFUNCTYPE(c_int, POINTER(c_void_p), c_int, c_int)
AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_FILE_A = CFUNCTYPE(c_int, c_void_p, c_char_p)
AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_FILE_W = CFUNCTYPE(c_int, c_void_p, POINTER(c_wchar))
AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_MEM = CFUNCTYPE(c_int, c_void_p, POINTER(c_byte), c_uint)
AILIA_SPEECH_USER_API_AILIA_SET_MEMORY_MODE = CFUNCTYPE(c_int, c_void_p, c_uint)
AILIA_SPEECH_USER_API_AILIA_DESTROY = CFUNCTYPE(None, c_void_p)
AILIA_SPEECH_USER_API_AILIA_UPDATE = CFUNCTYPE(c_int, c_void_p)
AILIA_SPEECH_USER_API_AILIA_GET_BLOB_INDEX_BY_INPUT_INDEX = CFUNCTYPE(c_int, c_void_p, POINTER(c_uint), c_uint)
AILIA_SPEECH_USER_API_AILIA_GET_BLOB_INDEX_BY_OUTPUT_INDEX = CFUNCTYPE(c_int, c_void_p, POINTER(c_uint), c_uint)
AILIA_SPEECH_USER_API_AILIA_GET_BLOB_DATA = CFUNCTYPE(c_int, c_void_p, POINTER(c_float), c_uint, c_uint)
AILIA_SPEECH_USER_API_AILIA_SET_INPUT_BLOB_DATA = CFUNCTYPE(c_int, c_void_p, POINTER(c_float), c_uint, c_uint)
AILIA_SPEECH_USER_API_AILIA_SET_INPUT_BLOB_SHAPE = CFUNCTYPE(c_int, c_void_p, c_void_p, c_uint, c_uint)
AILIA_SPEECH_USER_API_AILIA_GET_BLOB_SHAPE = CFUNCTYPE(c_int, c_void_p, c_void_p, c_uint, c_uint)
AILIA_SPEECH_USER_API_AILIA_GET_ERROR_DETAIL = CFUNCTYPE(c_char_p, c_void_p)

AILIA_SPEECH_USER_API_AILIA_COPY_BLOB_DATA = CFUNCTYPE(c_int, c_void_p, c_uint, c_void_p, c_uint)
AILIA_SPEECH_USER_API_AILIA_GET_ENVIRONMENT = CFUNCTYPE(c_int, POINTER(c_void_p), c_uint, c_uint)
117
+
118
+
119
class struct__AILIASpeechApiCallback(Structure):
    """Table of function pointers that is passed by value to ``ailiaSpeechCreate``."""


# The order of the entries below is the in-memory order of the struct fields,
# so it must not be rearranged.
struct__AILIASpeechApiCallback._fields_ = [
    # ailia audio
    ('ailiaAudioGetFrameLen', AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_FRAME_LEN),
    ('ailiaAudioGetMelSpectrogram', AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_MEL_SPECTROGRAM),
    ('ailiaAudioResample', AILIA_SPEECH_USER_API_AILIA_AUDIO_RESAMPLE),
    ('ailiaAudioGetResampleLen', AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_RESAMPLE_LEN),

    # ailia tokenizer
    ('ailiaTokenizerCreate', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_CREATE),
    ('ailiaTokenizerOpenModelFileA', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_OPEN_MODEL_FILE_A),
    ('ailiaTokenizerOpenModelFileW', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_OPEN_MODEL_FILE_W),
    ('ailiaTokenizerEncode', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_ENCODE),
    ('ailiaTokenizerGetTokenCount', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TOKEN_COUNT),
    ('ailiaTokenizerGetTokens', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TOKENS),
    ('ailiaTokenizerDecode', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_DECODE),
    ('ailiaTokenizerGetTextLength', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TEXT_LENGTH),
    ('ailiaTokenizerGetText', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TEXT),
    ('ailiaTokenizerDestroy', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_DESTROY),
    ('ailiaTokenizerUtf8ToUtf32', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_UTF8_TO_UTF32),
    ('ailiaTokenizerUtf32ToUtf8', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_UTF32_TO_UTF8),

    # ailia core
    ('ailiaCreate', AILIA_SPEECH_USER_API_AILIA_CREATE),
    ('ailiaOpenWeightFileA', AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_FILE_A),
    ('ailiaOpenWeightFileW', AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_FILE_W),
    ('ailiaOpenWeightMem', AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_MEM),
    ('ailiaSetMemoryMode', AILIA_SPEECH_USER_API_AILIA_SET_MEMORY_MODE),
    ('ailiaDestroy', AILIA_SPEECH_USER_API_AILIA_DESTROY),
    ('ailiaUpdate', AILIA_SPEECH_USER_API_AILIA_UPDATE),
    ('ailiaGetBlobIndexByInputIndex', AILIA_SPEECH_USER_API_AILIA_GET_BLOB_INDEX_BY_INPUT_INDEX),
    ('ailiaGetBlobIndexByOutputIndex', AILIA_SPEECH_USER_API_AILIA_GET_BLOB_INDEX_BY_OUTPUT_INDEX),
    ('ailiaGetBlobData', AILIA_SPEECH_USER_API_AILIA_GET_BLOB_DATA),
    ('ailiaSetInputBlobData', AILIA_SPEECH_USER_API_AILIA_SET_INPUT_BLOB_DATA),
    ('ailiaSetInputBlobShape', AILIA_SPEECH_USER_API_AILIA_SET_INPUT_BLOB_SHAPE),
    ('ailiaGetBlobShape', AILIA_SPEECH_USER_API_AILIA_GET_BLOB_SHAPE),
    ('ailiaGetErrorDetail', AILIA_SPEECH_USER_API_AILIA_GET_ERROR_DETAIL),
    ('ailiaCopyBlobData', AILIA_SPEECH_USER_API_AILIA_COPY_BLOB_DATA),
    ('ailiaGetEnvironment', AILIA_SPEECH_USER_API_AILIA_GET_ENVIRONMENT),
]

# ``__slots__`` lists the same names as ``_fields_``, in the same order.
struct__AILIASpeechApiCallback.__slots__ = [
    field_name for field_name, _prototype in struct__AILIASpeechApiCallback._fields_
]

AILIASpeechApiCallback = struct__AILIASpeechApiCallback
196
+
197
+ # ==============================================================================
198
+
199
# Declare return and argument types of the native entry points so that ctypes
# converts and checks arguments on every call. Each row is
# (symbol name, restype, argtypes).
for _symbol, _restype, _argtypes in (
    ("ailiaSpeechCreate", c_int,
     (POINTER(c_void_p), c_int32, c_int32, c_int32, c_int32, c_int32, AILIASpeechApiCallback, c_int32)),
    ("ailiaSpeechDestroy", None, (c_void_p, )),
    ("ailiaSpeechOpenModelFileA", c_int, (c_void_p, c_char_p, c_char_p, c_int32)),
    ("ailiaSpeechOpenModelFileW", c_int, (c_void_p, c_wchar_p, c_wchar_p, c_int32)),
    ("ailiaSpeechOpenVadFileA", c_int, (c_void_p, c_char_p, c_int32)),
    ("ailiaSpeechOpenVadFileW", c_int, (c_void_p, c_wchar_p, c_int32)),
    ("ailiaSpeechPushInputData", c_int,
     (c_void_p,
      numpy.ctypeslib.ndpointer(dtype=numpy.float32, flags='CONTIGUOUS'),  # src
      ctypes.c_uint,
      ctypes.c_uint,
      ctypes.c_uint)),
    ("ailiaSpeechFinalizeInputData", c_int, (c_void_p, )),
    ("ailiaSpeechBuffered", c_int, (c_void_p, POINTER(ctypes.c_uint))),
    ("ailiaSpeechComplete", c_int, (c_void_p, POINTER(ctypes.c_uint))),
    ("ailiaSpeechTranscribe", c_int, (c_void_p, )),
    ("ailiaSpeechGetTextCount", c_int, (c_void_p, POINTER(ctypes.c_uint))),
):
    _function = getattr(dll, _symbol)
    _function.restype = _restype
    _function.argtypes = _argtypes
# Do not leave the loop helpers behind in the module namespace.
del _symbol, _restype, _argtypes, _function
239
+
240
class AILIASpeechText(ctypes.Structure):
    """One recognition result, filled in by ``ailiaSpeechGetText``.

    The field order is the in-memory layout of the struct and must not be
    rearranged.
    """

    _fields_ = [
        ("text", ctypes.c_char_p),
        ("time_stamp_begin", ctypes.c_float),
        ("time_stamp_end", ctypes.c_float),
        ("person_id", ctypes.c_uint),
        ("language", ctypes.c_char_p),
        ("confidence", ctypes.c_float),
    ]
248
+
249
# Signature of the user-supplied intermediate-result callback: an integer
# handle followed by the text as a C string.
AILIA_SPEECH_USER_API_INTERMEDIATE_CALLBACK = CFUNCTYPE(c_int, c_int64, c_char_p)

# Remaining native entry points; each row is (symbol name, restype, argtypes).
for _symbol, _restype, _argtypes in (
    ("ailiaSpeechGetText", c_int,
     (c_void_p, POINTER(AILIASpeechText), ctypes.c_uint, ctypes.c_uint)),
    ("ailiaSpeechResetTranscribeState", c_int, (c_void_p, )),
    ("ailiaSpeechSetIntermediateCallback", c_int,
     (c_void_p, AILIA_SPEECH_USER_API_INTERMEDIATE_CALLBACK, c_int64)),
    ("ailiaSpeechSetLanguage", c_int, (c_void_p, c_char_p)),
):
    _function = getattr(dll, _symbol)
    _function.restype = _restype
    _function.argtypes = _argtypes
# Do not leave the loop helpers behind in the module namespace.
del _symbol, _restype, _argtypes, _function
262
+
263
+ # ==============================================================================
264
+ # model download
265
+ # ==============================================================================
266
+
267
def progress_print(block_count, block_size, total_size):
    """Print a one-line download progress bar to stdout.

    The parameters follow the ``reporthook`` convention of
    ``urllib.request.urlretrieve``.

    Args:
        block_count: Number of blocks transferred so far.
        block_size: Size of one block in bytes.
        total_size: Total size in bytes. A value of zero or less means the
            size is unknown (``urlretrieve`` reports -1 when the server sends
            no length); in that case only the transferred amount is printed.
    """
    if total_size <= 0:
        # No meaningful percentage can be computed; the original expression
        # would divide by zero or produce a negative percentage here.
        downloaded_kb = block_count * block_size / 1024
        print(f'[ {downloaded_kb:.0f}KB downloaded ]', end='\r')
        return

    # Clamp: the last block usually overshoots the total size.
    percentage = min(100.0, 100.0 * block_count * block_size / total_size)
    max_bar = 50
    bar_num = int(percentage / (100 / max_bar))
    progress_element = '=' * bar_num
    if bar_num != max_bar:
        progress_element += '>'
    bar_fill = ' '  # fill the blanks
    bar = progress_element.ljust(max_bar, bar_fill)
    total_size_kb = total_size / 1024
    # '\r' keeps the cursor on the same line so the next call overwrites it.
    print(f'[{bar} {percentage:.2f}% ( {total_size_kb:.0f}KB )]', end='\r')
281
+
282
def urlretrieve(remote_path, weight_path, progress_print):
    """Download ``remote_path`` to ``weight_path`` via a temporary file.

    The data is first written to ``weight_path + ".tmp"`` and only moved to
    ``weight_path`` after the transfer finished, so an interrupted download
    never leaves a partial file under the final name.

    Args:
        remote_path: URL to download.
        weight_path: Destination file path.
        progress_print: ``reporthook`` callable forwarded to
            ``urllib.request.urlretrieve``.
    """
    temp_path = weight_path + ".tmp"
    try:
        urllib.request.urlretrieve(
            remote_path,
            temp_path,
            progress_print,
        )
    except ssl.SSLError:
        print(f'SSLError detected, so try to download without ssl')
        # SECURITY NOTE(review): the retry below fetches the file over plain
        # HTTP, without transport integrity or authenticity. Kept for
        # compatibility; consider removing it or verifying a checksum.
        # Only the URL scheme is rewritten; a blanket str.replace would also
        # alter any "https" that appears later in the URL.
        if remote_path.startswith("https://"):
            remote_path = "http://" + remote_path[len("https://"):]
        urllib.request.urlretrieve(
            remote_path,
            temp_path,
            progress_print,
        )
    shutil.move(temp_path, weight_path)
300
+
301
def check_and_download_file(file_path, remote_path):
    """Download ``file_path`` from ``remote_path`` unless it already exists.

    Args:
        file_path: Local destination; its base name is appended to
            ``remote_path`` to form the download URL.
        remote_path: URL prefix the file name is appended to.
    """
    if os.path.exists(file_path):
        return
    print('Downloading %s...' % file_path)
    source_url = remote_path + os.path.basename(file_path)
    urlretrieve(source_url, file_path, progress_print)
305
+
306
+ # ==============================================================================
307
+ # base model class
308
+ # ==============================================================================
309
+
310
class AiliaSpeechError(RuntimeError):
    """Error raised by this module; carries a numeric status in ``code``.

    The exception text is ``"<message> code:<code>"``.
    """

    def __init__(self, message, code):
        self.code = code
        text = "{} code:{}".format(message, code)
        super().__init__(text)
314
+
315
class AiliaSpeechModel:
    """Base class holding the native instance handle and the callback table."""

    _api_callback = None
    _instance = None

    def _check(self, status):
        """Raise ``AiliaSpeechError`` unless ``status`` signals success."""
        if status == AILIA_SPEECH_STATUS_SUCCESS:
            return
        raise AiliaSpeechError(f"ailia speech error", status)

    def _string_buffer(self, path):
        """Return ``path`` as a ctypes buffer.

        A unicode buffer is produced on Windows and a UTF-8 encoded byte
        buffer everywhere else.
        """
        if sys.platform != "win32":
            return ctypes.create_string_buffer(path.encode("utf-8"))
        return ctypes.create_unicode_buffer(path)

    def _create_callback(self):
        """Build the callback table and store it in ``self._api_callback``.

        Every struct field is bound to the identically named symbol exported
        by the ailia audio, ailia tokenizer or ailia core library, using the
        prototype declared for that field.
        """
        symbols_by_library = (
            (ailia.audio.audio_core.dll, (
                "ailiaAudioGetFrameLen",
                "ailiaAudioGetMelSpectrogram",
                "ailiaAudioResample",
                "ailiaAudioGetResampleLen",
            )),
            (ailia_tokenizer.dll, (
                "ailiaTokenizerCreate",
                "ailiaTokenizerOpenModelFileA",
                "ailiaTokenizerOpenModelFileW",
                "ailiaTokenizerEncode",
                "ailiaTokenizerGetTokenCount",
                "ailiaTokenizerGetTokens",
                "ailiaTokenizerDecode",
                "ailiaTokenizerGetTextLength",
                "ailiaTokenizerGetText",
                "ailiaTokenizerDestroy",
                "ailiaTokenizerUtf8ToUtf32",
                "ailiaTokenizerUtf32ToUtf8",
            )),
            (ailia.core.dll, (
                "ailiaCreate",
                "ailiaOpenWeightFileA",
                "ailiaOpenWeightFileW",
                "ailiaOpenWeightMem",
                "ailiaSetMemoryMode",
                "ailiaDestroy",
                "ailiaUpdate",
                "ailiaGetBlobIndexByInputIndex",
                "ailiaGetBlobIndexByOutputIndex",
                "ailiaGetBlobData",
                "ailiaSetInputBlobData",
                "ailiaSetInputBlobShape",
                "ailiaGetBlobShape",
                "ailiaGetErrorDetail",
                "ailiaCopyBlobData",
                "ailiaGetEnvironment",
            )),
        )
        # Field name -> function-pointer prototype, as declared on the struct.
        prototypes = dict(AILIASpeechApiCallback._fields_)

        table = AILIASpeechApiCallback()
        for library, symbols in symbols_by_library:
            for symbol in symbols:
                # prototype((name, dll)) resolves the exported function by name.
                setattr(table, symbol, prototypes[symbol]((symbol, library)))

        self._api_callback = table  # prevent GC
367
+
368
+ # ==============================================================================
369
+ # Public class
370
+ # ==============================================================================
371
+
372
# Registry connecting the integer handle given to the native library with the
# Python callable registered by the user.
intermediate_callback_cnt = 0
intermediate_callback_map = {}

def intermediate_callback(handle, text):
    """Forward an intermediate result to the callable registered for ``handle``.

    ``text`` arrives as bytes and is decoded before it is passed on.
    Always returns 0.
    """
    user_callback = intermediate_callback_map[handle]
    user_callback(text.decode())
    return 0
378
+
379
class Whisper(AiliaSpeechModel):
    """Whisper speech recognition on top of the native ailia Speech library."""

    _c_callback = None
    # Key under which the user callback is stored in intermediate_callback_map.
    _callback_handle = None

    # model_type -> (encoder, decoder, encoder weights, decoder weights).
    # The weight entries are None for models without a separate weights file.
    _MODEL_FILES = {
        AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY: (
            "encoder_tiny.opt.onnx", "decoder_tiny_fix_kv_cache.opt2.onnx", None, None),
        AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_BASE: (
            "encoder_base.opt.onnx", "decoder_base_fix_kv_cache.opt2.onnx", None, None),
        AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_SMALL: (
            "encoder_small.opt.onnx", "decoder_small_fix_kv_cache.opt2.onnx", None, None),
        AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_MEDIUM: (
            "encoder_medium.opt.onnx", "decoder_medium_fix_kv_cache.opt2.onnx", None, None),
        AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE: (
            "encoder_large.onnx", "decoder_large_fix_kv_cache.onnx",
            "encoder_large_weights.pb", "decoder_large_fix_kv_cache_weights.pb"),
        AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3: (
            "encoder_large_v3.onnx", "decoder_large_v3_fix_kv_cache.onnx",
            "encoder_large_v3_weights.pb", "decoder_large_v3_fix_kv_cache_weights.pb"),
    }

    def __init__(self, env_id = -1, num_thread = 0, memory_mode = 11, task = AILIA_SPEECH_TASK_TRANSCRIBE, flags = AILIA_SPEECH_FLAG_NONE, callback = None):
        """Create the native speech instance.

        Args:
            env_id, num_thread, memory_mode: Forwarded unchanged to
                ``ailiaSpeechCreate``.
            task: One of the ``AILIA_SPEECH_TASK_*`` constants.
            flags: One of the ``AILIA_SPEECH_FLAG_*`` constants.
            callback: Optional callable that receives intermediate text as
                ``str``.

        Raises:
            AiliaSpeechError: If a native call reports a non-success status.
        """
        self._instance = ctypes.c_void_p(None)
        self._create_callback()
        self._check(dll.ailiaSpeechCreate(ctypes.byref(self._instance), ctypes.c_int32(env_id), ctypes.c_int32(num_thread), ctypes.c_int32(memory_mode), ctypes.c_int32(task), ctypes.c_int32(flags), self._api_callback, ctypes.c_int32(AILIA_SPEECH_API_CALLBACK_VERSION)))
        if callback is not None:
            global intermediate_callback_cnt
            # Keep a reference on self so the C trampoline is not collected
            # while the native library can still call it.
            self._c_callback = AILIA_SPEECH_USER_API_INTERMEDIATE_CALLBACK(intermediate_callback)
            handle = intermediate_callback_cnt
            intermediate_callback_cnt = handle + 1
            intermediate_callback_map[handle] = callback
            # Remembered so that __del__ can release the registry entry.
            self._callback_handle = handle
            self._check(dll.ailiaSpeechSetIntermediateCallback(self._instance, self._c_callback, handle))

    def initialize_model(self, model_path = "./", model_type = AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY):
        """Download the model files if needed and open them.

        Args:
            model_path: Directory that holds (or will receive) the model files.
            model_type: One of the ``AILIA_SPEECH_MODEL_TYPE_WHISPER_*``
                constants.

        Raises:
            AiliaSpeechError: If ``model_type`` is unknown or a native call
                fails.
        """
        try:
            encoder_path, decoder_path, encoder_pb_path, decoder_pb_path = self._MODEL_FILES[model_type]
        except (KeyError, TypeError):
            # Previously an unknown value surfaced as an UnboundLocalError.
            raise AiliaSpeechError(f"unsupported model_type {model_type!r}", -1) from None
        self._download_model(model_path, encoder_path, decoder_path, encoder_pb_path, decoder_pb_path)
        # os.path.join keeps paths with a trailing separator unchanged and
        # also works when the caller omits it.
        self._open_model(os.path.join(model_path, encoder_path), os.path.join(model_path, decoder_path), model_type)
        self._open_vad(os.path.join(model_path, "silero_vad.onnx"), AILIA_SPEECH_VAD_TYPE_SILERO)

    def _download_model(self, model_path, encoder_path, decoder_path, encoder_pb_path, decoder_pb_path):
        """Fetch every model file that is missing below ``model_path``."""
        whisper_url = "https://storage.googleapis.com/ailia-models/whisper/"
        vad_url = "https://storage.googleapis.com/ailia-models/silero-vad/"
        if model_path:
            # An empty path means the current directory; makedirs("") fails.
            os.makedirs(model_path, exist_ok = True)
        for file_name in (encoder_path, decoder_path, encoder_pb_path, decoder_pb_path):
            if file_name is not None:
                check_and_download_file(os.path.join(model_path, file_name), whisper_url)
        check_and_download_file(os.path.join(model_path, "silero_vad.onnx"), vad_url)

    def _open_model(self, encoder, decoder, model_type):
        """Open the encoder/decoder pair through the native library."""
        p1 = self._string_buffer(encoder)
        p2 = self._string_buffer(decoder)

        # _string_buffer yields a wide buffer on Windows, hence the W variant.
        if sys.platform == "win32":
            self._check(dll.ailiaSpeechOpenModelFileW(self._instance, p1, p2, model_type))
        else:
            self._check(dll.ailiaSpeechOpenModelFileA(self._instance, p1, p2, model_type))

    def _open_vad(self, vad, vad_type):
        """Open the voice activity detection model through the native library."""
        p1 = self._string_buffer(vad)

        if sys.platform == "win32":
            self._check(dll.ailiaSpeechOpenVadFileW(self._instance, p1, vad_type))
        else:
            self._check(dll.ailiaSpeechOpenVadFileA(self._instance, p1, vad_type))

    def transcribe(self, audio_waveform, sampling_rate, lang = None):
        """Run recognition on a complete waveform.

        Args:
            audio_waveform: numpy array, either 1-D (one channel) or 2-D, in
                which case ``shape[0]`` is taken as the channel count.
            sampling_rate: Sampling rate forwarded to the native library.
            lang: Optional language string forwarded to
                ``ailiaSpeechSetLanguage``.

        Returns:
            A list of dicts with the keys ``text``, ``time_stamp_begin``,
            ``time_stamp_end``, ``person_id``, ``language`` and
            ``confidence``.

        Raises:
            AiliaSpeechError: If the array has another rank or a native call
                fails.
        """
        if len(audio_waveform.shape) == 1:
            channels = 1
        elif len(audio_waveform.shape) == 2:
            channels = audio_waveform.shape[0]
            # Transpose before flattening so that the samples of the
            # different channels are interleaved.
            audio_waveform = numpy.transpose(audio_waveform, (1, 0)).flatten()
        else:
            raise AiliaSpeechError(f"audio_waveform must be 1 channel or 2 channel", -1)

        audio_waveform = numpy.ascontiguousarray(audio_waveform.astype(numpy.float32))

        if lang is not None:
            # ailiaSpeechSetLanguage is declared with a c_char_p parameter on
            # every platform, so always pass UTF-8 bytes. The wide buffer
            # that _string_buffer returns on Windows is rejected by ctypes.
            self._check(dll.ailiaSpeechSetLanguage(self._instance, lang.encode("utf-8")))

        self._check(dll.ailiaSpeechPushInputData(self._instance, audio_waveform, channels, audio_waveform.shape[0] // channels, sampling_rate))
        self._check(dll.ailiaSpeechFinalizeInputData(self._instance))
        self._check(dll.ailiaSpeechTranscribe(self._instance))

        count = ctypes.c_uint(0)
        self._check(dll.ailiaSpeechGetTextCount(self._instance, ctypes.byref(count)))
        results = []
        for i in range(count.value):
            text = AILIASpeechText()
            self._check(dll.ailiaSpeechGetText(self._instance, ctypes.byref(text), AILIA_SPEECH_TEXT_VERSION, i))
            results.append({"text" : text.text.decode(), "time_stamp_begin" : text.time_stamp_begin, "time_stamp_end" : text.time_stamp_end, "person_id" : text.person_id, "language" : text.language.decode(), "confidence" : text.confidence})

        self._check(dll.ailiaSpeechResetTranscribeState(self._instance))

        return results

    def __del__(self):
        """Destroy the native instance and release the callback registration."""
        if self._instance:
            dll.ailiaSpeechDestroy(self._instance)
            # Guard against a second destroy if __del__ runs again.
            self._instance = None
        if self._callback_handle is not None:
            # Without this the registry keeps every callback alive forever.
            intermediate_callback_map.pop(self._callback_handle, None)
            self._callback_handle = None
493
+
@@ -0,0 +1,71 @@
1
+ Metadata-Version: 2.1
2
+ Name: ailia_speech
3
+ Version: 1.3.0.0
4
+ Summary: ailia AI Speech
5
+ Home-page: https://ailia.jp/
6
+ Author: ax Inc.
7
+ Author-email: contact@axinc.jp
8
+ License: https://ailia.ai/en/license/
9
+ Requires-Python: >3.6
10
+ Description-Content-Type: text/markdown
11
+ Requires-Dist: ailia
12
+ Requires-Dist: ailia_tokenizer
13
+
14
+ # ailia AI Speech Python API
15
+
16
+ !! CAUTION !!
17
+ “ailia” IS NOT OPEN SOURCE SOFTWARE (OSS).
18
+ As long as the user complies with the conditions stated in the [License Document](https://ailia.ai/license/), the user may use the Software free of charge, but the Software is basically paid software.
19
+
20
+ ## About ailia AI Speech
21
+
22
+ ailia Speech is a library to perform speech recognition using AI. It provides a C API for native applications, as well as a C# API well suited for Unity applications. Using ailia Speech, you can easily integrate AI powered speech recognition into your applications.
23
+
24
+ ## Install from pip
25
+
26
+ You can install the ailia SDK free evaluation package with the following command.
27
+
28
+ ```
29
+ pip3 install ailia_speech
30
+ ```
31
+
32
+ ## Install from package
33
+
34
+ You can install the ailia SDK from Package with the following command.
35
+
36
+ ```
37
+ python3 bootstrap.py
38
+ pip3 install ./
39
+ ```
40
+
41
+ ## Usage
42
+
43
+ ```python
44
+ import ailia
45
+ import ailia_speech
46
+
47
+ import librosa
48
+
49
+ import os
50
+ import urllib.request
51
+
52
+ # Load target audio
53
+ ref_file_path = "demo.wav"
54
+ if not os.path.exists(ref_file_path):
55
+ urllib.request.urlretrieve(
56
+ "https://github.com/axinc-ai/ailia-models/raw/refs/heads/master/audio_processing/whisper/demo.wav",
57
+ "demo.wav"
58
+ )
59
+ audio_waveform, sampling_rate = librosa.load(ref_file_path, mono=True)
60
+
61
+ # Infer
62
+ speech = ailia_speech.Whisper()
63
+ speech.initialize_model(model_path = "./models/", model_type = ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_SMALL)
64
+ recognized_text = speech.transcribe(audio_waveform, sampling_rate)
65
+ print(recognized_text)
66
+ ```
67
+
68
+ ## API specification
69
+
70
+ https://github.com/axinc-ai/ailia-sdk
71
+
@@ -0,0 +1,14 @@
1
+ README.md
2
+ setup.py
3
+ ailia_speech/LICENSE_AILIA_EN.pdf
4
+ ailia_speech/LICENSE_AILIA_JA.pdf
5
+ ailia_speech/__init__.py
6
+ ailia_speech.egg-info/PKG-INFO
7
+ ailia_speech.egg-info/SOURCES.txt
8
+ ailia_speech.egg-info/dependency_links.txt
9
+ ailia_speech.egg-info/requires.txt
10
+ ailia_speech.egg-info/top_level.txt
11
+ ailia_speech/linux/arm64-v8a/libailia_speech.so
12
+ ailia_speech/linux/x64/libailia_speech.so
13
+ ailia_speech/mac/libailia_speech.dylib
14
+ ailia_speech/windows/x64/ailia_speech.dll
@@ -0,0 +1,2 @@
1
+ ailia
2
+ ailia_tokenizer
@@ -0,0 +1 @@
1
+ ailia_speech
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,73 @@
1
+ import os
2
+ import sys
3
+ import platform
4
+ import glob
5
+ import shutil
6
+ import platform
7
+
8
+ from setuptools import setup, Extension
9
+ from setuptools import find_packages
10
+
11
# The long description passed to setup() is the README, verbatim.
with open("README.md", encoding="utf-8") as readme_file:
    long_description = readme_file.read()

# Every Python file directly inside the package directory.
scripts = list(glob.glob("ailia_speech/*.py"))
17
+
18
def find_libraries():
    """Collect the ``package_data`` entries for the ``ailia_speech`` package.

    Returns:
        A list of paths relative to the package directory (each starting with
        ``./``): every native library found under the per-platform
        sub-directories of ``./ailia_speech/``, followed by the license files.
    """
    # (sub-directory below ./ailia_speech/, shared-library suffix), listed in
    # the order windows, mac, linux armv7, linux aarch64, linux x86_64.
    library_layout = (
        ("windows/x64", ".dll"),
        ("mac", ".dylib"),
        ("linux/armeabi-v7a", ".so"),
        ("linux/arm64-v8a", ".so"),
        ("linux/x64", ".so"),
    )

    dll_names = []
    for dll_platform, dll_type in library_layout:
        dll_path = "./ailia_speech/" + dll_platform + "/"
        # glob returns entries in arbitrary order; sort for reproducible builds.
        for found in sorted(glob.glob(dll_path + "*" + dll_type)):
            found = found.replace("\\", "/")  # normalise Windows separators
            dll_names.append(found.replace("./ailia_speech/", "./"))

    dll_names.extend([
        "./LICENSE_AILIA_EN.pdf",
        "./LICENSE_AILIA_JA.pdf",
        "./oss/LICENSE_SILERO_VAD.txt",
        "./oss/LICENSE_SRELL.txt",
        "./oss/LICENSE_WHISPER.txt",
    ])

    return dll_names
52
+
53
if __name__ == "__main__":
    # NOTE(review): ``scripts`` receives the package's own *.py files. The
    # setuptools ``scripts`` option is meant for command-line scripts, so this
    # looks unintended -- confirm before changing.
    setup_options = dict(
        # identity
        name="ailia_speech",
        version="1.3.0.0",
        description="ailia AI Speech",
        long_description=long_description,
        long_description_content_type="text/markdown",
        author="ax Inc.",
        author_email="contact@axinc.jp",
        url="https://ailia.jp/",
        license="https://ailia.ai/en/license/",
        # requirements
        python_requires=">3.6",
        install_requires=[
            "ailia",
            "ailia_tokenizer",
        ],
        # contents
        scripts=scripts,
        packages=find_packages(),
        package_data={"ailia_speech":find_libraries()},
        include_package_data=True,
    )
    setup(**setup_options)