ailia-speech 1.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,163 @@
1
+ Metadata-Version: 2.4
2
+ Name: ailia_speech
3
+ Version: 1.5.0
4
+ Summary: ailia AI Speech
5
+ Home-page: https://ailia.ai/en/
6
+ Author: ailia Inc.
7
+ Author-email: contact@ailia.ai
8
+ License: https://ailia.ai/en/license/
9
+ Requires-Python: >3.6
10
+ Description-Content-Type: text/markdown
11
+ Requires-Dist: ailia
12
+ Requires-Dist: ailia_tokenizer
13
+ Dynamic: author
14
+ Dynamic: author-email
15
+ Dynamic: description
16
+ Dynamic: description-content-type
17
+ Dynamic: home-page
18
+ Dynamic: license
19
+ Dynamic: requires-dist
20
+ Dynamic: requires-python
21
+ Dynamic: summary
22
+
23
+ # ailia AI Speech Python API
24
+
25
+ !! CAUTION !!
26
+ “ailia” IS NOT OPEN SOURCE SOFTWARE (OSS).
27
+ As long as the user complies with the conditions stated in the [License Document](https://ailia.ai/license/), the user may use the Software free of charge, but the Software is basically paid software.
28
+
29
+ ## About ailia AI Speech
30
+
31
+ ailia AI Speech is a library to perform speech recognition using AI. It provides a C API for native applications, as well as a C# API well suited for Unity applications. Using ailia AI Speech, you can easily integrate AI powered speech recognition into your applications.
32
+
33
+ ## Install from pip
34
+
35
+ You can install the ailia AI Speech free evaluation package with the following command.
36
+
37
+ ```
38
+ pip3 install ailia_speech
39
+ ```
40
+
41
+ ## Install from package
42
+
43
+ You can install the ailia AI Speech from Package with the following command.
44
+
45
+ ```
46
+ python3 bootstrap.py
47
+ pip3 install ./
48
+ ```
49
+
50
+ ## Usage
51
+
52
+ ### Batch mode
53
+
54
+ In batch mode, the entire audio is transcribed at once.
55
+
56
+ ```python
57
+ import ailia_speech
58
+
59
+ import librosa
60
+
61
+ import os
62
+ import urllib.request
63
+
64
+ # Load target audio
65
+ input_file_path = "demo.wav"
66
+ if not os.path.exists(input_file_path):
67
+ urllib.request.urlretrieve(
68
+ "https://github.com/axinc-ai/ailia-models/raw/refs/heads/master/audio_processing/whisper/demo.wav",
69
+ "demo.wav"
70
+ )
71
+ audio_waveform, sampling_rate = librosa.load(input_file_path, mono = True)
72
+
73
+ # Model Initialize
74
+ speech = ailia_speech.Whisper()
75
+ model_type = ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO
76
+
77
+ # When using sensevoice
78
+ #speech = ailia_speech.SenseVoice()
79
+ #model_type = ailia_speech.AILIA_SPEECH_MODEL_TYPE_SENSEVOICE_SMALL
80
+
81
+ # Infer
82
+ speech.initialize_model(model_path = "./models/", model_type = model_type)
83
+ recognized_text = speech.transcribe(audio_waveform, sampling_rate)
84
+ for text in recognized_text:
85
+ print(text)
86
+ ```
87
+
88
+ ### Step mode
89
+
90
+ In step mode, the audio is input in chunks and transcribed sequentially.
91
+
92
+ ```python
93
+ import ailia_speech
94
+
95
+ import librosa
96
+
97
+ import os
98
+ import urllib.request
99
+
100
+ # Load target audio
101
+ input_file_path = "demo.wav"
102
+ if not os.path.exists(input_file_path):
103
+ urllib.request.urlretrieve(
104
+ "https://github.com/axinc-ai/ailia-models/raw/refs/heads/master/audio_processing/whisper/demo.wav",
105
+ "demo.wav"
106
+ )
107
+ audio_waveform, sampling_rate = librosa.load(input_file_path, mono = True)
108
+
109
+ # Infer
110
+ speech = ailia_speech.Whisper()
111
+ speech.initialize_model(model_path = "./models/", model_type = ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO)
112
+ speech.set_silent_threshold(silent_threshold = 0.5, speech_sec = 1.0, no_speech_sec = 0.5)
113
+ for i in range(0, audio_waveform.shape[0], sampling_rate):
114
+ complete = False
115
+ if i + sampling_rate >= audio_waveform.shape[0]:
116
+ complete = True
117
+ recognized_text = speech.transcribe_step(audio_waveform[i:min(audio_waveform.shape[0], i + sampling_rate)], sampling_rate, complete)
118
+ for text in recognized_text:
119
+ print(text)
120
+ ```
121
+
122
+ ### Diarization mode
123
+
124
+ By specifying diarization_type, speaker diarization can be performed. When speaker diarization is enabled, speaker_id becomes valid.
125
+
126
+ ```
127
+ speech.initialize_model(model_path = "./models/", model_type = ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO, diarization_type = ailia_speech.AILIA_SPEECH_DIARIZATION_TYPE_PYANNOTE_AUDIO)
128
+ ```
129
+
130
+ ### Available model types
131
+
132
+ It is possible to select multiple models according to accuracy and speed. LARGE_V3_TURBO is the most recommended.
133
+
134
+ Whisper
135
+
136
+ ```
137
+ ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY
138
+ ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_BASE
139
+ ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_SMALL
140
+ ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_MEDIUM
141
+ ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE
142
+ ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3
143
+ ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO
144
+ ```
145
+
146
+ SenseVoice
147
+
148
+ ```
149
+ ailia_speech.AILIA_SPEECH_MODEL_TYPE_SENSEVOICE_SMALL
150
+ ```
151
+
152
+ ### Available vad versions
153
+
154
+ By default, version "4" of SileroVAD is used. The version can be specified from "4", "5", "6", and "6_2".
155
+
156
+ ```
157
+ speech.initialize_model(model_path = "./models/", vad_type = AILIA_SPEECH_VAD_TYPE_SILERO, vad_version = "6_2")
158
+ ```
159
+
160
+ ## API specification
161
+
162
+ https://github.com/axinc-ai/ailia-sdk
163
+
@@ -0,0 +1,141 @@
1
+ # ailia AI Speech Python API
2
+
3
+ !! CAUTION !!
4
+ “ailia” IS NOT OPEN SOURCE SOFTWARE (OSS).
5
+ As long as the user complies with the conditions stated in the [License Document](https://ailia.ai/license/), the user may use the Software free of charge, but the Software is basically paid software.
6
+
7
+ ## About ailia AI Speech
8
+
9
+ ailia AI Speech is a library to perform speech recognition using AI. It provides a C API for native applications, as well as a C# API well suited for Unity applications. Using ailia AI Speech, you can easily integrate AI powered speech recognition into your applications.
10
+
11
+ ## Install from pip
12
+
13
+ You can install the ailia AI Speech free evaluation package with the following command.
14
+
15
+ ```
16
+ pip3 install ailia_speech
17
+ ```
18
+
19
+ ## Install from package
20
+
21
+ You can install the ailia AI Speech from Package with the following command.
22
+
23
+ ```
24
+ python3 bootstrap.py
25
+ pip3 install ./
26
+ ```
27
+
28
+ ## Usage
29
+
30
+ ### Batch mode
31
+
32
+ In batch mode, the entire audio is transcribed at once.
33
+
34
+ ```python
35
+ import ailia_speech
36
+
37
+ import librosa
38
+
39
+ import os
40
+ import urllib.request
41
+
42
+ # Load target audio
43
+ input_file_path = "demo.wav"
44
+ if not os.path.exists(input_file_path):
45
+ urllib.request.urlretrieve(
46
+ "https://github.com/axinc-ai/ailia-models/raw/refs/heads/master/audio_processing/whisper/demo.wav",
47
+ "demo.wav"
48
+ )
49
+ audio_waveform, sampling_rate = librosa.load(input_file_path, mono = True)
50
+
51
+ # Model Initialize
52
+ speech = ailia_speech.Whisper()
53
+ model_type = ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO
54
+
55
+ # When using sensevoice
56
+ #speech = ailia_speech.SenseVoice()
57
+ #model_type = ailia_speech.AILIA_SPEECH_MODEL_TYPE_SENSEVOICE_SMALL
58
+
59
+ # Infer
60
+ speech.initialize_model(model_path = "./models/", model_type = model_type)
61
+ recognized_text = speech.transcribe(audio_waveform, sampling_rate)
62
+ for text in recognized_text:
63
+ print(text)
64
+ ```
65
+
66
+ ### Step mode
67
+
68
+ In step mode, the audio is input in chunks and transcribed sequentially.
69
+
70
+ ```python
71
+ import ailia_speech
72
+
73
+ import librosa
74
+
75
+ import os
76
+ import urllib.request
77
+
78
+ # Load target audio
79
+ input_file_path = "demo.wav"
80
+ if not os.path.exists(input_file_path):
81
+ urllib.request.urlretrieve(
82
+ "https://github.com/axinc-ai/ailia-models/raw/refs/heads/master/audio_processing/whisper/demo.wav",
83
+ "demo.wav"
84
+ )
85
+ audio_waveform, sampling_rate = librosa.load(input_file_path, mono = True)
86
+
87
+ # Infer
88
+ speech = ailia_speech.Whisper()
89
+ speech.initialize_model(model_path = "./models/", model_type = ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO)
90
+ speech.set_silent_threshold(silent_threshold = 0.5, speech_sec = 1.0, no_speech_sec = 0.5)
91
+ for i in range(0, audio_waveform.shape[0], sampling_rate):
92
+ complete = False
93
+ if i + sampling_rate >= audio_waveform.shape[0]:
94
+ complete = True
95
+ recognized_text = speech.transcribe_step(audio_waveform[i:min(audio_waveform.shape[0], i + sampling_rate)], sampling_rate, complete)
96
+ for text in recognized_text:
97
+ print(text)
98
+ ```
99
+
100
+ ### Diarization mode
101
+
102
+ By specifying diarization_type, speaker diarization can be performed. When speaker diarization is enabled, speaker_id becomes valid.
103
+
104
+ ```
105
+ speech.initialize_model(model_path = "./models/", model_type = ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO, diarization_type = ailia_speech.AILIA_SPEECH_DIARIZATION_TYPE_PYANNOTE_AUDIO)
106
+ ```
107
+
108
+ ### Available model types
109
+
110
+ It is possible to select multiple models according to accuracy and speed. LARGE_V3_TURBO is the most recommended.
111
+
112
+ Whisper
113
+
114
+ ```
115
+ ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY
116
+ ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_BASE
117
+ ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_SMALL
118
+ ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_MEDIUM
119
+ ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE
120
+ ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3
121
+ ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO
122
+ ```
123
+
124
+ SenseVoice
125
+
126
+ ```
127
+ ailia_speech.AILIA_SPEECH_MODEL_TYPE_SENSEVOICE_SMALL
128
+ ```
129
+
130
+ ### Available vad versions
131
+
132
+ By default, version "4" of SileroVAD is used. The version can be specified from "4", "5", "6", and "6_2".
133
+
134
+ ```
135
+ speech.initialize_model(model_path = "./models/", vad_type = AILIA_SPEECH_VAD_TYPE_SILERO, vad_version = "6_2")
136
+ ```
137
+
138
+ ## API specification
139
+
140
+ https://github.com/axinc-ai/ailia-sdk
141
+
@@ -0,0 +1,762 @@
1
+ import ctypes
2
+ import os
3
+ import sys
4
+
5
+ import numpy
6
+ import ailia
7
+ import ailia.audio
8
+ import ailia_tokenizer
9
+
10
+ import urllib.request
11
+ import ssl
12
+ import shutil
13
+ import platform
14
+
15
+ #### dependency check
16
+ if sys.platform == "win32":
17
+ import ctypes
18
+ try:
19
+ for library in ["vcruntime140.dll", "vcruntime140_1.dll", "msvcp140.dll"]:
20
+ ctypes.windll.LoadLibrary(library)
21
+ except:
22
+ print(" WARNING Please install MSVC 2015-2019 runtime from https://docs.microsoft.com/ja-jp/cpp/windows/latest-supported-vc-redist")
23
+
24
+
25
+ #### loading DLL / DYLIB / SO ####
26
+ if sys.platform == "win32":
27
+ dll_platform = "windows/x64"
28
+ dll_name = "ailia_speech.dll"
29
+ load_fn = ctypes.WinDLL
30
+ elif sys.platform == "darwin":
31
+ dll_platform = "mac"
32
+ dll_name = "libailia_speech.dylib"
33
+ load_fn = ctypes.CDLL
34
+ else:
35
+ is_arm = "arm" in platform.machine() or platform.machine() == "aarch64"
36
+ if is_arm:
37
+ if platform.architecture()[0] == "32bit":
38
+ dll_platform = "linux/armeabi-v7a"
39
+ else:
40
+ dll_platform = "linux/arm64-v8a"
41
+ else:
42
+ dll_platform = "linux/x64"
43
+ dll_name = "libailia_speech.so"
44
+ load_fn = ctypes.CDLL
45
+
46
+ dll_found = False
47
+ candidate = ["", str(os.path.dirname(os.path.abspath(__file__))) + str(os.sep), str(os.path.dirname(os.path.abspath(__file__))) + str(os.sep) + dll_platform + str(os.sep)]
48
+ for dir in candidate:
49
+ try:
50
+ dll = load_fn(dir + dll_name)
51
+ dll_found = True
52
+ except:
53
+ pass
54
+ if not dll_found:
55
+ msg = "DLL load failed : \'" + dll_name + "\' is not found"
56
+ raise ImportError(msg)
57
+
58
+ # ==============================================================================
59
+
60
+ from ctypes import *
61
+
62
+ AILIA_SPEECH_STATUS_SUCCESS = ( 0 )
63
+
64
+ AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY = (0)
65
+ AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_BASE = (1)
66
+ AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_SMALL = (2)
67
+ AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_MEDIUM = (3)
68
+ AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE = (4)
69
+ AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3 = (5)
70
+ AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO = (6)
71
+ AILIA_SPEECH_MODEL_TYPE_SENSEVOICE_SMALL = (10)
72
+
73
+ AILIA_SPEECH_TASK_TRANSCRIBE = (0)
74
+ AILIA_SPEECH_TASK_TRANSLATE = (1)
75
+
76
+ AILIA_SPEECH_FLAG_NONE = (0)
77
+ AILIA_SPEECH_FLAG_LIVE = (1)
78
+
79
+ AILIA_SPEECH_VAD_TYPE_SILERO = (0)
80
+
81
+ AILIA_SPEECH_DIARIZATION_TYPE_PYANNOTE_AUDIO = (0)
82
+
83
+ AILIA_SPEECH_API_CALLBACK_VERSION = (6)
84
+
85
+ AILIA_SPEECH_TEXT_VERSION = (2)
86
+ AILIA_SPEECH_SPEAKER_ID_UNKNOWN = (0xFFFFFFFF)
87
+
88
+ AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_FRAME_LEN = CFUNCTYPE(POINTER(c_int), c_int, c_int, c_int, c_int)
89
+ AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_MEL_SPECTROGRAM = CFUNCTYPE((c_int), c_void_p, c_void_p, c_int, c_int, c_int, c_int, c_int, c_int, c_int, c_int, c_float, c_int, c_float, c_float, c_int, c_int, c_int)
90
+
91
+ AILIA_SPEECH_USER_API_AILIA_TOKENIZER_CREATE = CFUNCTYPE((c_int), POINTER(c_void_p) , c_int, c_int)
92
+ AILIA_SPEECH_USER_API_AILIA_TOKENIZER_OPEN_MODEL_FILE_A = CFUNCTYPE((c_int), c_void_p , c_char_p)
93
+ AILIA_SPEECH_USER_API_AILIA_TOKENIZER_OPEN_MODEL_FILE_W = CFUNCTYPE((c_int), c_void_p , c_wchar)
94
+ AILIA_SPEECH_USER_API_AILIA_TOKENIZER_ENCODE = CFUNCTYPE((c_int), c_void_p , c_char_p)
95
+ AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TOKEN_COUNT = CFUNCTYPE((c_int), c_void_p , POINTER(c_uint))
96
+ AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TOKENS = CFUNCTYPE((c_int), c_void_p , POINTER(c_int) , c_uint)
97
+ AILIA_SPEECH_USER_API_AILIA_TOKENIZER_DECODE = CFUNCTYPE((c_int), c_void_p , POINTER(c_int), c_uint)
98
+ AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TEXT_LENGTH = CFUNCTYPE((c_int), c_void_p , POINTER(c_uint))
99
+ AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TEXT = CFUNCTYPE((c_int), c_void_p , c_char_p , c_uint)
100
+ AILIA_SPEECH_USER_API_AILIA_TOKENIZER_DESTROY = CFUNCTYPE((c_int), c_void_p)
101
+ AILIA_SPEECH_USER_API_AILIA_TOKENIZER_UTF8_TO_UTF32 = CFUNCTYPE((c_int), POINTER(c_uint) , POINTER(c_uint) , c_char_p , c_uint)
102
+ AILIA_SPEECH_USER_API_AILIA_TOKENIZER_UTF32_TO_UTF8 = CFUNCTYPE((c_int), c_char_p, POINTER(c_uint) , c_uint)
103
+
104
+ AILIA_SPEECH_USER_API_AILIA_AUDIO_RESAMPLE = CFUNCTYPE((c_int), POINTER(c_float), POINTER(c_float), c_int, c_int, c_int, c_int)
105
+ AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_RESAMPLE_LEN = CFUNCTYPE((c_int), POINTER(c_int), c_int, c_int, c_int)
106
+ AILIA_SPEECH_USER_API_AILIA_CREATE = CFUNCTYPE((c_int), POINTER(c_void_p), c_int, c_int)
107
+ AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_FILE_A = CFUNCTYPE((c_int), c_void_p, c_char_p)
108
+ AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_FILE_W = CFUNCTYPE((c_int), c_void_p, POINTER(c_wchar))
109
+ AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_MEM = CFUNCTYPE((c_int), c_void_p, POINTER(c_byte), c_uint)
110
+ AILIA_SPEECH_USER_API_AILIA_SET_MEMORY_MODE = CFUNCTYPE((c_int), c_void_p, c_uint)
111
+ AILIA_SPEECH_USER_API_AILIA_DESTROY = CFUNCTYPE((None), c_void_p)
112
+ AILIA_SPEECH_USER_API_AILIA_UPDATE = CFUNCTYPE((c_int), c_void_p)
113
+ AILIA_SPEECH_USER_API_AILIA_GET_BLOB_INDEX_BY_INPUT_INDEX = CFUNCTYPE((c_int), c_void_p, POINTER(c_uint), c_uint)
114
+ AILIA_SPEECH_USER_API_AILIA_GET_BLOB_INDEX_BY_OUTPUT_INDEX = CFUNCTYPE((c_int), c_void_p, POINTER(c_uint), c_uint)
115
+ AILIA_SPEECH_USER_API_AILIA_GET_BLOB_DATA = CFUNCTYPE((c_int), c_void_p, POINTER(c_float), c_uint, c_uint)
116
+ AILIA_SPEECH_USER_API_AILIA_SET_INPUT_BLOB_DATA = CFUNCTYPE((c_int), c_void_p, POINTER(c_float), c_uint, c_uint)
117
+ AILIA_SPEECH_USER_API_AILIA_SET_INPUT_BLOB_SHAPE = CFUNCTYPE((c_int), c_void_p, c_void_p, c_uint, c_uint)
118
+ AILIA_SPEECH_USER_API_AILIA_GET_BLOB_SHAPE = CFUNCTYPE((c_int), c_void_p, c_void_p, c_uint, c_uint)
119
+ AILIA_SPEECH_USER_API_AILIA_GET_ERROR_DETAIL = CFUNCTYPE((c_char_p), c_void_p)
120
+
121
+ AILIA_SPEECH_USER_API_AILIA_COPY_BLOB_DATA = CFUNCTYPE((c_int), c_void_p, c_uint, c_void_p, c_uint)
122
+ AILIA_SPEECH_USER_API_AILIA_GET_ENVIRONMENT = CFUNCTYPE((c_int), POINTER(c_void_p), c_uint, c_uint)
123
+
124
+
125
+ class struct__AILIASpeechApiCallback(Structure):
126
+ pass
127
+
128
+ struct__AILIASpeechApiCallback.__slots__ = [
129
+ 'ailiaAudioGetFrameLen',
130
+ 'ailiaAudioGetMelSpectrogram',
131
+ 'ailiaAudioResample',
132
+ 'ailiaAudioGetResampleLen',
133
+
134
+ 'ailiaTokenizerCreate',
135
+ 'ailiaTokenizerOpenModelFileA',
136
+ 'ailiaTokenizerOpenModelFileW',
137
+ 'ailiaTokenizerEncode',
138
+ 'ailiaTokenizerGetTokenCount',
139
+ 'ailiaTokenizerGetTokens',
140
+ 'ailiaTokenizerDecode',
141
+ 'ailiaTokenizerGetTextLength',
142
+ 'ailiaTokenizerGetText',
143
+ 'ailiaTokenizerDestroy',
144
+ 'ailiaTokenizerUtf8ToUtf32',
145
+ 'ailiaTokenizerUtf32ToUtf8',
146
+
147
+ 'ailiaCreate',
148
+ 'ailiaOpenWeightFileA',
149
+ 'ailiaOpenWeightFileW',
150
+ 'ailiaOpenWeightMem',
151
+ 'ailiaSetMemoryMode',
152
+ 'ailiaDestroy',
153
+ 'ailiaUpdate',
154
+ 'ailiaGetBlobIndexByInputIndex',
155
+ 'ailiaGetBlobIndexByOutputIndex',
156
+ 'ailiaGetBlobData',
157
+ 'ailiaSetInputBlobData',
158
+ 'ailiaSetInputBlobShape',
159
+ 'ailiaGetBlobShape',
160
+ 'ailiaGetErrorDetail',
161
+ 'ailiaCopyBlobData',
162
+ 'ailiaGetEnvironment',
163
+ ]
164
+ struct__AILIASpeechApiCallback._fields_ = [
165
+ ('ailiaAudioGetFrameLen', AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_FRAME_LEN),
166
+ ('ailiaAudioGetMelSpectrogram', AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_MEL_SPECTROGRAM),
167
+ ('ailiaAudioResample', AILIA_SPEECH_USER_API_AILIA_AUDIO_RESAMPLE),
168
+ ('ailiaAudioGetResampleLen', AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_RESAMPLE_LEN),
169
+
170
+ ('ailiaTokenizerCreate', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_CREATE),
171
+ ('ailiaTokenizerOpenModelFileA', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_OPEN_MODEL_FILE_A),
172
+ ('ailiaTokenizerOpenModelFileW', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_OPEN_MODEL_FILE_W),
173
+ ('ailiaTokenizerEncode', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_ENCODE),
174
+ ('ailiaTokenizerGetTokenCount', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TOKEN_COUNT),
175
+ ('ailiaTokenizerGetTokens', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TOKENS),
176
+ ('ailiaTokenizerDecode', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_DECODE),
177
+ ('ailiaTokenizerGetTextLength', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TEXT_LENGTH),
178
+ ('ailiaTokenizerGetText', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TEXT),
179
+ ('ailiaTokenizerDestroy', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_DESTROY),
180
+ ('ailiaTokenizerUtf8ToUtf32', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_UTF8_TO_UTF32),
181
+ ('ailiaTokenizerUtf32ToUtf8', AILIA_SPEECH_USER_API_AILIA_TOKENIZER_UTF32_TO_UTF8),
182
+
183
+ ('ailiaCreate', AILIA_SPEECH_USER_API_AILIA_CREATE),
184
+ ('ailiaOpenWeightFileA', AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_FILE_A),
185
+ ('ailiaOpenWeightFileW', AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_FILE_W),
186
+ ('ailiaOpenWeightMem', AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_MEM),
187
+ ('ailiaSetMemoryMode', AILIA_SPEECH_USER_API_AILIA_SET_MEMORY_MODE),
188
+ ('ailiaDestroy', AILIA_SPEECH_USER_API_AILIA_DESTROY),
189
+ ('ailiaUpdate', AILIA_SPEECH_USER_API_AILIA_UPDATE),
190
+ ('ailiaGetBlobIndexByInputIndex', AILIA_SPEECH_USER_API_AILIA_GET_BLOB_INDEX_BY_INPUT_INDEX),
191
+ ('ailiaGetBlobIndexByOutputIndex', AILIA_SPEECH_USER_API_AILIA_GET_BLOB_INDEX_BY_OUTPUT_INDEX),
192
+ ('ailiaGetBlobData', AILIA_SPEECH_USER_API_AILIA_GET_BLOB_DATA),
193
+ ('ailiaSetInputBlobData', AILIA_SPEECH_USER_API_AILIA_SET_INPUT_BLOB_DATA),
194
+ ('ailiaSetInputBlobShape', AILIA_SPEECH_USER_API_AILIA_SET_INPUT_BLOB_SHAPE),
195
+ ('ailiaGetBlobShape', AILIA_SPEECH_USER_API_AILIA_GET_BLOB_SHAPE),
196
+ ('ailiaGetErrorDetail', AILIA_SPEECH_USER_API_AILIA_GET_ERROR_DETAIL),
197
+ ('ailiaCopyBlobData', AILIA_SPEECH_USER_API_AILIA_COPY_BLOB_DATA),
198
+ ('ailiaGetEnvironment', AILIA_SPEECH_USER_API_AILIA_GET_ENVIRONMENT),
199
+ ]
200
+
201
+ AILIASpeechApiCallback = struct__AILIASpeechApiCallback
202
+
203
+ # ==============================================================================
204
+
205
+ dll.ailiaSpeechCreate.restype = c_int
206
+ dll.ailiaSpeechCreate.argtypes = (POINTER(c_void_p), c_int32, c_int32, c_int32, c_int32, c_int32, AILIASpeechApiCallback, c_int32)
207
+
208
+ dll.ailiaSpeechDestroy.restype = None
209
+ dll.ailiaSpeechDestroy.argtypes = (c_void_p, )
210
+
211
+ dll.ailiaSpeechOpenModelFileA.restype = c_int
212
+ dll.ailiaSpeechOpenModelFileA.argtypes = (c_void_p, c_char_p, c_char_p, c_int32)
213
+
214
+ dll.ailiaSpeechOpenModelFileW.restype = c_int
215
+ dll.ailiaSpeechOpenModelFileW.argtypes = (c_void_p, c_wchar_p, c_wchar_p, c_int32)
216
+
217
+ dll.ailiaSpeechOpenVadFileA.restype = c_int
218
+ dll.ailiaSpeechOpenVadFileA.argtypes = (c_void_p, c_char_p, c_int32)
219
+
220
+ dll.ailiaSpeechOpenVadFileW.restype = c_int
221
+ dll.ailiaSpeechOpenVadFileW.argtypes = (c_void_p, c_wchar_p, c_int32)
222
+
223
+ dll.ailiaSpeechOpenDiarizationFileA.restype = c_int
224
+ dll.ailiaSpeechOpenDiarizationFileA.argtypes = (c_void_p, c_char_p, c_char_p, c_int32)
225
+
226
+ dll.ailiaSpeechOpenDiarizationFileW.restype = c_int
227
+ dll.ailiaSpeechOpenDiarizationFileW.argtypes = (c_void_p, c_wchar_p, c_wchar_p, c_int32)
228
+
229
+ dll.ailiaSpeechPushInputData.restype = c_int
230
+ dll.ailiaSpeechPushInputData.argtypes = (c_void_p, numpy.ctypeslib.ndpointer(
231
+ dtype=numpy.float32, flags='CONTIGUOUS'
232
+ ), # src
233
+ ctypes.c_uint,
234
+ ctypes.c_uint,
235
+ ctypes.c_uint)
236
+
237
+ dll.ailiaSpeechFinalizeInputData.restype = c_int
238
+ dll.ailiaSpeechFinalizeInputData.argtypes = (c_void_p, )
239
+
240
+ dll.ailiaSpeechBuffered.restype = c_int
241
+ dll.ailiaSpeechBuffered.argtypes = (c_void_p, POINTER(ctypes.c_uint))
242
+
243
+ dll.ailiaSpeechComplete.restype = c_int
244
+ dll.ailiaSpeechComplete.argtypes = (c_void_p, POINTER(ctypes.c_uint))
245
+
246
+ dll.ailiaSpeechTranscribe.restype = c_int
247
+ dll.ailiaSpeechTranscribe.argtypes = (c_void_p, )
248
+
249
+ dll.ailiaSpeechGetTextCount.restype = c_int
250
+ dll.ailiaSpeechGetTextCount.argtypes = (c_void_p, POINTER(ctypes.c_uint))
251
+
252
+ class AILIASpeechText(ctypes.Structure):
253
+ _fields_ = [
254
+ ("text", ctypes.c_char_p),
255
+ ("time_stamp_begin", ctypes.c_float),
256
+ ("time_stamp_end", ctypes.c_float),
257
+ ("speaker_id", ctypes.c_uint),
258
+ ("language", ctypes.c_char_p),
259
+ ("confidence", ctypes.c_float)]
260
+
261
+ dll.ailiaSpeechGetText.restype = c_int
262
+ dll.ailiaSpeechGetText.argtypes = (c_void_p, POINTER(AILIASpeechText), ctypes.c_uint, ctypes.c_uint)
263
+
264
+ dll.ailiaSpeechResetTranscribeState.restype = c_int
265
+ dll.ailiaSpeechResetTranscribeState.argtypes = (c_void_p, )
266
+
267
+ AILIA_SPEECH_USER_API_INTERMEDIATE_CALLBACK = CFUNCTYPE((c_int), c_int64, c_char_p)
268
+
269
+ dll.ailiaSpeechSetIntermediateCallback.restype = c_int
270
+ dll.ailiaSpeechSetIntermediateCallback.argtypes = (c_void_p, AILIA_SPEECH_USER_API_INTERMEDIATE_CALLBACK, c_int64)
271
+
272
+ dll.ailiaSpeechSetLanguage.restype = c_int
273
+ dll.ailiaSpeechSetLanguage.argtypes = (c_void_p, c_char_p)
274
+
275
+ dll.ailiaSpeechSetSilentThreshold.restype = c_int
276
+ dll.ailiaSpeechSetSilentThreshold.argtypes = (c_void_p, c_float, c_float, c_float)
277
+
278
+ # ==============================================================================
279
+ # model download
280
+ # ==============================================================================
281
+
282
def progress_print(block_count, block_size, total_size):
    """Report hook for urllib.request.urlretrieve: render a one-line progress bar.

    Parameters
    ----------
    block_count : int
        Number of blocks transferred so far.
    block_size : int
        Size of each block in bytes.
    total_size : int
        Total size of the download in bytes. urllib passes -1 (or 0)
        when the server does not report a content length.
    """
    if total_size <= 0:
        # Unknown total size (urllib reports -1): a percentage cannot be
        # computed, and the original code would divide by zero / go negative.
        return
    percentage = 100.0 * block_count * block_size / total_size
    if percentage > 100:
        # The final block may overshoot the real size; clamp for display.
        percentage = 100
    max_bar = 50
    bar_num = int(percentage / (100 / max_bar))
    progress_element = '=' * bar_num
    if bar_num != max_bar:
        progress_element += '>'
    bar_fill = ' '  # fill the blanks
    bar = progress_element.ljust(max_bar, bar_fill)
    total_size_kb = total_size / 1024
    print(f'[{bar} {percentage:.2f}% ( {total_size_kb:.0f}KB )]', end='\r')
296
+
297
def urlretrieve(remote_path, weight_path, progress_print):
    """Download remote_path to weight_path, retrying over plain HTTP on SSL errors.

    The file is fetched into "<weight_path>.tmp" first and moved into place only
    after the download completes, so an interrupted transfer never leaves a
    truncated file at weight_path.

    Parameters
    ----------
    remote_path : str
        URL to download.
    weight_path : str
        Destination file path.
    progress_print : callable
        urllib-style report hook (block_count, block_size, total_size).
    """
    temp_path = weight_path + ".tmp"
    try:
        #raise ssl.SSLError # test
        urllib.request.urlretrieve(
            remote_path,
            temp_path,
            progress_print,
        )
    except ssl.SSLError:
        print('SSLError detected, so try to download without ssl')
        # Replace only the scheme: a bare replace("https", "http") would also
        # mangle URLs that happen to contain "https" later in the path.
        remote_path = remote_path.replace("https://", "http://", 1)
        urllib.request.urlretrieve(
            remote_path,
            temp_path,
            progress_print,
        )
    shutil.move(temp_path, weight_path)
315
+
316
def check_and_download_file(file_path, remote_path):
    """Fetch file_path from remote_path unless it already exists locally.

    The remote URL is formed by appending the file's base name to remote_path.
    """
    if os.path.exists(file_path):
        return
    print('Downloading %s...' % file_path)
    remote_url = remote_path + os.path.basename(file_path)
    urlretrieve(remote_url, file_path, progress_print)
320
+
321
+ # ==============================================================================
322
+ # base model class
323
+ # ==============================================================================
324
+
325
# Registry mapping an integer handle to a user-supplied Python callback.
# The native layer can only hand back an integer, so each model instance
# registers its callback here under a unique counter value.
intermediate_callback_cnt = 0
intermediate_callback_map = {}

def intermediate_callback(handle, text):
    """ctypes trampoline: forward intermediate UTF-8 text to the registered callback.

    Returns 0 (success) to the native caller.
    """
    user_callback = intermediate_callback_map[handle]
    user_callback(text.decode())
    return 0
331
+
332
class AiliaSpeechError(RuntimeError):
    """Raised when a native ailia Speech API call returns a non-success status.

    Attributes
    ----------
    code : int
        Raw status code returned by the native library.
    """

    def __init__(self, message, code):
        text = f"{message} code:{code}"
        super().__init__(text)
        self.code = code
336
+
337
class AiliaSpeechModel:
    """Base wrapper around a native ailia Speech instance driven through ctypes."""

    _api_callback = None  # AILIASpeechApiCallback table; kept referenced to prevent GC
    _instance = None      # ctypes.c_void_p handle to the native speech instance
    _c_callback = None    # ctypes wrapper for the intermediate-result callback
341
+
342
    def __init__(self, env_id = -1, num_thread = 0, memory_mode = 11, task = AILIA_SPEECH_TASK_TRANSCRIBE, flags = AILIA_SPEECH_FLAG_NONE, callback = None):
        """ constructor of ailia Speech model instance.

        Parameters
        ----------
        env_id : int, optional, default:ENVIRONMENT_AUTO(-1)
            environment id of ailia execution.
            To retrieve env_id value, use
            ailia.get_environment_count() / ailia.get_environment() pair
            or
            ailia.get_gpu_environment_id() .
        num_thread : int, optional, default: MULTITHREAD_AUTO(0)
            number of threads.
            valid values:
            MULTITHREAD_AUTO=0 [means system's logical processor count],
            1 to 32.
        memory_mode : int, optional, default: 11 (reuse interstage)
            memory management mode of ailia execution.
            To retrieve memory_mode value, use ailia.get_memory_mode() .
        task : int, optional, default: AILIA_SPEECH_TASK_TRANSCRIBE
            AILIA_SPEECH_TASK_TRANSCRIBE or AILIA_SPEECH_TASK_TRANSLATE
        flags : int, optional, default: AILIA_SPEECH_FLAG_NONE
            Reserved
        callback : func or None, optional, default: None
            Callback for receiving intermediate result text .
            Example
                def f_callback(text):
                    print(text)
        """
        self._instance = ctypes.c_void_p(None)
        # Build the AILIASpeechApiCallback function-pointer table before the
        # native instance is created; it is stored on self._api_callback.
        self._create_callback()
        self._check(dll.ailiaSpeechCreate(cast(pointer(self._instance), POINTER(c_void_p)), ctypes.c_int32(env_id), ctypes.c_int32(num_thread), ctypes.c_int32(memory_mode), ctypes.c_int32(task), ctypes.c_int32(flags), self._api_callback, ctypes.c_int32(AILIA_SPEECH_API_CALLBACK_VERSION)))
        if callback is not None:
            # Register the user callback under a unique integer handle; the
            # native layer only passes that integer back to the module-level
            # intermediate_callback trampoline.
            self._c_callback = AILIA_SPEECH_USER_API_INTERMEDIATE_CALLBACK(intermediate_callback)
            global intermediate_callback_cnt
            global intermediate_callback_map
            intermediate_callback_map[intermediate_callback_cnt] = callback
            self._check(dll.ailiaSpeechSetIntermediateCallback(self._instance, self._c_callback, intermediate_callback_cnt))
            intermediate_callback_cnt = intermediate_callback_cnt + 1
381
+
382
+ def _check(self, status):
383
+ if status != AILIA_SPEECH_STATUS_SUCCESS:
384
+ raise AiliaSpeechError(f"ailia speech error", status)
385
+
386
+ def _string_buffer_aw(self, path):
387
+ if sys.platform == "win32":
388
+ return ctypes.create_unicode_buffer(path)
389
+ else:
390
+ return ctypes.create_string_buffer(path.encode("utf-8"))
391
+
392
+ def _string_buffer(self, path):
393
+ return ctypes.create_string_buffer(path.encode("utf-8"))
394
+
395
    def _create_callback(self):
        """Build the AILIASpeechApiCallback table of native function pointers.

        ailia Speech does not link directly against the ailia core, ailia.audio
        and ailia_tokenizer shared libraries; it receives their entry points
        through this callback structure instead. Each field is constructed from
        a (symbol_name, loaded_dll) pair of an already-loaded dependency.

        The finished structure is stored on self._api_callback so the ctypes
        wrappers stay referenced for the lifetime of this object.
        """
        callback = AILIASpeechApiCallback()
        # ailia.audio entry points (framing, mel spectrogram, resampling)
        callback.ailiaAudioGetFrameLen = AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_FRAME_LEN(("ailiaAudioGetFrameLen", ailia.audio.audio_core.dll))
        callback.ailiaAudioGetMelSpectrogram = AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_MEL_SPECTROGRAM(("ailiaAudioGetMelSpectrogram", ailia.audio.audio_core.dll))
        callback.ailiaAudioResample = AILIA_SPEECH_USER_API_AILIA_AUDIO_RESAMPLE(("ailiaAudioResample", ailia.audio.audio_core.dll))
        callback.ailiaAudioGetResampleLen = AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_RESAMPLE_LEN(("ailiaAudioGetResampleLen", ailia.audio.audio_core.dll))

        # ailia_tokenizer entry points (tokenize / detokenize, UTF conversions)
        callback.ailiaTokenizerCreate = AILIA_SPEECH_USER_API_AILIA_TOKENIZER_CREATE(("ailiaTokenizerCreate", ailia_tokenizer.dll))
        callback.ailiaTokenizerOpenModelFileA = AILIA_SPEECH_USER_API_AILIA_TOKENIZER_OPEN_MODEL_FILE_A(("ailiaTokenizerOpenModelFileA", ailia_tokenizer.dll))
        callback.ailiaTokenizerOpenModelFileW = AILIA_SPEECH_USER_API_AILIA_TOKENIZER_OPEN_MODEL_FILE_W(("ailiaTokenizerOpenModelFileW", ailia_tokenizer.dll))
        callback.ailiaTokenizerEncode = AILIA_SPEECH_USER_API_AILIA_TOKENIZER_ENCODE(("ailiaTokenizerEncode", ailia_tokenizer.dll))
        callback.ailiaTokenizerGetTokenCount = AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TOKEN_COUNT(("ailiaTokenizerGetTokenCount", ailia_tokenizer.dll))
        callback.ailiaTokenizerGetTokens = AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TOKENS(("ailiaTokenizerGetTokens", ailia_tokenizer.dll))
        callback.ailiaTokenizerDecode = AILIA_SPEECH_USER_API_AILIA_TOKENIZER_DECODE(("ailiaTokenizerDecode", ailia_tokenizer.dll))
        callback.ailiaTokenizerGetTextLength = AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TEXT_LENGTH(("ailiaTokenizerGetTextLength", ailia_tokenizer.dll))
        callback.ailiaTokenizerGetText = AILIA_SPEECH_USER_API_AILIA_TOKENIZER_GET_TEXT(("ailiaTokenizerGetText", ailia_tokenizer.dll))
        callback.ailiaTokenizerDestroy = AILIA_SPEECH_USER_API_AILIA_TOKENIZER_DESTROY(("ailiaTokenizerDestroy", ailia_tokenizer.dll))
        callback.ailiaTokenizerUtf8ToUtf32 = AILIA_SPEECH_USER_API_AILIA_TOKENIZER_UTF8_TO_UTF32(("ailiaTokenizerUtf8ToUtf32", ailia_tokenizer.dll))
        callback.ailiaTokenizerUtf32ToUtf8 = AILIA_SPEECH_USER_API_AILIA_TOKENIZER_UTF32_TO_UTF8(("ailiaTokenizerUtf32ToUtf8", ailia_tokenizer.dll))

        # ailia core entry points (network create / weights / blob access)
        callback.ailiaCreate = AILIA_SPEECH_USER_API_AILIA_CREATE(("ailiaCreate", ailia.core.dll))
        callback.ailiaOpenWeightFileA = AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_FILE_A(("ailiaOpenWeightFileA", ailia.core.dll))
        callback.ailiaOpenWeightFileW = AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_FILE_W(("ailiaOpenWeightFileW", ailia.core.dll))
        callback.ailiaOpenWeightMem = AILIA_SPEECH_USER_API_AILIA_OPEN_WEIGHT_MEM(("ailiaOpenWeightMem", ailia.core.dll))
        callback.ailiaSetMemoryMode = AILIA_SPEECH_USER_API_AILIA_SET_MEMORY_MODE(("ailiaSetMemoryMode", ailia.core.dll))
        callback.ailiaDestroy = AILIA_SPEECH_USER_API_AILIA_DESTROY(("ailiaDestroy", ailia.core.dll))
        callback.ailiaUpdate = AILIA_SPEECH_USER_API_AILIA_UPDATE(("ailiaUpdate", ailia.core.dll))
        callback.ailiaGetBlobIndexByInputIndex = AILIA_SPEECH_USER_API_AILIA_GET_BLOB_INDEX_BY_INPUT_INDEX(("ailiaGetBlobIndexByInputIndex", ailia.core.dll))
        callback.ailiaGetBlobIndexByOutputIndex = AILIA_SPEECH_USER_API_AILIA_GET_BLOB_INDEX_BY_OUTPUT_INDEX(("ailiaGetBlobIndexByOutputIndex", ailia.core.dll))
        callback.ailiaGetBlobData = AILIA_SPEECH_USER_API_AILIA_GET_BLOB_DATA(("ailiaGetBlobData", ailia.core.dll))
        callback.ailiaSetInputBlobData = AILIA_SPEECH_USER_API_AILIA_SET_INPUT_BLOB_DATA(("ailiaSetInputBlobData", ailia.core.dll))
        callback.ailiaSetInputBlobShape = AILIA_SPEECH_USER_API_AILIA_SET_INPUT_BLOB_SHAPE(("ailiaSetInputBlobShape", ailia.core.dll))
        callback.ailiaGetBlobShape = AILIA_SPEECH_USER_API_AILIA_GET_BLOB_SHAPE(("ailiaGetBlobShape", ailia.core.dll))
        callback.ailiaGetErrorDetail = AILIA_SPEECH_USER_API_AILIA_GET_ERROR_DETAIL(("ailiaGetErrorDetail", ailia.core.dll))
        callback.ailiaCopyBlobData = AILIA_SPEECH_USER_API_AILIA_COPY_BLOB_DATA(("ailiaCopyBlobData", ailia.core.dll))
        callback.ailiaGetEnvironment = AILIA_SPEECH_USER_API_AILIA_GET_ENVIRONMENT(("ailiaGetEnvironment", ailia.core.dll))

        self._api_callback = callback # prevent GC
433
+
434
def _download_model(self, model_path, encoder_path, decoder_path, encoder_pb_path, decoder_pb_path, vad_type, vad_version, diarization_type, model_type):
    """Fetch every model file required by the chosen configuration.

    Files already present under ``model_path`` are left untouched by
    ``check_and_download_file``; missing ones are pulled from the ailia
    model storage.
    """
    if model_type == AILIA_SPEECH_MODEL_TYPE_SENSEVOICE_SMALL:
        base_url = "https://storage.googleapis.com/ailia-models/sensevoice/"
    else:
        base_url = "https://storage.googleapis.com/ailia-models/whisper/"
    os.makedirs(model_path, exist_ok = True)
    check_and_download_file(model_path + encoder_path, base_url)
    check_and_download_file(model_path + decoder_path, base_url)
    # Weight .pb files exist only for the large model variants.
    for weight_file in (encoder_pb_path, decoder_pb_path):
        if weight_file is not None:
            check_and_download_file(model_path + weight_file, base_url)

    if vad_type is not None:
        vad_url = "https://storage.googleapis.com/ailia-models/silero-vad/"
        check_and_download_file(model_path + self._vad_model_name(vad_version), vad_url)

    if diarization_type is not None:
        diarization_url = "https://storage.googleapis.com/ailia-models/pyannote-audio/"
        check_and_download_file(model_path + "segmentation.onnx", diarization_url)
        check_and_download_file(model_path + "speaker-embedding.onnx", diarization_url)
455
+
456
def _open_model(self, encoder, decoder, model_type):
    """Open the encoder/decoder model files on the native speech instance."""
    encoder_buf = self._string_buffer_aw(encoder)
    decoder_buf = self._string_buffer_aw(decoder)

    # Windows takes wide-character (UTF-16) paths; all other platforms UTF-8.
    if sys.platform == "win32":
        ret = dll.ailiaSpeechOpenModelFileW(self._instance, encoder_buf, decoder_buf, model_type)
    else:
        ret = dll.ailiaSpeechOpenModelFileA(self._instance, encoder_buf, decoder_buf, model_type)
    self._check(ret)
464
+
465
def _open_vad(self, vad, vad_type):
    """Open the voice-activity-detection model on the native speech instance."""
    vad_buf = self._string_buffer_aw(vad)

    # Windows takes wide-character (UTF-16) paths; all other platforms UTF-8.
    if sys.platform == "win32":
        ret = dll.ailiaSpeechOpenVadFileW(self._instance, vad_buf, vad_type)
    else:
        ret = dll.ailiaSpeechOpenVadFileA(self._instance, vad_buf, vad_type)
    self._check(ret)
472
+
473
def _open_diarization(self, segmentation, embedding, diarization_type):
    """Open the speaker-diarization models on the native speech instance."""
    segmentation_buf = self._string_buffer_aw(segmentation)
    embedding_buf = self._string_buffer_aw(embedding)

    # Windows takes wide-character (UTF-16) paths; all other platforms UTF-8.
    if sys.platform == "win32":
        ret = dll.ailiaSpeechOpenDiarizationFileW(self._instance, segmentation_buf, embedding_buf, diarization_type)
    else:
        ret = dll.ailiaSpeechOpenDiarizationFileA(self._instance, segmentation_buf, embedding_buf, diarization_type)
    self._check(ret)
481
+
482
def set_silent_threshold(self, silent_threshold, speech_sec, no_speech_sec):
    """Configure silence detection for early flushing of the audio buffer.

    Once at least *speech_sec* seconds of voiced audio have accumulated and a
    silent stretch of *no_speech_sec* seconds follows, the pending buffer is
    transcribed immediately instead of waiting for a full 30-second window.

    Parameters
    ----------
    silent_threshold : float
        Volume threshold, standard value 0.5
    speech_sec : float
        Speech time, standard value 1.0
    no_speech_sec : float
        No-speech time, standard value 1.0
    """
    ret = dll.ailiaSpeechSetSilentThreshold(self._instance, silent_threshold, speech_sec, no_speech_sec)
    self._check(ret)
495
+
496
def transcribe(self, audio_waveform, sampling_rate, lang = None):
    """ Perform speech recognition. Processes the entire audio at once.

    This is a generator: nothing runs until iteration starts, and results
    are yielded segment by segment as the native engine produces them.

    Parameters
    ----------
    audio_waveform : np.array
        PCM data, either 1D (mono) or 2D with shape (channels, samples)
    sampling_rate : int
        Sampling rate (Hz)
    lang : str, optional, default : None
        Language code (ja, en, etc.) (automatic detection if None)

    Yields
    ----------
    dict with keys:
    text : Speech recognition result text
    time_stamp_begin : Start Time (seconds)
    time_stamp_end : End Time (seconds)
    speaker_id : Speaker ID (None unless diarization is enabled)
    language : Language code
    confidence : Confidence level
    """
    if len(audio_waveform.shape) == 1:
        channels = 1
    elif len(audio_waveform.shape) == 2:
        # Native API expects interleaved samples: (channels, samples) -> flattened (samples, channels).
        channels = audio_waveform.shape[0]
        audio_waveform = numpy.transpose(audio_waveform, (1, 0)).flatten()
    else:
        # fix: dropped the useless f-prefix (no placeholders in the message)
        raise AiliaSpeechError("audio_waveform must be 1 channel or 2 channel", -1)

    audio_waveform = numpy.ascontiguousarray(audio_waveform.astype(numpy.float32))

    if lang is not None:
        self._check(dll.ailiaSpeechSetLanguage(self._instance, self._string_buffer(lang)))

    self._check(dll.ailiaSpeechPushInputData(self._instance, audio_waveform, channels, audio_waveform.shape[0] // channels, sampling_rate))
    self._check(dll.ailiaSpeechFinalizeInputData(self._instance))

    # Drain the whole input: transcribe chunk by chunk until the engine reports completion.
    while True:
        complete = ctypes.c_uint(0)
        self._check(dll.ailiaSpeechComplete(self._instance, ctypes.byref(complete)))
        if complete.value == 1:
            break

        self._check(dll.ailiaSpeechTranscribe(self._instance))

        count = ctypes.c_uint(0)
        self._check(dll.ailiaSpeechGetTextCount(self._instance, ctypes.byref(count)))
        # fix: removed dead local `results = []` that was never read
        for i in range(count.value):
            text = AILIASpeechText()
            self._check(dll.ailiaSpeechGetText(self._instance, ctypes.byref(text), AILIA_SPEECH_TEXT_VERSION, i))
            yield {"text" : text.text.decode(), "time_stamp_begin" : text.time_stamp_begin, "time_stamp_end" : text.time_stamp_end, "speaker_id" : None if text.speaker_id == AILIA_SPEECH_SPEAKER_ID_UNKNOWN else text.speaker_id, "language" : text.language.decode(), "confidence" : text.confidence}

    self._check(dll.ailiaSpeechResetTranscribeState(self._instance))
550
+
551
def transcribe_step(self, audio_waveform, sampling_rate, complete, lang = None):
    """ Perform speech recognition. Processes the audio sequentially.

    This is a generator: call it for every incoming audio chunk (e.g. per
    microphone callback) and iterate the result. Pass complete=True with the
    final chunk so the remaining buffer is flushed.

    Parameters
    ----------
    audio_waveform : np.array
        PCM data, either 1D (mono) or 2D with shape (channels, samples)
    sampling_rate : int
        Sampling rate (Hz)
    complete : bool
        True if this is the final audio input; flushes the internal buffer.
    lang : str, optional, default : None
        Language code (ja, en, etc.) (automatic detection if None)

    Yields
    ----------
    dict with keys:
    text : Speech recognition result text
    time_stamp_begin : Start Time (seconds)
    time_stamp_end : End Time (seconds)
    speaker_id : Speaker ID (None unless diarization is enabled)
    language : Language code
    confidence : Confidence level
    """
    if len(audio_waveform.shape) == 1:
        channels = 1
    elif len(audio_waveform.shape) == 2:
        # Native API expects interleaved samples: (channels, samples) -> flattened (samples, channels).
        channels = audio_waveform.shape[0]
        audio_waveform = numpy.transpose(audio_waveform, (1, 0)).flatten()
    else:
        # fix: dropped the useless f-prefix (no placeholders in the message)
        raise AiliaSpeechError("audio_waveform must be 1 channel or 2 channel", -1)

    audio_waveform = numpy.ascontiguousarray(audio_waveform.astype(numpy.float32))

    if lang is not None:
        self._check(dll.ailiaSpeechSetLanguage(self._instance, self._string_buffer(lang)))

    self._check(dll.ailiaSpeechPushInputData(self._instance, audio_waveform, channels, audio_waveform.shape[0] // channels, sampling_rate))
    if complete:
        self._check(dll.ailiaSpeechFinalizeInputData(self._instance))

    # Consume whatever is buffered right now; more audio may arrive in later calls.
    while True:
        buffered = ctypes.c_uint(0)
        self._check(dll.ailiaSpeechBuffered(self._instance, ctypes.byref(buffered)))
        if buffered.value == 0:
            break

        self._check(dll.ailiaSpeechTranscribe(self._instance))

        count = ctypes.c_uint(0)
        self._check(dll.ailiaSpeechGetTextCount(self._instance, ctypes.byref(count)))
        # fix: removed dead local `results = []` that was never read
        for i in range(count.value):
            text = AILIASpeechText()
            self._check(dll.ailiaSpeechGetText(self._instance, ctypes.byref(text), AILIA_SPEECH_TEXT_VERSION, i))
            yield {"text" : text.text.decode(), "time_stamp_begin" : text.time_stamp_begin, "time_stamp_end" : text.time_stamp_end, "speaker_id" : None if text.speaker_id == AILIA_SPEECH_SPEAKER_ID_UNKNOWN else text.speaker_id, "language" : text.language.decode(), "confidence" : text.confidence}

    if complete:
        self._check(dll.ailiaSpeechResetTranscribeState(self._instance))
610
+
611
+ def _vad_model_name(self, vad_version):
612
+ if vad_version == "4":
613
+ vad_path = "silero_vad.onnx"
614
+ elif vad_version == "5" or vad_version == "6" or vad_version == "6_2":
615
+ vad_path = "silero_vad_v" + vad_version + ".onnx"
616
+ else:
617
+ raise Exception("Unknown vad_version")
618
+ return vad_path
619
+
620
def __del__(self):
    """Release the native speech instance when the wrapper is collected."""
    # fix: use getattr — __del__ can run on a partially-constructed object
    # (if __init__ raised before _instance was assigned, plain attribute
    # access would raise AttributeError during garbage collection).
    instance = getattr(self, "_instance", None)
    if instance:
        dll.ailiaSpeechDestroy(cast(instance, c_void_p))
623
+
624
+ # ==============================================================================
625
+ # Public class
626
+ # ==============================================================================
627
+
628
class Whisper(AiliaSpeechModel):
    # Whisper-based speech recognition model (multilingual tiny .. large-v3-turbo).

    def __init__(self, env_id = -1, num_thread = 0, memory_mode = 11, task = AILIA_SPEECH_TASK_TRANSCRIBE, flags = AILIA_SPEECH_FLAG_NONE, callback = None):
        # Forwards every option unchanged to the AiliaSpeechModel base class.
        super().__init__(env_id = env_id, num_thread = num_thread, memory_mode = memory_mode, task = task, flags = flags, callback = callback)

    def initialize_model(self, model_path = "./", model_type = AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY, vad_type = AILIA_SPEECH_VAD_TYPE_SILERO, vad_version = "4", diarization_type = None, is_fp16 = False):
        """ Initialize and download the model.

        Parameters
        ----------
        model_path : string, optional, default : "./"
            Destination for saving the model file
        model_type : int, optional, default : AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY
            Type of model. Can be set to AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY, AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_BASE, AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_SMALL, AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_MEDIUM, AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE, AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3 or AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO.
        vad_type : int, optional, default : AILIA_SPEECH_VAD_TYPE_SILERO
            Type of VAD. Can be set to None or AILIA_SPEECH_VAD_TYPE_SILERO.
        vad_version : string, optional, default : "4"
            SileroVAD version; one of "4", "5", "6" or "6_2".
        diarization_type : int, optional, default : None
            Type of diarization. Can be set to None or AILIA_SPEECH_DIARIZATION_TYPE_PYANNOTE_AUDIO.
        is_fp16 : bool, optional, default : False
            Whether to use an FP16 model (available for tiny/base/small/medium/turbo).
        """
        if "time_license" in ailia.get_version():
            # Time-limited license builds require a downloaded license file.
            ailia.check_and_download_license()
        # Map the requested model type (and FP16 preference) to the concrete
        # file names hosted on the ailia model storage. The large variants
        # additionally ship external .pb weight files.
        if model_type == AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY:
            if is_fp16:
                encoder_path = "encoder_tiny_fp16.opt3.onnx"
                decoder_path = "decoder_tiny_fix_kv_cache_fp16.opt3.onnx"
            else:
                encoder_path = "encoder_tiny.opt3.onnx"
                decoder_path = "decoder_tiny_fix_kv_cache.opt3.onnx"
            encoder_pb_path = None
            decoder_pb_path = None
        elif model_type == AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_BASE:
            if is_fp16:
                encoder_path = "encoder_base_fp16.opt3.onnx"
                decoder_path = "decoder_base_fix_kv_cache_fp16.opt3.onnx"
            else:
                encoder_path = "encoder_base.opt3.onnx"
                decoder_path = "decoder_base_fix_kv_cache.opt3.onnx"
            encoder_pb_path = None
            decoder_pb_path = None
        elif model_type == AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_SMALL:
            if is_fp16:
                encoder_path = "encoder_small_fp16.opt3.onnx"
                decoder_path = "decoder_small_fix_kv_cache_fp16.opt3.onnx"
            else:
                encoder_path = "encoder_small.opt3.onnx"
                decoder_path = "decoder_small_fix_kv_cache.opt3.onnx"
            encoder_pb_path = None
            decoder_pb_path = None
        elif model_type == AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_MEDIUM:
            if is_fp16:
                encoder_path = "encoder_medium_fp16.opt3.onnx"
                decoder_path = "decoder_medium_fix_kv_cache_fp16.opt3.onnx"
            else:
                encoder_path = "encoder_medium.opt3.onnx"
                decoder_path = "decoder_medium_fix_kv_cache.opt3.onnx"
            encoder_pb_path = None
            decoder_pb_path = None
        elif model_type == AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE:
            encoder_path = "encoder_large.onnx"
            decoder_path = "decoder_large_fix_kv_cache.onnx"
            encoder_pb_path = "encoder_large_weights.pb"
            decoder_pb_path = "decoder_large_fix_kv_cache_weights.pb"
        elif model_type == AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3:
            encoder_path = "encoder_large_v3.onnx"
            decoder_path = "decoder_large_v3_fix_kv_cache.onnx"
            encoder_pb_path = "encoder_large_v3_weights.pb"
            decoder_pb_path = "decoder_large_v3_fix_kv_cache_weights.pb"
        elif model_type == AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO:
            if is_fp16:
                encoder_path = "encoder_turbo_fp16.opt.onnx"
                decoder_path = "decoder_turbo_fix_kv_cache_fp16.opt.onnx"
                encoder_pb_path = None
            else:
                encoder_path = "encoder_turbo.opt.onnx"
                decoder_path = "decoder_turbo_fix_kv_cache.opt.onnx"
                encoder_pb_path = "encoder_turbo_weights.opt.pb"
            decoder_pb_path = None
            # The turbo model shares the large-v3 architecture as far as the
            # native API is concerned, so open it under that model type.
            model_type = AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3
        else:
            raise Exception("Unknown model type")
        self._download_model(model_path, encoder_path, decoder_path, encoder_pb_path, decoder_pb_path, vad_type, vad_version, diarization_type, model_type)
        self._open_model(model_path + encoder_path, model_path + decoder_path, model_type)
        if vad_type is not None:
            self._open_vad(model_path + self._vad_model_name(vad_version), vad_type)
            # Default silence-flush tuning; callers may override via set_silent_threshold.
            self.set_silent_threshold(0.5, 1.0, 1.0)
        if diarization_type is not None:
            self._open_diarization(model_path + "segmentation.onnx", model_path + "speaker-embedding.onnx", diarization_type)
719
+
720
class SenseVoice(AiliaSpeechModel):
    """SenseVoice Small speech recognition model."""

    def __init__(self, env_id = -1, num_thread = 0, memory_mode = 11, task = AILIA_SPEECH_TASK_TRANSCRIBE, flags = AILIA_SPEECH_FLAG_NONE, callback = None):
        # Forwards every option unchanged to the AiliaSpeechModel base class.
        super().__init__(env_id = env_id, num_thread = num_thread, memory_mode = memory_mode, task = task, flags = flags, callback = callback)

    def initialize_model(self, model_path = "./", model_type = AILIA_SPEECH_MODEL_TYPE_SENSEVOICE_SMALL, vad_type = AILIA_SPEECH_VAD_TYPE_SILERO, vad_version = "4", diarization_type = None, is_fp16 = False):
        """ Initialize and download the model.

        Parameters
        ----------
        model_path : string, optional, default : "./"
            Destination for saving the model file
        model_type : int, optional, default : AILIA_SPEECH_MODEL_TYPE_SENSEVOICE_SMALL
            Type of model. Can be set to AILIA_SPEECH_MODEL_TYPE_SENSEVOICE_SMALL.
        vad_type : int, optional, default : AILIA_SPEECH_VAD_TYPE_SILERO
            Type of VAD. Can be set to None or AILIA_SPEECH_VAD_TYPE_SILERO.
        vad_version : string, optional, default : "4"
            SileroVAD version; one of "4", "5", "6" or "6_2".
        diarization_type : int, optional, default : None
            Type of diarization. Can be set to None or AILIA_SPEECH_DIARIZATION_TYPE_PYANNOTE_AUDIO.
        is_fp16 : bool, optional, default : False
            Whether to use an FP16 model.
        """
        if "time_license" in ailia.get_version():
            # Time-limited license builds require a downloaded license file.
            ailia.check_and_download_license()
        if model_type != AILIA_SPEECH_MODEL_TYPE_SENSEVOICE_SMALL:
            raise Exception("Unknown model type")
        encoder_path = "sensevoice_small_fp16.onnx" if is_fp16 else "sensevoice_small.onnx"
        # For SenseVoice the "decoder" slot carries the tokenizer model file.
        decoder_path = "sensevoice_small.model"
        encoder_pb_path = None
        decoder_pb_path = None
        self._download_model(model_path, encoder_path, decoder_path, encoder_pb_path, decoder_pb_path, vad_type, vad_version, diarization_type, model_type)
        self._open_model(model_path + encoder_path, model_path + decoder_path, model_type)
        if vad_type is not None:
            self._open_vad(model_path + self._vad_model_name(vad_version), vad_type)
            # Default silence-flush tuning; callers may override via set_silent_threshold.
            self.set_silent_threshold(0.5, 1.0, 1.0)
        if diarization_type is not None:
            self._open_diarization(model_path + "segmentation.onnx", model_path + "speaker-embedding.onnx", diarization_type)
@@ -0,0 +1,163 @@
1
+ Metadata-Version: 2.4
2
+ Name: ailia_speech
3
+ Version: 1.5.0
4
+ Summary: ailia AI Speech
5
+ Home-page: https://ailia.ai/en/
6
+ Author: ailia Inc.
7
+ Author-email: contact@ailia.ai
8
+ License: https://ailia.ai/en/license/
9
+ Requires-Python: >3.6
10
+ Description-Content-Type: text/markdown
11
+ Requires-Dist: ailia
12
+ Requires-Dist: ailia_tokenizer
13
+ Dynamic: author
14
+ Dynamic: author-email
15
+ Dynamic: description
16
+ Dynamic: description-content-type
17
+ Dynamic: home-page
18
+ Dynamic: license
19
+ Dynamic: requires-dist
20
+ Dynamic: requires-python
21
+ Dynamic: summary
22
+
23
+ # ailia AI Speech Python API
24
+
25
+ !! CAUTION !!
26
+ “ailia” IS NOT OPEN SOURCE SOFTWARE (OSS).
27
+ As long as the user complies with the conditions stated in the [License Document](https://ailia.ai/license/), the user may use the Software free of charge; however, the Software is fundamentally paid software.
28
+
29
+ ## About ailia AI Speech
30
+
31
+ ailia AI Speech is a library to perform speech recognition using AI. It provides a C API for native applications, as well as a C# API well suited for Unity applications. Using ailia AI Speech, you can easily integrate AI powered speech recognition into your applications.
32
+
33
+ ## Install from pip
34
+
35
+ You can install the ailia AI Speech free evaluation package with the following command.
36
+
37
+ ```
38
+ pip3 install ailia_speech
39
+ ```
40
+
41
+ ## Install from package
42
+
43
+ You can install ailia AI Speech from the downloaded package with the following command.
44
+
45
+ ```
46
+ python3 bootstrap.py
47
+ pip3 install ./
48
+ ```
49
+
50
+ ## Usage
51
+
52
+ ### Batch mode
53
+
54
+ In batch mode, the entire audio is transcribed at once.
55
+
56
+ ```python
57
+ import ailia_speech
58
+
59
+ import librosa
60
+
61
+ import os
62
+ import urllib.request
63
+
64
+ # Load target audio
65
+ input_file_path = "demo.wav"
66
+ if not os.path.exists(input_file_path):
67
+ urllib.request.urlretrieve(
68
+ "https://github.com/axinc-ai/ailia-models/raw/refs/heads/master/audio_processing/whisper/demo.wav",
69
+ "demo.wav"
70
+ )
71
+ audio_waveform, sampling_rate = librosa.load(input_file_path, mono = True)
72
+
73
+ # Model Initialize
74
+ speech = ailia_speech.Whisper()
75
+ model_type = ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO
76
+
77
+ # When using sensevoice
78
+ #speech = ailia_speech.SenseVoice()
79
+ #model_type = ailia_speech.AILIA_SPEECH_MODEL_TYPE_SENSEVOICE_SMALL
80
+
81
+ # Infer
82
+ speech.initialize_model(model_path = "./models/", model_type = model_type)
83
+ recognized_text = speech.transcribe(audio_waveform, sampling_rate)
84
+ for text in recognized_text:
85
+ print(text)
86
+ ```
87
+
88
+ ### Step mode
89
+
90
+ In step mode, the audio is input in chunks and transcribed sequentially.
91
+
92
+ ```python
93
+ import ailia_speech
94
+
95
+ import librosa
96
+
97
+ import os
98
+ import urllib.request
99
+
100
+ # Load target audio
101
+ input_file_path = "demo.wav"
102
+ if not os.path.exists(input_file_path):
103
+ urllib.request.urlretrieve(
104
+ "https://github.com/axinc-ai/ailia-models/raw/refs/heads/master/audio_processing/whisper/demo.wav",
105
+ "demo.wav"
106
+ )
107
+ audio_waveform, sampling_rate = librosa.load(input_file_path, mono = True)
108
+
109
+ # Infer
110
+ speech = ailia_speech.Whisper()
111
+ speech.initialize_model(model_path = "./models/", model_type = ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO)
112
+ speech.set_silent_threshold(silent_threshold = 0.5, speech_sec = 1.0, no_speech_sec = 0.5)
113
+ for i in range(0, audio_waveform.shape[0], sampling_rate):
114
+ complete = False
115
+ if i + sampling_rate >= audio_waveform.shape[0]:
116
+ complete = True
117
+ recognized_text = speech.transcribe_step(audio_waveform[i:min(audio_waveform.shape[0], i + sampling_rate)], sampling_rate, complete)
118
+ for text in recognized_text:
119
+ print(text)
120
+ ```
121
+
122
+ ### Diarization mode
123
+
124
+ By specifying diarization_type, speaker diarization can be performed. When speaker diarization is enabled, speaker_id becomes valid.
125
+
126
+ ```
127
+ speech.initialize_model(model_path = "./models/", model_type = ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO, diarization_type = ailia_speech.AILIA_SPEECH_DIARIZATION_TYPE_PYANNOTE_AUDIO)
128
+ ```
129
+
130
+ ### Available model types
131
+
132
+ It is possible to select multiple models according to accuracy and speed. LARGE_V3_TURBO is the most recommended.
133
+
134
+ Whisper
135
+
136
+ ```
137
+ ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY
138
+ ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_BASE
139
+ ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_SMALL
140
+ ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_MEDIUM
141
+ ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE
142
+ ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3
143
+ ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO
144
+ ```
145
+
146
+ SenseVoice
147
+
148
+ ```
149
+ ailia_speech.AILIA_SPEECH_MODEL_TYPE_SENSEVOICE_SMALL
150
+ ```
151
+
152
+ ### Available vad versions
153
+
154
+ By default, version "4" of SileroVAD is used. The version can be specified from "4", "5", "6", and "6_2".
155
+
156
+ ```
157
+ speech.initialize_model(model_path = "./models/", vad_type = ailia_speech.AILIA_SPEECH_VAD_TYPE_SILERO, vad_version = "6_2")
158
+ ```
159
+
160
+ ## API specification
161
+
162
+ https://github.com/axinc-ai/ailia-sdk
163
+
@@ -0,0 +1,14 @@
1
+ README.md
2
+ setup.py
3
+ ailia_speech/LICENSE_AILIA_EN.pdf
4
+ ailia_speech/LICENSE_AILIA_JA.pdf
5
+ ailia_speech/__init__.py
6
+ ailia_speech.egg-info/PKG-INFO
7
+ ailia_speech.egg-info/SOURCES.txt
8
+ ailia_speech.egg-info/dependency_links.txt
9
+ ailia_speech.egg-info/requires.txt
10
+ ailia_speech.egg-info/top_level.txt
11
+ ailia_speech/linux/arm64-v8a/libailia_speech.so
12
+ ailia_speech/linux/x64/libailia_speech.so
13
+ ailia_speech/mac/libailia_speech.dylib
14
+ ailia_speech/windows/x64/ailia_speech.dll
@@ -0,0 +1,2 @@
1
+ ailia
2
+ ailia_tokenizer
@@ -0,0 +1 @@
1
+ ailia_speech
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,73 @@
1
+ import os
2
+ import sys
3
+ import platform
4
+ import glob
5
+ import shutil
6
+ import platform
7
+
8
+ from setuptools import setup, Extension
9
+ from setuptools import find_packages
10
+
11
+ with open("README.md", encoding="utf-8") as f:
12
+ long_description = f.read()
13
+
14
+ scripts = []
15
+ for f in glob.glob("ailia_speech/*.py"):
16
+ scripts.append(f)
17
+
18
def find_libraries():
    """Collect native libraries and license files shipped as package data.

    Returns paths relative to the ``ailia_speech`` package directory, in the
    form expected by the setuptools ``package_data`` option. Library files
    that are absent on the build machine are simply not listed.
    """
    dll_names = []
    # fix: the loop variable used to be named `platform`, shadowing the
    # imported `platform` module inside this function.
    target_platforms = ["win32", "darwin", "linux_armv7l", "linux_aarch64", "linux_x86_64"]

    for target in target_platforms:
        if target == "win32":
            dll_platform = "windows/x64"
            dll_type = ".dll"
        elif target == "darwin":
            dll_platform = "mac"
            dll_type = ".dylib"
        else:
            if target == "linux_armv7l":
                dll_platform = "linux/armeabi-v7a"
            elif target == "linux_aarch64":
                dll_platform = "linux/arm64-v8a"
            else:
                dll_platform = "linux/x64"
            dll_type = ".so"

        dll_path = "./ailia_speech/" + dll_platform + "/"

        for lib in glob.glob(dll_path + "*" + dll_type):
            # Normalize Windows separators and make the path package-relative.
            lib = lib.replace("\\", "/")
            lib = lib.replace("./ailia_speech/", "./")
            dll_names.append(lib)

    # License documents are always bundled regardless of platform.
    dll_names.append("./LICENSE_AILIA_EN.pdf")
    dll_names.append("./LICENSE_AILIA_JA.pdf")
    dll_names.append("./oss/LICENSE_SILERO_VAD.txt")
    dll_names.append("./oss/LICENSE_SRELL.txt")
    dll_names.append("./oss/LICENSE_WHISPER.txt")

    return dll_names
52
+
53
+ if __name__ == "__main__":
54
+ setup(
55
+ name="ailia_speech",
56
+ scripts=scripts,
57
+ version="1.5.0",
58
+ install_requires=[
59
+ "ailia",
60
+ "ailia_tokenizer",
61
+ ],
62
+ description="ailia AI Speech",
63
+ long_description=long_description,
64
+ long_description_content_type="text/markdown",
65
+ author="ailia Inc.",
66
+ author_email="contact@ailia.ai",
67
+ url="https://ailia.ai/en/",
68
+ license="https://ailia.ai/en/license/",
69
+ packages=find_packages(),
70
+ package_data={"ailia_speech":find_libraries()},
71
+ include_package_data=True,
72
+ python_requires=">3.6",
73
+ )