ailia-speech 1.3.2.3__tar.gz → 1.4.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ailia-speech might be problematic. Click here for more details.
- {ailia_speech-1.3.2.3/ailia_speech.egg-info → ailia_speech-1.4.0.1}/PKG-INFO +20 -3
- ailia_speech-1.3.2.3/PKG-INFO → ailia_speech-1.4.0.1/README.md +9 -14
- {ailia_speech-1.3.2.3 → ailia_speech-1.4.0.1}/ailia_speech/__init__.py +40 -10
- ailia_speech-1.4.0.1/ailia_speech/linux/arm64-v8a/libailia_speech.so +0 -0
- ailia_speech-1.4.0.1/ailia_speech/linux/x64/libailia_speech.so +0 -0
- ailia_speech-1.4.0.1/ailia_speech/mac/libailia_speech.dylib +0 -0
- ailia_speech-1.4.0.1/ailia_speech/windows/x64/ailia_speech.dll +0 -0
- ailia_speech-1.3.2.3/README.md → ailia_speech-1.4.0.1/ailia_speech.egg-info/PKG-INFO +31 -1
- {ailia_speech-1.3.2.3 → ailia_speech-1.4.0.1}/setup.py +1 -1
- ailia_speech-1.3.2.3/ailia_speech/linux/arm64-v8a/libailia_speech.so +0 -0
- ailia_speech-1.3.2.3/ailia_speech/linux/x64/libailia_speech.so +0 -0
- ailia_speech-1.3.2.3/ailia_speech/mac/libailia_speech.dylib +0 -0
- ailia_speech-1.3.2.3/ailia_speech/windows/x64/ailia_speech.dll +0 -0
- {ailia_speech-1.3.2.3 → ailia_speech-1.4.0.1}/ailia_speech/LICENSE_AILIA_EN.pdf +0 -0
- {ailia_speech-1.3.2.3 → ailia_speech-1.4.0.1}/ailia_speech/LICENSE_AILIA_JA.pdf +0 -0
- {ailia_speech-1.3.2.3 → ailia_speech-1.4.0.1}/ailia_speech.egg-info/SOURCES.txt +0 -0
- {ailia_speech-1.3.2.3 → ailia_speech-1.4.0.1}/ailia_speech.egg-info/dependency_links.txt +0 -0
- {ailia_speech-1.3.2.3 → ailia_speech-1.4.0.1}/ailia_speech.egg-info/requires.txt +0 -0
- {ailia_speech-1.3.2.3 → ailia_speech-1.4.0.1}/ailia_speech.egg-info/top_level.txt +0 -0
- {ailia_speech-1.3.2.3 → ailia_speech-1.4.0.1}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.1
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: ailia_speech
|
|
3
|
-
Version: 1.3.2.3
|
|
3
|
+
Version: 1.4.0.1
|
|
4
4
|
Summary: ailia AI Speech
|
|
5
5
|
Home-page: https://ailia.jp/
|
|
6
6
|
Author: ax Inc.
|
|
@@ -10,6 +10,15 @@ Requires-Python: >3.6
|
|
|
10
10
|
Description-Content-Type: text/markdown
|
|
11
11
|
Requires-Dist: ailia
|
|
12
12
|
Requires-Dist: ailia_tokenizer
|
|
13
|
+
Dynamic: author
|
|
14
|
+
Dynamic: author-email
|
|
15
|
+
Dynamic: description
|
|
16
|
+
Dynamic: description-content-type
|
|
17
|
+
Dynamic: home-page
|
|
18
|
+
Dynamic: license
|
|
19
|
+
Dynamic: requires-dist
|
|
20
|
+
Dynamic: requires-python
|
|
21
|
+
Dynamic: summary
|
|
13
22
|
|
|
14
23
|
# ailia AI Speech Python API
|
|
15
24
|
|
|
@@ -103,13 +112,21 @@ for i in range(0, audio_waveform.shape[0], sampling_rate):
|
|
|
103
112
|
print(text)
|
|
104
113
|
```
|
|
105
114
|
|
|
115
|
+
### Diarization mode
|
|
116
|
+
|
|
117
|
+
By specifying diarization_type, speaker diarization can be performed. When speaker diarization is enabled, speaker_id becomes valid.
|
|
118
|
+
|
|
119
|
+
```
|
|
120
|
+
speech.initialize_model(model_path = "./models/", model_type = ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO, diarization_type = ailia_speech.AILIA_SPEECH_DIARIZATION_TYPE_PYANNOTE_AUDIO)
|
|
121
|
+
```
|
|
122
|
+
|
|
106
123
|
### Available model types
|
|
107
124
|
|
|
108
125
|
It is possible to select multiple models according to accuracy and speed. LARGE_V3_TURBO is the most recommended.
|
|
109
126
|
|
|
110
127
|
```
|
|
111
128
|
ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY
|
|
112
|
-
|
|
129
|
+
ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_BASE
|
|
113
130
|
ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_SMALL
|
|
114
131
|
ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_MEDIUM
|
|
115
132
|
ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE
|
|
@@ -1,16 +1,3 @@
|
|
|
1
|
-
Metadata-Version: 2.1
|
|
2
|
-
Name: ailia_speech
|
|
3
|
-
Version: 1.3.2.3
|
|
4
|
-
Summary: ailia AI Speech
|
|
5
|
-
Home-page: https://ailia.jp/
|
|
6
|
-
Author: ax Inc.
|
|
7
|
-
Author-email: contact@axinc.jp
|
|
8
|
-
License: https://ailia.ai/en/license/
|
|
9
|
-
Requires-Python: >3.6
|
|
10
|
-
Description-Content-Type: text/markdown
|
|
11
|
-
Requires-Dist: ailia
|
|
12
|
-
Requires-Dist: ailia_tokenizer
|
|
13
|
-
|
|
14
1
|
# ailia AI Speech Python API
|
|
15
2
|
|
|
16
3
|
!! CAUTION !!
|
|
@@ -103,13 +90,21 @@ for i in range(0, audio_waveform.shape[0], sampling_rate):
|
|
|
103
90
|
print(text)
|
|
104
91
|
```
|
|
105
92
|
|
|
93
|
+
### Diarization mode
|
|
94
|
+
|
|
95
|
+
By specifying diarization_type, speaker diarization can be performed. When speaker diarization is enabled, speaker_id becomes valid.
|
|
96
|
+
|
|
97
|
+
```
|
|
98
|
+
speech.initialize_model(model_path = "./models/", model_type = ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO, diarization_type = ailia_speech.AILIA_SPEECH_DIARIZATION_TYPE_PYANNOTE_AUDIO)
|
|
99
|
+
```
|
|
100
|
+
|
|
106
101
|
### Available model types
|
|
107
102
|
|
|
108
103
|
It is possible to select multiple models according to accuracy and speed. LARGE_V3_TURBO is the most recommended.
|
|
109
104
|
|
|
110
105
|
```
|
|
111
106
|
ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY
|
|
112
|
-
|
|
107
|
+
ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_BASE
|
|
113
108
|
ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_SMALL
|
|
114
109
|
ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_MEDIUM
|
|
115
110
|
ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE
|
|
@@ -76,9 +76,13 @@ AILIA_SPEECH_FLAG_NONE = (0)
|
|
|
76
76
|
AILIA_SPEECH_FLAG_LIVE = (1)
|
|
77
77
|
|
|
78
78
|
AILIA_SPEECH_VAD_TYPE_SILERO = (0)
|
|
79
|
+
|
|
80
|
+
AILIA_SPEECH_DIARIZATION_TYPE_PYANNOTE_AUDIO = (0)
|
|
81
|
+
|
|
79
82
|
AILIA_SPEECH_API_CALLBACK_VERSION = (6)
|
|
80
83
|
|
|
81
84
|
AILIA_SPEECH_TEXT_VERSION = (2)
|
|
85
|
+
AILIA_SPEECH_SPEAKER_ID_UNKNOWN = (0xFFFFFFFF)
|
|
82
86
|
|
|
83
87
|
AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_FRAME_LEN = CFUNCTYPE(POINTER(c_int), c_int, c_int, c_int, c_int)
|
|
84
88
|
AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_MEL_SPECTROGRAM = CFUNCTYPE((c_int), c_void_p, c_void_p, c_int, c_int, c_int, c_int, c_int, c_int, c_int, c_int, c_float, c_int, c_float, c_float, c_int, c_int, c_int)
|
|
@@ -215,6 +219,12 @@ dll.ailiaSpeechOpenVadFileA.argtypes = (c_void_p, c_char_p, c_int32)
|
|
|
215
219
|
dll.ailiaSpeechOpenVadFileW.restype = c_int
|
|
216
220
|
dll.ailiaSpeechOpenVadFileW.argtypes = (c_void_p, c_wchar_p, c_int32)
|
|
217
221
|
|
|
222
|
+
dll.ailiaSpeechOpenDiarizationFileA.restype = c_int
|
|
223
|
+
dll.ailiaSpeechOpenDiarizationFileA.argtypes = (c_void_p, c_char_p, c_char_p, c_int32)
|
|
224
|
+
|
|
225
|
+
dll.ailiaSpeechOpenDiarizationFileW.restype = c_int
|
|
226
|
+
dll.ailiaSpeechOpenDiarizationFileW.argtypes = (c_void_p, c_wchar_p, c_wchar_p, c_int32)
|
|
227
|
+
|
|
218
228
|
dll.ailiaSpeechPushInputData.restype = c_int
|
|
219
229
|
dll.ailiaSpeechPushInputData.argtypes = (c_void_p, numpy.ctypeslib.ndpointer(
|
|
220
230
|
dtype=numpy.float32, flags='CONTIGUOUS'
|
|
@@ -243,7 +253,7 @@ class AILIASpeechText(ctypes.Structure):
|
|
|
243
253
|
("text", ctypes.c_char_p),
|
|
244
254
|
("time_stamp_begin", ctypes.c_float),
|
|
245
255
|
("time_stamp_end", ctypes.c_float),
|
|
246
|
-
("
|
|
256
|
+
("speaker_id", ctypes.c_uint),
|
|
247
257
|
("language", ctypes.c_char_p),
|
|
248
258
|
("confidence", ctypes.c_float)]
|
|
249
259
|
|
|
@@ -399,7 +409,7 @@ class Whisper(AiliaSpeechModel):
|
|
|
399
409
|
intermediate_callback_cnt = intermediate_callback_cnt + 1
|
|
400
410
|
|
|
401
411
|
|
|
402
|
-
def initialize_model(self, model_path = "./", model_type = AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY):
|
|
412
|
+
def initialize_model(self, model_path = "./", model_type = AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY, vad_type = AILIA_SPEECH_VAD_TYPE_SILERO, diarization_type = None):
|
|
403
413
|
if "time_license" in ailia.get_version():
|
|
404
414
|
ailia.check_and_download_license()
|
|
405
415
|
if model_type == AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY:
|
|
@@ -438,11 +448,15 @@ class Whisper(AiliaSpeechModel):
|
|
|
438
448
|
encoder_pb_path = "encoder_turbo_weights.opt.pb"
|
|
439
449
|
decoder_pb_path = None
|
|
440
450
|
model_type = AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3
|
|
441
|
-
self._download_model(model_path, encoder_path, decoder_path, encoder_pb_path, decoder_pb_path)
|
|
451
|
+
self._download_model(model_path, encoder_path, decoder_path, encoder_pb_path, decoder_pb_path, vad_type, diarization_type)
|
|
442
452
|
self._open_model(model_path + encoder_path, model_path + decoder_path, model_type)
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
453
|
+
if vad_type is not None:
|
|
454
|
+
self._open_vad(model_path + "silero_vad.onnx", vad_type)
|
|
455
|
+
if diarization_type is not None:
|
|
456
|
+
self._open_diarization(model_path + "segmentation.onnx", model_path + "speaker-embedding.onnx", diarization_type)
|
|
457
|
+
|
|
458
|
+
|
|
459
|
+
def _download_model(self, model_path, encoder_path, decoder_path, encoder_pb_path, decoder_pb_path, vad_type, diarization_type):
|
|
446
460
|
REMOTE_PATH = "https://storage.googleapis.com/ailia-models/whisper/"
|
|
447
461
|
os.makedirs(model_path, exist_ok = True)
|
|
448
462
|
check_and_download_file(model_path + encoder_path, REMOTE_PATH)
|
|
@@ -452,8 +466,14 @@ class Whisper(AiliaSpeechModel):
|
|
|
452
466
|
if decoder_pb_path is not None:
|
|
453
467
|
check_and_download_file(model_path + decoder_pb_path, REMOTE_PATH)
|
|
454
468
|
|
|
455
|
-
|
|
456
|
-
|
|
469
|
+
if vad_type is not None:
|
|
470
|
+
REMOTE_PATH = "https://storage.googleapis.com/ailia-models/silero-vad/"
|
|
471
|
+
check_and_download_file(model_path + "silero_vad.onnx", REMOTE_PATH)
|
|
472
|
+
|
|
473
|
+
if diarization_type is not None:
|
|
474
|
+
REMOTE_PATH = "https://storage.googleapis.com/ailia-models/pyannote-audio/"
|
|
475
|
+
check_and_download_file(model_path + "segmentation.onnx", REMOTE_PATH)
|
|
476
|
+
check_and_download_file(model_path + "speaker-embedding.onnx", REMOTE_PATH)
|
|
457
477
|
|
|
458
478
|
def _open_model(self, encoder, decoder, model_type):
|
|
459
479
|
p1 = self._string_buffer_aw(encoder)
|
|
@@ -472,6 +492,16 @@ class Whisper(AiliaSpeechModel):
|
|
|
472
492
|
else:
|
|
473
493
|
self._check(dll.ailiaSpeechOpenVadFileA(self._instance, p1, vad_type))
|
|
474
494
|
|
|
495
|
+
def _open_diarization(self, segmentation, embedding, diarization_type):
|
|
496
|
+
p1 = self._string_buffer_aw(segmentation)
|
|
497
|
+
p2 = self._string_buffer_aw(embedding)
|
|
498
|
+
|
|
499
|
+
if sys.platform == "win32":
|
|
500
|
+
self._check(dll.ailiaSpeechOpenDiarizationFileW(self._instance, p1, p2, diarization_type))
|
|
501
|
+
else:
|
|
502
|
+
self._check(dll.ailiaSpeechOpenDiarizationFileA(self._instance, p1, p2, diarization_type))
|
|
503
|
+
|
|
504
|
+
|
|
475
505
|
def set_silent_threshold(self, silent_threshold, speech_sec, no_speech_sec):
|
|
476
506
|
self._check(dll.ailiaSpeechSetSilentThreshold(self._instance, silent_threshold, speech_sec, no_speech_sec))
|
|
477
507
|
|
|
@@ -506,7 +536,7 @@ class Whisper(AiliaSpeechModel):
|
|
|
506
536
|
for i in range(count.value):
|
|
507
537
|
text = AILIASpeechText()
|
|
508
538
|
self._check(dll.ailiaSpeechGetText(self._instance, ctypes.byref(text), AILIA_SPEECH_TEXT_VERSION, i))
|
|
509
|
-
yield {"text" : text.text.decode(), "time_stamp_begin" : text.time_stamp_begin, "time_stamp_end" : text.time_stamp_end, "
|
|
539
|
+
yield {"text" : text.text.decode(), "time_stamp_begin" : text.time_stamp_begin, "time_stamp_end" : text.time_stamp_end, "speaker_id" : None if text.speaker_id == AILIA_SPEECH_SPEAKER_ID_UNKNOWN else text.speaker_id, "language" : text.language.decode(), "confidence" : text.confidence}
|
|
510
540
|
|
|
511
541
|
self._check(dll.ailiaSpeechResetTranscribeState(self._instance))
|
|
512
542
|
|
|
@@ -542,7 +572,7 @@ class Whisper(AiliaSpeechModel):
|
|
|
542
572
|
for i in range(count.value):
|
|
543
573
|
text = AILIASpeechText()
|
|
544
574
|
self._check(dll.ailiaSpeechGetText(self._instance, ctypes.byref(text), AILIA_SPEECH_TEXT_VERSION, i))
|
|
545
|
-
yield {"text" : text.text.decode(), "time_stamp_begin" : text.time_stamp_begin, "time_stamp_end" : text.time_stamp_end, "
|
|
575
|
+
yield {"text" : text.text.decode(), "time_stamp_begin" : text.time_stamp_begin, "time_stamp_end" : text.time_stamp_end, "speaker_id" : None if text.speaker_id == AILIA_SPEECH_SPEAKER_ID_UNKNOWN else text.speaker_id, "language" : text.language.decode(), "confidence" : text.confidence}
|
|
546
576
|
|
|
547
577
|
if complete:
|
|
548
578
|
self._check(dll.ailiaSpeechResetTranscribeState(self._instance))
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -1,3 +1,25 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ailia_speech
|
|
3
|
+
Version: 1.4.0.1
|
|
4
|
+
Summary: ailia AI Speech
|
|
5
|
+
Home-page: https://ailia.jp/
|
|
6
|
+
Author: ax Inc.
|
|
7
|
+
Author-email: contact@axinc.jp
|
|
8
|
+
License: https://ailia.ai/en/license/
|
|
9
|
+
Requires-Python: >3.6
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
Requires-Dist: ailia
|
|
12
|
+
Requires-Dist: ailia_tokenizer
|
|
13
|
+
Dynamic: author
|
|
14
|
+
Dynamic: author-email
|
|
15
|
+
Dynamic: description
|
|
16
|
+
Dynamic: description-content-type
|
|
17
|
+
Dynamic: home-page
|
|
18
|
+
Dynamic: license
|
|
19
|
+
Dynamic: requires-dist
|
|
20
|
+
Dynamic: requires-python
|
|
21
|
+
Dynamic: summary
|
|
22
|
+
|
|
1
23
|
# ailia AI Speech Python API
|
|
2
24
|
|
|
3
25
|
!! CAUTION !!
|
|
@@ -90,13 +112,21 @@ for i in range(0, audio_waveform.shape[0], sampling_rate):
|
|
|
90
112
|
print(text)
|
|
91
113
|
```
|
|
92
114
|
|
|
115
|
+
### Diarization mode
|
|
116
|
+
|
|
117
|
+
By specifying diarization_type, speaker diarization can be performed. When speaker diarization is enabled, speaker_id becomes valid.
|
|
118
|
+
|
|
119
|
+
```
|
|
120
|
+
speech.initialize_model(model_path = "./models/", model_type = ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO, diarization_type = ailia_speech.AILIA_SPEECH_DIARIZATION_TYPE_PYANNOTE_AUDIO)
|
|
121
|
+
```
|
|
122
|
+
|
|
93
123
|
### Available model types
|
|
94
124
|
|
|
95
125
|
It is possible to select multiple models according to accuracy and speed. LARGE_V3_TURBO is the most recommended.
|
|
96
126
|
|
|
97
127
|
```
|
|
98
128
|
ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY
|
|
99
|
-
|
|
129
|
+
ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_BASE
|
|
100
130
|
ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_SMALL
|
|
101
131
|
ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_MEDIUM
|
|
102
132
|
ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|