ailia-speech 1.3.0.5__py3-none-any.whl → 1.3.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ailia-speech might be problematic. Click here for more details.
- ailia_speech/__init__.py +44 -0
- {ailia_speech-1.3.0.5.data → ailia_speech-1.3.1.0.data}/scripts/__init__.py +44 -0
- {ailia_speech-1.3.0.5.dist-info → ailia_speech-1.3.1.0.dist-info}/METADATA +54 -3
- {ailia_speech-1.3.0.5.dist-info → ailia_speech-1.3.1.0.dist-info}/RECORD +6 -6
- {ailia_speech-1.3.0.5.dist-info → ailia_speech-1.3.1.0.dist-info}/WHEEL +0 -0
- {ailia_speech-1.3.0.5.dist-info → ailia_speech-1.3.1.0.dist-info}/top_level.txt +0 -0
ailia_speech/__init__.py
CHANGED
|
@@ -67,6 +67,7 @@ AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_SMALL = (2)
|
|
|
67
67
|
AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_MEDIUM = (3)
|
|
68
68
|
AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE = (4)
|
|
69
69
|
AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3 = (5)
|
|
70
|
+
AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO = (6)
|
|
70
71
|
|
|
71
72
|
AILIA_SPEECH_TASK_TRANSCRIBE = (0)
|
|
72
73
|
AILIA_SPEECH_TASK_TRANSLATE = (1)
|
|
@@ -425,6 +426,12 @@ class Whisper(AiliaSpeechModel):
|
|
|
425
426
|
decoder_path = "decoder_large_v3_fix_kv_cache.onnx"
|
|
426
427
|
encoder_pb_path = "encoder_large_v3_weights.pb"
|
|
427
428
|
decoder_pb_path = "decoder_large_v3_fix_kv_cache_weights.pb"
|
|
429
|
+
elif model_type == AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO:
|
|
430
|
+
encoder_path = "encoder_turbo.onnx"
|
|
431
|
+
decoder_path = "decoder_turbo_fix_kv_cache.onnx"
|
|
432
|
+
encoder_pb_path = "encoder_turbo_weights.pb"
|
|
433
|
+
decoder_pb_path = None
|
|
434
|
+
model_type = AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3
|
|
428
435
|
self._download_model(model_path, encoder_path, decoder_path, encoder_pb_path, decoder_pb_path)
|
|
429
436
|
self._open_model(model_path + encoder_path, model_path + decoder_path, model_type)
|
|
430
437
|
self._open_vad(model_path + "silero_vad.onnx", AILIA_SPEECH_VAD_TYPE_SILERO)
|
|
@@ -494,6 +501,43 @@ class Whisper(AiliaSpeechModel):
|
|
|
494
501
|
|
|
495
502
|
self._check(dll.ailiaSpeechResetTranscribeState(self._instance))
|
|
496
503
|
|
|
504
|
+
def transcribe_step(self, audio_waveform, sampling_rate, complete, lang = None):
    """Feed one chunk of audio and lazily yield the text recognized so far.

    This is a generator: nothing (not even input validation or the push
    of ``audio_waveform``) runs until the returned object is iterated,
    and the final state reset only happens once it is fully consumed.

    Args:
        audio_waveform: numpy array holding the chunk. A 1-D array is
            treated as a single channel. A 2-D array is read as
            ``(channels, samples)`` and is transposed and flattened so
            samples of all channels are interleaved before being handed
            to the native library.
        sampling_rate: sampling rate forwarded unchanged to
            ``ailiaSpeechPushInputData``.
        complete: truthy when this is the last chunk. The input is then
            finalized after the push, and the transcribe state is reset
            after every pending result has been yielded.
        lang: optional language string; when not ``None`` it is passed to
            ``ailiaSpeechSetLanguage`` before the audio is pushed.

    Yields:
        dict with the keys ``text``, ``time_stamp_begin``,
        ``time_stamp_end``, ``person_id``, ``language`` and
        ``confidence`` for every text entry reported by the library.

    Raises:
        AiliaSpeechError: if ``audio_waveform`` is neither 1-D nor 2-D.
            Failures of native calls are reported through ``self._check``.
    """
    dims = len(audio_waveform.shape)
    if dims == 1:
        channels = 1
    elif dims == 2:
        channels = audio_waveform.shape[0]
        # (channels, samples) -> interleaved 1-D buffer.
        audio_waveform = numpy.transpose(audio_waveform, (1, 0)).flatten()
    else:
        raise AiliaSpeechError("audio_waveform must be 1 channel or 2 channel", -1)

    # Single conversion to a C-contiguous float32 buffer; copies only when
    # the dtype or memory layout actually requires it.
    audio_waveform = numpy.ascontiguousarray(audio_waveform, dtype=numpy.float32)
    samples_per_channel = audio_waveform.shape[0] // channels

    if lang is not None:
        self._check(dll.ailiaSpeechSetLanguage(self._instance, self._string_buffer(lang)))

    self._check(dll.ailiaSpeechPushInputData(self._instance, audio_waveform, channels, samples_per_channel, sampling_rate))
    if complete:
        self._check(dll.ailiaSpeechFinalizeInputData(self._instance))

    while True:
        # Keep transcribing for as long as the library reports a non-zero
        # buffered flag (presumably: enough queued audio to process).
        buffered = ctypes.c_uint(0)
        self._check(dll.ailiaSpeechBuffered(self._instance, ctypes.byref(buffered)))
        if buffered.value == 0:
            break

        self._check(dll.ailiaSpeechTranscribe(self._instance))

        count = ctypes.c_uint(0)
        self._check(dll.ailiaSpeechGetTextCount(self._instance, ctypes.byref(count)))
        for i in range(count.value):
            text = AILIASpeechText()
            self._check(dll.ailiaSpeechGetText(self._instance, ctypes.byref(text), AILIA_SPEECH_TEXT_VERSION, i))
            yield {
                "text" : text.text.decode(),
                "time_stamp_begin" : text.time_stamp_begin,
                "time_stamp_end" : text.time_stamp_end,
                "person_id" : text.person_id,
                "language" : text.language.decode(),
                "confidence" : text.confidence,
            }

    # NOTE: only reached when the caller exhausts the generator.
    if complete:
        self._check(dll.ailiaSpeechResetTranscribeState(self._instance))
|
|
540
|
+
|
|
497
541
|
def __del__(self):
|
|
498
542
|
if self._instance:
|
|
499
543
|
dll.ailiaSpeechDestroy(cast(self._instance, c_void_p))
|
|
@@ -67,6 +67,7 @@ AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_SMALL = (2)
|
|
|
67
67
|
AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_MEDIUM = (3)
|
|
68
68
|
AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE = (4)
|
|
69
69
|
AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3 = (5)
|
|
70
|
+
AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO = (6)
|
|
70
71
|
|
|
71
72
|
AILIA_SPEECH_TASK_TRANSCRIBE = (0)
|
|
72
73
|
AILIA_SPEECH_TASK_TRANSLATE = (1)
|
|
@@ -425,6 +426,12 @@ class Whisper(AiliaSpeechModel):
|
|
|
425
426
|
decoder_path = "decoder_large_v3_fix_kv_cache.onnx"
|
|
426
427
|
encoder_pb_path = "encoder_large_v3_weights.pb"
|
|
427
428
|
decoder_pb_path = "decoder_large_v3_fix_kv_cache_weights.pb"
|
|
429
|
+
elif model_type == AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO:
|
|
430
|
+
encoder_path = "encoder_turbo.onnx"
|
|
431
|
+
decoder_path = "decoder_turbo_fix_kv_cache.onnx"
|
|
432
|
+
encoder_pb_path = "encoder_turbo_weights.pb"
|
|
433
|
+
decoder_pb_path = None
|
|
434
|
+
model_type = AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3
|
|
428
435
|
self._download_model(model_path, encoder_path, decoder_path, encoder_pb_path, decoder_pb_path)
|
|
429
436
|
self._open_model(model_path + encoder_path, model_path + decoder_path, model_type)
|
|
430
437
|
self._open_vad(model_path + "silero_vad.onnx", AILIA_SPEECH_VAD_TYPE_SILERO)
|
|
@@ -494,6 +501,43 @@ class Whisper(AiliaSpeechModel):
|
|
|
494
501
|
|
|
495
502
|
self._check(dll.ailiaSpeechResetTranscribeState(self._instance))
|
|
496
503
|
|
|
504
|
+
def transcribe_step(self, audio_waveform, sampling_rate, complete, lang = None):
    """Feed one chunk of audio and lazily yield the text recognized so far.

    This is a generator: nothing (not even input validation or the push
    of ``audio_waveform``) runs until the returned object is iterated,
    and the final state reset only happens once it is fully consumed.

    Args:
        audio_waveform: numpy array holding the chunk. A 1-D array is
            treated as a single channel. A 2-D array is read as
            ``(channels, samples)`` and is transposed and flattened so
            samples of all channels are interleaved before being handed
            to the native library.
        sampling_rate: sampling rate forwarded unchanged to
            ``ailiaSpeechPushInputData``.
        complete: truthy when this is the last chunk. The input is then
            finalized after the push, and the transcribe state is reset
            after every pending result has been yielded.
        lang: optional language string; when not ``None`` it is passed to
            ``ailiaSpeechSetLanguage`` before the audio is pushed.

    Yields:
        dict with the keys ``text``, ``time_stamp_begin``,
        ``time_stamp_end``, ``person_id``, ``language`` and
        ``confidence`` for every text entry reported by the library.

    Raises:
        AiliaSpeechError: if ``audio_waveform`` is neither 1-D nor 2-D.
            Failures of native calls are reported through ``self._check``.
    """
    dims = len(audio_waveform.shape)
    if dims == 1:
        channels = 1
    elif dims == 2:
        channels = audio_waveform.shape[0]
        # (channels, samples) -> interleaved 1-D buffer.
        audio_waveform = numpy.transpose(audio_waveform, (1, 0)).flatten()
    else:
        raise AiliaSpeechError("audio_waveform must be 1 channel or 2 channel", -1)

    # Single conversion to a C-contiguous float32 buffer; copies only when
    # the dtype or memory layout actually requires it.
    audio_waveform = numpy.ascontiguousarray(audio_waveform, dtype=numpy.float32)
    samples_per_channel = audio_waveform.shape[0] // channels

    if lang is not None:
        self._check(dll.ailiaSpeechSetLanguage(self._instance, self._string_buffer(lang)))

    self._check(dll.ailiaSpeechPushInputData(self._instance, audio_waveform, channels, samples_per_channel, sampling_rate))
    if complete:
        self._check(dll.ailiaSpeechFinalizeInputData(self._instance))

    while True:
        # Keep transcribing for as long as the library reports a non-zero
        # buffered flag (presumably: enough queued audio to process).
        buffered = ctypes.c_uint(0)
        self._check(dll.ailiaSpeechBuffered(self._instance, ctypes.byref(buffered)))
        if buffered.value == 0:
            break

        self._check(dll.ailiaSpeechTranscribe(self._instance))

        count = ctypes.c_uint(0)
        self._check(dll.ailiaSpeechGetTextCount(self._instance, ctypes.byref(count)))
        for i in range(count.value):
            text = AILIASpeechText()
            self._check(dll.ailiaSpeechGetText(self._instance, ctypes.byref(text), AILIA_SPEECH_TEXT_VERSION, i))
            yield {
                "text" : text.text.decode(),
                "time_stamp_begin" : text.time_stamp_begin,
                "time_stamp_end" : text.time_stamp_end,
                "person_id" : text.person_id,
                "language" : text.language.decode(),
                "confidence" : text.confidence,
            }

    # NOTE: only reached when the caller exhausts the generator.
    if complete:
        self._check(dll.ailiaSpeechResetTranscribeState(self._instance))
|
|
540
|
+
|
|
497
541
|
def __del__(self):
|
|
498
542
|
if self._instance:
|
|
499
543
|
dll.ailiaSpeechDestroy(cast(self._instance, c_void_p))
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: ailia_speech
|
|
3
|
-
Version: 1.3.0
|
|
3
|
+
Version: 1.3.1.0
|
|
4
4
|
Summary: ailia AI Speech
|
|
5
5
|
Home-page: https://ailia.jp/
|
|
6
6
|
Author: ax Inc.
|
|
@@ -40,6 +40,10 @@ pip3 install ./
|
|
|
40
40
|
|
|
41
41
|
## Usage
|
|
42
42
|
|
|
43
|
+
### Batch mode
|
|
44
|
+
|
|
45
|
+
In batch mode, the entire audio is transcribed at once.
|
|
46
|
+
|
|
43
47
|
```python
|
|
44
48
|
import ailia_speech
|
|
45
49
|
|
|
@@ -55,16 +59,63 @@ if not os.path.exists(input_file_path):
|
|
|
55
59
|
"https://github.com/axinc-ai/ailia-models/raw/refs/heads/master/audio_processing/whisper/demo.wav",
|
|
56
60
|
"demo.wav"
|
|
57
61
|
)
|
|
58
|
-
audio_waveform, sampling_rate = librosa.load(input_file_path, mono=True)
|
|
62
|
+
audio_waveform, sampling_rate = librosa.load(input_file_path, mono = True)
|
|
59
63
|
|
|
60
64
|
# Infer
|
|
61
65
|
speech = ailia_speech.Whisper()
|
|
62
|
-
speech.initialize_model(model_path = "./models/", model_type = ailia_speech.
|
|
66
|
+
speech.initialize_model(model_path = "./models/", model_type = ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO)
|
|
63
67
|
recognized_text = speech.transcribe(audio_waveform, sampling_rate)
|
|
64
68
|
for text in recognized_text:
|
|
65
69
|
print(text)
|
|
66
70
|
```
|
|
67
71
|
|
|
72
|
+
### Step mode
|
|
73
|
+
|
|
74
|
+
In step mode, the audio is input in chunks and transcribed sequentially.
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
import ailia_speech
|
|
78
|
+
|
|
79
|
+
import librosa
|
|
80
|
+
|
|
81
|
+
import os
|
|
82
|
+
import urllib.request
|
|
83
|
+
|
|
84
|
+
# Load target audio
|
|
85
|
+
input_file_path = "demo.wav"
|
|
86
|
+
if not os.path.exists(input_file_path):
|
|
87
|
+
urllib.request.urlretrieve(
|
|
88
|
+
"https://github.com/axinc-ai/ailia-models/raw/refs/heads/master/audio_processing/whisper/demo.wav",
|
|
89
|
+
"demo.wav"
|
|
90
|
+
)
|
|
91
|
+
audio_waveform, sampling_rate = librosa.load(input_file_path, mono = True)
|
|
92
|
+
|
|
93
|
+
# Infer
|
|
94
|
+
speech = ailia_speech.Whisper()
|
|
95
|
+
speech.initialize_model(model_path = "./models/", model_type = ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO)
|
|
96
|
+
for i in range(0, audio_waveform.shape[0], sampling_rate):
|
|
97
|
+
complete = False
|
|
98
|
+
if i + sampling_rate >= audio_waveform.shape[0]:
|
|
99
|
+
complete = True
|
|
100
|
+
recognized_text = speech.transcribe_step(audio_waveform[i:min(audio_waveform.shape[0], i + sampling_rate)], sampling_rate, complete)
|
|
101
|
+
for text in recognized_text:
|
|
102
|
+
print(text)
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
### Available model types
|
|
106
|
+
|
|
107
|
+
Several models are available, offering different trade-offs between accuracy and speed. LARGE_V3_TURBO is the recommended choice.
|
|
108
|
+
|
|
109
|
+
```
|
|
110
|
+
ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY
|
|
111
|
+
ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_BASE
|
|
112
|
+
ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_SMALL
|
|
113
|
+
ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_MEDIUM
|
|
114
|
+
ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE
|
|
115
|
+
ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3
|
|
116
|
+
ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO
|
|
117
|
+
```
|
|
118
|
+
|
|
68
119
|
## API specification
|
|
69
120
|
|
|
70
121
|
https://github.com/axinc-ai/ailia-sdk
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
ailia_speech/LICENSE_AILIA_EN.pdf,sha256=1DzVViPnw1uAS8gJ5a8uN3iZNNR5I1ItIXmezHfUpeM,70149
|
|
2
2
|
ailia_speech/LICENSE_AILIA_JA.pdf,sha256=s628QN47S2bNqIfuSjm2LBf0vIluv2df6MSemn6Ksmw,174134
|
|
3
|
-
ailia_speech/__init__.py,sha256=
|
|
3
|
+
ailia_speech/__init__.py,sha256=NMFB_mXj9ed9YcbEXrjvm-82OOYRxzSBENKCuxbtz6M,27915
|
|
4
4
|
ailia_speech/linux/arm64-v8a/libailia_speech.so,sha256=JAOwnBr7lbiMZmPCM99pd4vJQ08ZuXDPpq-FurrXSnE,166096
|
|
5
5
|
ailia_speech/linux/x64/libailia_speech.so,sha256=WbFvA5wKTgS_Zx8ErT7WBKJbzOUexavr4nP4EkLNawQ,171360
|
|
6
6
|
ailia_speech/mac/libailia_speech.dylib,sha256=-JAC40yLslAVMvfh6LhDvP3Zyt3hIT3WZc7wa9-07zU,317112
|
|
7
7
|
ailia_speech/windows/x64/ailia_speech.dll,sha256=WJCOHi0Na4tdMG1RT7dA7yAoWumiGSWeW1vxUtiXDS8,126464
|
|
8
|
-
ailia_speech-1.3.0.
|
|
9
|
-
ailia_speech-1.3.0.
|
|
10
|
-
ailia_speech-1.3.0.
|
|
11
|
-
ailia_speech-1.3.0.
|
|
12
|
-
ailia_speech-1.3.0.
|
|
8
|
+
ailia_speech-1.3.1.0.data/scripts/__init__.py,sha256=NMFB_mXj9ed9YcbEXrjvm-82OOYRxzSBENKCuxbtz6M,27915
|
|
9
|
+
ailia_speech-1.3.1.0.dist-info/METADATA,sha256=x5WBVorX7b9Mubn69FVh75XS2b3iDIy7ZM38F9yyGRo,3610
|
|
10
|
+
ailia_speech-1.3.1.0.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
|
|
11
|
+
ailia_speech-1.3.1.0.dist-info/top_level.txt,sha256=Ou9XeJ9AvdK8eutw07oosCthftD1tRYzAgNY2BrYhDc,13
|
|
12
|
+
ailia_speech-1.3.1.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|