PyPI - ailia-speech - Versions diffs - 1.3.2.2__tar.gz → 1.4.0__tar.gz - Mend

ailia-speech 1.3.2.2tar.gz → 1.4.0tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of ailia-speech might be problematic. Click here for more details.

Files changed (20) hide show

{ailia_speech-1.3.2.2/ailia_speech.egg-info → ailia_speech-1.4.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.4
 Name: ailia_speech
-Version: 1.3.2.2
+Version: 1.4.0
 Summary: ailia AI Speech
 Home-page: https://ailia.jp/
 Author: ax Inc.
@@ -10,6 +10,15 @@ Requires-Python: >3.6
 Description-Content-Type: text/markdown
 Requires-Dist: ailia
 Requires-Dist: ailia_tokenizer
+Dynamic: author
+Dynamic: author-email
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: license
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
 # ailia AI Speech Python API
@@ -103,13 +112,21 @@ for i in range(0, audio_waveform.shape[0], sampling_rate):
 		print(text)
 ```
+### Diarization mode
+By specifying dialization_type, speaker diarization can be performed. When speaker diarization is enabled, speaker_id becomes valid.
+```
+speech.initialize_model(model_path = "./models/", model_type = ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO, dialization_type = ailia_speech.AILIA_SPEECH_DIARIZATION_TYPE_PYANNOTE_AUDIO)
+```
 ### Available model types
 It is possible to select multiple models according to accuracy and speed. LARGE_V3_TURBO is the most recommended.
 ```
 ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY
-ilia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_BASE
+ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_BASE
 ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_SMALL
 ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_MEDIUM
 ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE

ailia_speech-1.3.2.2/PKG-INFO → ailia_speech-1.4.0/README.md RENAMED Viewed

@@ -1,16 +1,3 @@
-Metadata-Version: 2.1
-Name: ailia_speech
-Version: 1.3.2.2
-Summary: ailia AI Speech
-Home-page: https://ailia.jp/
-Author: ax Inc.
-Author-email: contact@axinc.jp
-License: https://ailia.ai/en/license/
-Requires-Python: >3.6
-Description-Content-Type: text/markdown
-Requires-Dist: ailia
-Requires-Dist: ailia_tokenizer
 # ailia AI Speech Python API
 !! CAUTION !!
@@ -103,13 +90,21 @@ for i in range(0, audio_waveform.shape[0], sampling_rate):
 		print(text)
 ```
+### Diarization mode
+By specifying dialization_type, speaker diarization can be performed. When speaker diarization is enabled, speaker_id becomes valid.
+```
+speech.initialize_model(model_path = "./models/", model_type = ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO, dialization_type = ailia_speech.AILIA_SPEECH_DIARIZATION_TYPE_PYANNOTE_AUDIO)
+```
 ### Available model types
 It is possible to select multiple models according to accuracy and speed. LARGE_V3_TURBO is the most recommended.
 ```
 ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY
-ilia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_BASE
+ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_BASE
 ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_SMALL
 ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_MEDIUM
 ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE

{ailia_speech-1.3.2.2 → ailia_speech-1.4.0}/ailia_speech/__init__.py RENAMED Viewed

@@ -76,9 +76,13 @@ AILIA_SPEECH_FLAG_NONE = (0)
 AILIA_SPEECH_FLAG_LIVE = (1)
 AILIA_SPEECH_VAD_TYPE_SILERO = (0)
+AILIA_SPEECH_DIARIZATION_TYPE_PYANNOTE_AUDIO = (0)
 AILIA_SPEECH_API_CALLBACK_VERSION = (6)
 AILIA_SPEECH_TEXT_VERSION = (2)
+AILIA_SPEECH_SPEAKER_ID_UNKNOWN = (0xFFFFFFFF)
 AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_FRAME_LEN = CFUNCTYPE(POINTER(c_int), c_int, c_int, c_int, c_int)
 AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_MEL_SPECTROGRAM = CFUNCTYPE((c_int), c_void_p, c_void_p, c_int, c_int, c_int, c_int, c_int, c_int, c_int, c_int, c_float, c_int, c_float, c_float, c_int, c_int, c_int)
@@ -215,6 +219,12 @@ dll.ailiaSpeechOpenVadFileA.argtypes = (c_void_p, c_char_p, c_int32)
 dll.ailiaSpeechOpenVadFileW.restype = c_int
 dll.ailiaSpeechOpenVadFileW.argtypes = (c_void_p, c_wchar_p, c_int32)
+dll.ailiaSpeechOpenDiarizationFileA.restype = c_int
+dll.ailiaSpeechOpenDiarizationFileA.argtypes = (c_void_p, c_char_p, c_char_p, c_int32)
+dll.ailiaSpeechOpenDiarizationFileW.restype = c_int
+dll.ailiaSpeechOpenDiarizationFileW.argtypes = (c_void_p, c_wchar_p, c_wchar_p, c_int32)
 dll.ailiaSpeechPushInputData.restype = c_int
 dll.ailiaSpeechPushInputData.argtypes = (c_void_p, numpy.ctypeslib.ndpointer(
                 dtype=numpy.float32, flags='CONTIGUOUS'
@@ -243,7 +253,7 @@ class AILIASpeechText(ctypes.Structure):
         ("text", ctypes.c_char_p),
         ("time_stamp_begin", ctypes.c_float),
         ("time_stamp_end", ctypes.c_float),
-        ("person_id", ctypes.c_uint),
+        ("speaker_id", ctypes.c_uint),
         ("language", ctypes.c_char_p),
         ("confidence", ctypes.c_float)]
@@ -399,7 +409,7 @@ class Whisper(AiliaSpeechModel):
             intermediate_callback_cnt = intermediate_callback_cnt + 1
-    def initialize_model(self, model_path = "./", model_type = AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY):
+    def initialize_model(self, model_path = "./", model_type = AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY, vad_type = AILIA_SPEECH_VAD_TYPE_SILERO, dialization_type = None):
         if "time_license" in ailia.get_version():
             ailia.check_and_download_license()
         if model_type == AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY:
@@ -433,16 +443,20 @@ class Whisper(AiliaSpeechModel):
             encoder_pb_path = "encoder_large_v3_weights.pb"
             decoder_pb_path = "decoder_large_v3_fix_kv_cache_weights.pb"
         elif model_type == AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO:
-            encoder_path = "encoder_turbo.onnx"
-            decoder_path = "decoder_turbo_fix_kv_cache.onnx"
-            encoder_pb_path = "encoder_turbo_weights.pb"
+            encoder_path = "encoder_turbo.opt.onnx"
+            decoder_path = "decoder_turbo_fix_kv_cache.opt.onnx"
+            encoder_pb_path = "encoder_turbo_weights.opt.pb"
             decoder_pb_path = None
             model_type = AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3
-        self._download_model(model_path, encoder_path, decoder_path, encoder_pb_path, decoder_pb_path)
+        self._download_model(model_path, encoder_path, decoder_path, encoder_pb_path, decoder_pb_path, vad_type, dialization_type)
         self._open_model(model_path + encoder_path, model_path + decoder_path, model_type)
-        self._open_vad(model_path + "silero_vad.onnx", AILIA_SPEECH_VAD_TYPE_SILERO)
-    def _download_model(self, model_path, encoder_path, decoder_path, encoder_pb_path, decoder_pb_path):
+        if vad_type is not None:
+            self._open_vad(model_path + "silero_vad.onnx", vad_type)
+        if dialization_type is not None:
+            self._open_diarization(model_path + "segmentation.onnx", model_path + "speaker-embedding.onnx", dialization_type)
+    def _download_model(self, model_path, encoder_path, decoder_path, encoder_pb_path, decoder_pb_path, vad_type, dialization_type):
         REMOTE_PATH = "https://storage.googleapis.com/ailia-models/whisper/"
         os.makedirs(model_path, exist_ok = True)
         check_and_download_file(model_path + encoder_path, REMOTE_PATH)
@@ -452,8 +466,14 @@ class Whisper(AiliaSpeechModel):
         if decoder_pb_path is not None:
             check_and_download_file(model_path + decoder_pb_path, REMOTE_PATH)
-        REMOTE_PATH = "https://storage.googleapis.com/ailia-models/silero-vad/"
-        check_and_download_file(model_path + "silero_vad.onnx", REMOTE_PATH)
+        if vad_type is not None:
+            REMOTE_PATH = "https://storage.googleapis.com/ailia-models/silero-vad/"
+            check_and_download_file(model_path + "silero_vad.onnx", REMOTE_PATH)
+        if dialization_type is not None:
+            REMOTE_PATH = "https://storage.googleapis.com/ailia-models/pyannote-audio/"
+            check_and_download_file(model_path + "segmentation.onnx", REMOTE_PATH)
+            check_and_download_file(model_path + "speaker-embedding.onnx", REMOTE_PATH)
     def _open_model(self, encoder, decoder, model_type):
         p1 = self._string_buffer_aw(encoder)
@@ -472,6 +492,16 @@ class Whisper(AiliaSpeechModel):
         else:
             self._check(dll.ailiaSpeechOpenVadFileA(self._instance, p1, vad_type))
+    def _open_diarization(self, segmentation, embedding, diarization_type):
+        p1 = self._string_buffer_aw(segmentation)
+        p2 = self._string_buffer_aw(embedding)
+        if sys.platform == "win32":
+            self._check(dll.ailiaSpeechOpenDiarizationFileW(self._instance, p1, p2, diarization_type))
+        else:
+            self._check(dll.ailiaSpeechOpenDiarizationFileA(self._instance, p1, p2, diarization_type))
     def set_silent_threshold(self, silent_threshold, speech_sec, no_speech_sec):
         self._check(dll.ailiaSpeechSetSilentThreshold(self._instance, silent_threshold, speech_sec, no_speech_sec))
@@ -506,7 +536,7 @@ class Whisper(AiliaSpeechModel):
             for i in range(count.value):
                 text = AILIASpeechText()
                 self._check(dll.ailiaSpeechGetText(self._instance, ctypes.byref(text), AILIA_SPEECH_TEXT_VERSION, i))
-                yield {"text" : text.text.decode(), "time_stamp_begin" : text.time_stamp_begin, "time_stamp_end" : text.time_stamp_end, "person_id" : text.person_id, "language" : text.language.decode(), "confidence" : text.confidence}
+                yield {"text" : text.text.decode(), "time_stamp_begin" : text.time_stamp_begin, "time_stamp_end" : text.time_stamp_end, "speaker_id" : None if text.speaker_id == AILIA_SPEECH_SPEAKER_ID_UNKNOWN else text.speaker_id, "language" : text.language.decode(), "confidence" : text.confidence}
         self._check(dll.ailiaSpeechResetTranscribeState(self._instance))
@@ -542,7 +572,7 @@ class Whisper(AiliaSpeechModel):
             for i in range(count.value):
                 text = AILIASpeechText()
                 self._check(dll.ailiaSpeechGetText(self._instance, ctypes.byref(text), AILIA_SPEECH_TEXT_VERSION, i))
-                yield {"text" : text.text.decode(), "time_stamp_begin" : text.time_stamp_begin, "time_stamp_end" : text.time_stamp_end, "person_id" : text.person_id, "language" : text.language.decode(), "confidence" : text.confidence}
+                yield {"text" : text.text.decode(), "time_stamp_begin" : text.time_stamp_begin, "time_stamp_end" : text.time_stamp_end, "speaker_id" : None if text.speaker_id == AILIA_SPEECH_SPEAKER_ID_UNKNOWN else text.speaker_id, "language" : text.language.decode(), "confidence" : text.confidence}
         if complete:
             self._check(dll.ailiaSpeechResetTranscribeState(self._instance))

ailia_speech-1.4.0/ailia_speech/linux/arm64-v8a/libailia_speech.so ADDED Viewed

Binary file

ailia_speech-1.4.0/ailia_speech/linux/x64/libailia_speech.so ADDED Viewed

Binary file

ailia_speech-1.4.0/ailia_speech/mac/libailia_speech.dylib ADDED Viewed

Binary file

ailia_speech-1.4.0/ailia_speech/windows/x64/ailia_speech.dll ADDED Viewed

Binary file

ailia_speech-1.3.2.2/README.md → ailia_speech-1.4.0/ailia_speech.egg-info/PKG-INFO RENAMED Viewed

@@ -1,3 +1,25 @@
+Metadata-Version: 2.4
+Name: ailia_speech
+Version: 1.4.0
+Summary: ailia AI Speech
+Home-page: https://ailia.jp/
+Author: ax Inc.
+Author-email: contact@axinc.jp
+License: https://ailia.ai/en/license/
+Requires-Python: >3.6
+Description-Content-Type: text/markdown
+Requires-Dist: ailia
+Requires-Dist: ailia_tokenizer
+Dynamic: author
+Dynamic: author-email
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: license
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
 # ailia AI Speech Python API
 !! CAUTION !!
@@ -90,13 +112,21 @@ for i in range(0, audio_waveform.shape[0], sampling_rate):
 		print(text)
 ```
+### Diarization mode
+By specifying dialization_type, speaker diarization can be performed. When speaker diarization is enabled, speaker_id becomes valid.
+```
+speech.initialize_model(model_path = "./models/", model_type = ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO, dialization_type = ailia_speech.AILIA_SPEECH_DIARIZATION_TYPE_PYANNOTE_AUDIO)
+```
 ### Available model types
 It is possible to select multiple models according to accuracy and speed. LARGE_V3_TURBO is the most recommended.
 ```
 ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY
-ilia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_BASE
+ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_BASE
 ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_SMALL
 ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_MEDIUM
 ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE

{ailia_speech-1.3.2.2 → ailia_speech-1.4.0}/setup.py RENAMED Viewed

@@ -54,7 +54,7 @@ if __name__ == "__main__":
     setup(
         name="ailia_speech",
         scripts=scripts,
-        version="1.3.2.2",
+        version="1.4.0",
         install_requires=[
             "ailia",
             "ailia_tokenizer",