ailia-speech 1.3.2.2__tar.gz → 1.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ailia-speech might be problematic. Click here for more details.

Files changed (20) hide show
  1. {ailia_speech-1.3.2.2/ailia_speech.egg-info → ailia_speech-1.4.0}/PKG-INFO +20 -3
  2. ailia_speech-1.3.2.2/PKG-INFO → ailia_speech-1.4.0/README.md +9 -14
  3. {ailia_speech-1.3.2.2 → ailia_speech-1.4.0}/ailia_speech/__init__.py +43 -13
  4. ailia_speech-1.4.0/ailia_speech/linux/arm64-v8a/libailia_speech.so +0 -0
  5. ailia_speech-1.4.0/ailia_speech/linux/x64/libailia_speech.so +0 -0
  6. ailia_speech-1.4.0/ailia_speech/mac/libailia_speech.dylib +0 -0
  7. ailia_speech-1.4.0/ailia_speech/windows/x64/ailia_speech.dll +0 -0
  8. ailia_speech-1.3.2.2/README.md → ailia_speech-1.4.0/ailia_speech.egg-info/PKG-INFO +31 -1
  9. {ailia_speech-1.3.2.2 → ailia_speech-1.4.0}/setup.py +1 -1
  10. ailia_speech-1.3.2.2/ailia_speech/linux/arm64-v8a/libailia_speech.so +0 -0
  11. ailia_speech-1.3.2.2/ailia_speech/linux/x64/libailia_speech.so +0 -0
  12. ailia_speech-1.3.2.2/ailia_speech/mac/libailia_speech.dylib +0 -0
  13. ailia_speech-1.3.2.2/ailia_speech/windows/x64/ailia_speech.dll +0 -0
  14. {ailia_speech-1.3.2.2 → ailia_speech-1.4.0}/ailia_speech/LICENSE_AILIA_EN.pdf +0 -0
  15. {ailia_speech-1.3.2.2 → ailia_speech-1.4.0}/ailia_speech/LICENSE_AILIA_JA.pdf +0 -0
  16. {ailia_speech-1.3.2.2 → ailia_speech-1.4.0}/ailia_speech.egg-info/SOURCES.txt +0 -0
  17. {ailia_speech-1.3.2.2 → ailia_speech-1.4.0}/ailia_speech.egg-info/dependency_links.txt +0 -0
  18. {ailia_speech-1.3.2.2 → ailia_speech-1.4.0}/ailia_speech.egg-info/requires.txt +0 -0
  19. {ailia_speech-1.3.2.2 → ailia_speech-1.4.0}/ailia_speech.egg-info/top_level.txt +0 -0
  20. {ailia_speech-1.3.2.2 → ailia_speech-1.4.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.4
2
2
  Name: ailia_speech
3
- Version: 1.3.2.2
3
+ Version: 1.4.0
4
4
  Summary: ailia AI Speech
5
5
  Home-page: https://ailia.jp/
6
6
  Author: ax Inc.
@@ -10,6 +10,15 @@ Requires-Python: >3.6
10
10
  Description-Content-Type: text/markdown
11
11
  Requires-Dist: ailia
12
12
  Requires-Dist: ailia_tokenizer
13
+ Dynamic: author
14
+ Dynamic: author-email
15
+ Dynamic: description
16
+ Dynamic: description-content-type
17
+ Dynamic: home-page
18
+ Dynamic: license
19
+ Dynamic: requires-dist
20
+ Dynamic: requires-python
21
+ Dynamic: summary
13
22
 
14
23
  # ailia AI Speech Python API
15
24
 
@@ -103,13 +112,21 @@ for i in range(0, audio_waveform.shape[0], sampling_rate):
103
112
  print(text)
104
113
  ```
105
114
 
115
+ ### Diarization mode
116
+
117
+ By specifying dialization_type, speaker diarization can be performed. When speaker diarization is enabled, speaker_id becomes valid.
118
+
119
+ ```
120
+ speech.initialize_model(model_path = "./models/", model_type = ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO, dialization_type = ailia_speech.AILIA_SPEECH_DIARIZATION_TYPE_PYANNOTE_AUDIO)
121
+ ```
122
+
106
123
  ### Available model types
107
124
 
108
125
  It is possible to select multiple models according to accuracy and speed. LARGE_V3_TURBO is the most recommended.
109
126
 
110
127
  ```
111
128
  ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY
112
- ilia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_BASE
129
+ ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_BASE
113
130
  ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_SMALL
114
131
  ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_MEDIUM
115
132
  ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE
@@ -1,16 +1,3 @@
1
- Metadata-Version: 2.1
2
- Name: ailia_speech
3
- Version: 1.3.2.2
4
- Summary: ailia AI Speech
5
- Home-page: https://ailia.jp/
6
- Author: ax Inc.
7
- Author-email: contact@axinc.jp
8
- License: https://ailia.ai/en/license/
9
- Requires-Python: >3.6
10
- Description-Content-Type: text/markdown
11
- Requires-Dist: ailia
12
- Requires-Dist: ailia_tokenizer
13
-
14
1
  # ailia AI Speech Python API
15
2
 
16
3
  !! CAUTION !!
@@ -103,13 +90,21 @@ for i in range(0, audio_waveform.shape[0], sampling_rate):
103
90
  print(text)
104
91
  ```
105
92
 
93
+ ### Diarization mode
94
+
95
+ By specifying dialization_type, speaker diarization can be performed. When speaker diarization is enabled, speaker_id becomes valid.
96
+
97
+ ```
98
+ speech.initialize_model(model_path = "./models/", model_type = ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO, dialization_type = ailia_speech.AILIA_SPEECH_DIARIZATION_TYPE_PYANNOTE_AUDIO)
99
+ ```
100
+
106
101
  ### Available model types
107
102
 
108
103
  It is possible to select multiple models according to accuracy and speed. LARGE_V3_TURBO is the most recommended.
109
104
 
110
105
  ```
111
106
  ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY
112
- ilia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_BASE
107
+ ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_BASE
113
108
  ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_SMALL
114
109
  ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_MEDIUM
115
110
  ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE
@@ -76,9 +76,13 @@ AILIA_SPEECH_FLAG_NONE = (0)
76
76
  AILIA_SPEECH_FLAG_LIVE = (1)
77
77
 
78
78
  AILIA_SPEECH_VAD_TYPE_SILERO = (0)
79
+
80
+ AILIA_SPEECH_DIARIZATION_TYPE_PYANNOTE_AUDIO = (0)
81
+
79
82
  AILIA_SPEECH_API_CALLBACK_VERSION = (6)
80
83
 
81
84
  AILIA_SPEECH_TEXT_VERSION = (2)
85
+ AILIA_SPEECH_SPEAKER_ID_UNKNOWN = (0xFFFFFFFF)
82
86
 
83
87
  AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_FRAME_LEN = CFUNCTYPE(POINTER(c_int), c_int, c_int, c_int, c_int)
84
88
  AILIA_SPEECH_USER_API_AILIA_AUDIO_GET_MEL_SPECTROGRAM = CFUNCTYPE((c_int), c_void_p, c_void_p, c_int, c_int, c_int, c_int, c_int, c_int, c_int, c_int, c_float, c_int, c_float, c_float, c_int, c_int, c_int)
@@ -215,6 +219,12 @@ dll.ailiaSpeechOpenVadFileA.argtypes = (c_void_p, c_char_p, c_int32)
215
219
  dll.ailiaSpeechOpenVadFileW.restype = c_int
216
220
  dll.ailiaSpeechOpenVadFileW.argtypes = (c_void_p, c_wchar_p, c_int32)
217
221
 
222
+ dll.ailiaSpeechOpenDiarizationFileA.restype = c_int
223
+ dll.ailiaSpeechOpenDiarizationFileA.argtypes = (c_void_p, c_char_p, c_char_p, c_int32)
224
+
225
+ dll.ailiaSpeechOpenDiarizationFileW.restype = c_int
226
+ dll.ailiaSpeechOpenDiarizationFileW.argtypes = (c_void_p, c_wchar_p, c_wchar_p, c_int32)
227
+
218
228
  dll.ailiaSpeechPushInputData.restype = c_int
219
229
  dll.ailiaSpeechPushInputData.argtypes = (c_void_p, numpy.ctypeslib.ndpointer(
220
230
  dtype=numpy.float32, flags='CONTIGUOUS'
@@ -243,7 +253,7 @@ class AILIASpeechText(ctypes.Structure):
243
253
  ("text", ctypes.c_char_p),
244
254
  ("time_stamp_begin", ctypes.c_float),
245
255
  ("time_stamp_end", ctypes.c_float),
246
- ("person_id", ctypes.c_uint),
256
+ ("speaker_id", ctypes.c_uint),
247
257
  ("language", ctypes.c_char_p),
248
258
  ("confidence", ctypes.c_float)]
249
259
 
@@ -399,7 +409,7 @@ class Whisper(AiliaSpeechModel):
399
409
  intermediate_callback_cnt = intermediate_callback_cnt + 1
400
410
 
401
411
 
402
- def initialize_model(self, model_path = "./", model_type = AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY):
412
+ def initialize_model(self, model_path = "./", model_type = AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY, vad_type = AILIA_SPEECH_VAD_TYPE_SILERO, dialization_type = None):
403
413
  if "time_license" in ailia.get_version():
404
414
  ailia.check_and_download_license()
405
415
  if model_type == AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY:
@@ -433,16 +443,20 @@ class Whisper(AiliaSpeechModel):
433
443
  encoder_pb_path = "encoder_large_v3_weights.pb"
434
444
  decoder_pb_path = "decoder_large_v3_fix_kv_cache_weights.pb"
435
445
  elif model_type == AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO:
436
- encoder_path = "encoder_turbo.onnx"
437
- decoder_path = "decoder_turbo_fix_kv_cache.onnx"
438
- encoder_pb_path = "encoder_turbo_weights.pb"
446
+ encoder_path = "encoder_turbo.opt.onnx"
447
+ decoder_path = "decoder_turbo_fix_kv_cache.opt.onnx"
448
+ encoder_pb_path = "encoder_turbo_weights.opt.pb"
439
449
  decoder_pb_path = None
440
450
  model_type = AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3
441
- self._download_model(model_path, encoder_path, decoder_path, encoder_pb_path, decoder_pb_path)
451
+ self._download_model(model_path, encoder_path, decoder_path, encoder_pb_path, decoder_pb_path, vad_type, dialization_type)
442
452
  self._open_model(model_path + encoder_path, model_path + decoder_path, model_type)
443
- self._open_vad(model_path + "silero_vad.onnx", AILIA_SPEECH_VAD_TYPE_SILERO)
444
-
445
- def _download_model(self, model_path, encoder_path, decoder_path, encoder_pb_path, decoder_pb_path):
453
+ if vad_type is not None:
454
+ self._open_vad(model_path + "silero_vad.onnx", vad_type)
455
+ if dialization_type is not None:
456
+ self._open_diarization(model_path + "segmentation.onnx", model_path + "speaker-embedding.onnx", dialization_type)
457
+
458
+
459
+ def _download_model(self, model_path, encoder_path, decoder_path, encoder_pb_path, decoder_pb_path, vad_type, dialization_type):
446
460
  REMOTE_PATH = "https://storage.googleapis.com/ailia-models/whisper/"
447
461
  os.makedirs(model_path, exist_ok = True)
448
462
  check_and_download_file(model_path + encoder_path, REMOTE_PATH)
@@ -452,8 +466,14 @@ class Whisper(AiliaSpeechModel):
452
466
  if decoder_pb_path is not None:
453
467
  check_and_download_file(model_path + decoder_pb_path, REMOTE_PATH)
454
468
 
455
- REMOTE_PATH = "https://storage.googleapis.com/ailia-models/silero-vad/"
456
- check_and_download_file(model_path + "silero_vad.onnx", REMOTE_PATH)
469
+ if vad_type is not None:
470
+ REMOTE_PATH = "https://storage.googleapis.com/ailia-models/silero-vad/"
471
+ check_and_download_file(model_path + "silero_vad.onnx", REMOTE_PATH)
472
+
473
+ if dialization_type is not None:
474
+ REMOTE_PATH = "https://storage.googleapis.com/ailia-models/pyannote-audio/"
475
+ check_and_download_file(model_path + "segmentation.onnx", REMOTE_PATH)
476
+ check_and_download_file(model_path + "speaker-embedding.onnx", REMOTE_PATH)
457
477
 
458
478
  def _open_model(self, encoder, decoder, model_type):
459
479
  p1 = self._string_buffer_aw(encoder)
@@ -472,6 +492,16 @@ class Whisper(AiliaSpeechModel):
472
492
  else:
473
493
  self._check(dll.ailiaSpeechOpenVadFileA(self._instance, p1, vad_type))
474
494
 
495
+ def _open_diarization(self, segmentation, embedding, diarization_type):
496
+ p1 = self._string_buffer_aw(segmentation)
497
+ p2 = self._string_buffer_aw(embedding)
498
+
499
+ if sys.platform == "win32":
500
+ self._check(dll.ailiaSpeechOpenDiarizationFileW(self._instance, p1, p2, diarization_type))
501
+ else:
502
+ self._check(dll.ailiaSpeechOpenDiarizationFileA(self._instance, p1, p2, diarization_type))
503
+
504
+
475
505
  def set_silent_threshold(self, silent_threshold, speech_sec, no_speech_sec):
476
506
  self._check(dll.ailiaSpeechSetSilentThreshold(self._instance, silent_threshold, speech_sec, no_speech_sec))
477
507
 
@@ -506,7 +536,7 @@ class Whisper(AiliaSpeechModel):
506
536
  for i in range(count.value):
507
537
  text = AILIASpeechText()
508
538
  self._check(dll.ailiaSpeechGetText(self._instance, ctypes.byref(text), AILIA_SPEECH_TEXT_VERSION, i))
509
- yield {"text" : text.text.decode(), "time_stamp_begin" : text.time_stamp_begin, "time_stamp_end" : text.time_stamp_end, "person_id" : text.person_id, "language" : text.language.decode(), "confidence" : text.confidence}
539
+ yield {"text" : text.text.decode(), "time_stamp_begin" : text.time_stamp_begin, "time_stamp_end" : text.time_stamp_end, "speaker_id" : None if text.speaker_id == AILIA_SPEECH_SPEAKER_ID_UNKNOWN else text.speaker_id, "language" : text.language.decode(), "confidence" : text.confidence}
510
540
 
511
541
  self._check(dll.ailiaSpeechResetTranscribeState(self._instance))
512
542
 
@@ -542,7 +572,7 @@ class Whisper(AiliaSpeechModel):
542
572
  for i in range(count.value):
543
573
  text = AILIASpeechText()
544
574
  self._check(dll.ailiaSpeechGetText(self._instance, ctypes.byref(text), AILIA_SPEECH_TEXT_VERSION, i))
545
- yield {"text" : text.text.decode(), "time_stamp_begin" : text.time_stamp_begin, "time_stamp_end" : text.time_stamp_end, "person_id" : text.person_id, "language" : text.language.decode(), "confidence" : text.confidence}
575
+ yield {"text" : text.text.decode(), "time_stamp_begin" : text.time_stamp_begin, "time_stamp_end" : text.time_stamp_end, "speaker_id" : None if text.speaker_id == AILIA_SPEECH_SPEAKER_ID_UNKNOWN else text.speaker_id, "language" : text.language.decode(), "confidence" : text.confidence}
546
576
 
547
577
  if complete:
548
578
  self._check(dll.ailiaSpeechResetTranscribeState(self._instance))
@@ -1,3 +1,25 @@
1
+ Metadata-Version: 2.4
2
+ Name: ailia_speech
3
+ Version: 1.4.0
4
+ Summary: ailia AI Speech
5
+ Home-page: https://ailia.jp/
6
+ Author: ax Inc.
7
+ Author-email: contact@axinc.jp
8
+ License: https://ailia.ai/en/license/
9
+ Requires-Python: >3.6
10
+ Description-Content-Type: text/markdown
11
+ Requires-Dist: ailia
12
+ Requires-Dist: ailia_tokenizer
13
+ Dynamic: author
14
+ Dynamic: author-email
15
+ Dynamic: description
16
+ Dynamic: description-content-type
17
+ Dynamic: home-page
18
+ Dynamic: license
19
+ Dynamic: requires-dist
20
+ Dynamic: requires-python
21
+ Dynamic: summary
22
+
1
23
  # ailia AI Speech Python API
2
24
 
3
25
  !! CAUTION !!
@@ -90,13 +112,21 @@ for i in range(0, audio_waveform.shape[0], sampling_rate):
90
112
  print(text)
91
113
  ```
92
114
 
115
+ ### Diarization mode
116
+
117
+ By specifying dialization_type, speaker diarization can be performed. When speaker diarization is enabled, speaker_id becomes valid.
118
+
119
+ ```
120
+ speech.initialize_model(model_path = "./models/", model_type = ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE_V3_TURBO, dialization_type = ailia_speech.AILIA_SPEECH_DIARIZATION_TYPE_PYANNOTE_AUDIO)
121
+ ```
122
+
93
123
  ### Available model types
94
124
 
95
125
  It is possible to select multiple models according to accuracy and speed. LARGE_V3_TURBO is the most recommended.
96
126
 
97
127
  ```
98
128
  ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_TINY
99
- ilia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_BASE
129
+ ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_BASE
100
130
  ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_SMALL
101
131
  ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_MEDIUM
102
132
  ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_LARGE
@@ -54,7 +54,7 @@ if __name__ == "__main__":
54
54
  setup(
55
55
  name="ailia_speech",
56
56
  scripts=scripts,
57
- version="1.3.2.2",
57
+ version="1.4.0",
58
58
  install_requires=[
59
59
  "ailia",
60
60
  "ailia_tokenizer",
File without changes