ailia-speech 1.3.0.4__tar.gz → 1.3.0.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ailia-speech might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ailia_speech
3
- Version: 1.3.0.4
3
+ Version: 1.3.0.5
4
4
  Summary: ailia AI Speech
5
5
  Home-page: https://ailia.jp/
6
6
  Author: ax Inc.
@@ -49,19 +49,20 @@ import os
49
49
  import urllib.request
50
50
 
51
51
  # Load target audio
52
- ref_file_path = "demo.wav"
53
- if not os.path.exists(ref_file_path):
52
+ input_file_path = "demo.wav"
53
+ if not os.path.exists(input_file_path):
54
54
  urllib.request.urlretrieve(
55
55
  "https://github.com/axinc-ai/ailia-models/raw/refs/heads/master/audio_processing/whisper/demo.wav",
56
56
  "demo.wav"
57
57
  )
58
- audio_waveform, sampling_rate = librosa.load(ref_file_path, mono=True)
58
+ audio_waveform, sampling_rate = librosa.load(input_file_path, mono=True)
59
59
 
60
60
  # Infer
61
61
  speech = ailia_speech.Whisper()
62
62
  speech.initialize_model(model_path = "./models/", model_type = ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_SMALL)
63
63
  recognized_text = speech.transcribe(audio_waveform, sampling_rate)
64
- print(recognized_text)
64
+ for text in recognized_text:
65
+ print(text)
65
66
  ```
66
67
 
67
68
  ## API specification
@@ -36,19 +36,20 @@ import os
36
36
  import urllib.request
37
37
 
38
38
  # Load target audio
39
- ref_file_path = "demo.wav"
40
- if not os.path.exists(ref_file_path):
39
+ input_file_path = "demo.wav"
40
+ if not os.path.exists(input_file_path):
41
41
  urllib.request.urlretrieve(
42
42
  "https://github.com/axinc-ai/ailia-models/raw/refs/heads/master/audio_processing/whisper/demo.wav",
43
43
  "demo.wav"
44
44
  )
45
- audio_waveform, sampling_rate = librosa.load(ref_file_path, mono=True)
45
+ audio_waveform, sampling_rate = librosa.load(input_file_path, mono=True)
46
46
 
47
47
  # Infer
48
48
  speech = ailia_speech.Whisper()
49
49
  speech.initialize_model(model_path = "./models/", model_type = ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_SMALL)
50
50
  recognized_text = speech.transcribe(audio_waveform, sampling_rate)
51
- print(recognized_text)
51
+ for text in recognized_text:
52
+ print(text)
52
53
  ```
53
54
 
54
55
  ## API specification
@@ -475,19 +475,24 @@ class Whisper(AiliaSpeechModel):
475
475
 
476
476
  self._check(dll.ailiaSpeechPushInputData(self._instance, audio_waveform, channels, audio_waveform.shape[0] // channels, sampling_rate))
477
477
  self._check(dll.ailiaSpeechFinalizeInputData(self._instance))
478
- self._check(dll.ailiaSpeechTranscribe(self._instance))
479
478
 
480
- count = ctypes.c_uint(0)
481
- self._check(dll.ailiaSpeechGetTextCount(self._instance, ctypes.byref(count)))
482
- results = []
483
- for i in range(count.value):
484
- text = AILIASpeechText()
485
- self._check(dll.ailiaSpeechGetText(self._instance, ctypes.byref(text), AILIA_SPEECH_TEXT_VERSION, i))
486
- results.append({"text" : text.text.decode(), "time_stamp_begin" : text.time_stamp_begin, "time_stamp_end" : text.time_stamp_end, "person_id" : text.person_id, "language" : text.language.decode(), "confidence" : text.confidence})
479
+ while True:
480
+ complete = ctypes.c_uint(0)
481
+ self._check(dll.ailiaSpeechComplete(self._instance, ctypes.byref(complete)))
482
+ if complete.value == 1:
483
+ break
487
484
 
488
- self._check(dll.ailiaSpeechResetTranscribeState(self._instance))
485
+ self._check(dll.ailiaSpeechTranscribe(self._instance))
486
+
487
+ count = ctypes.c_uint(0)
488
+ self._check(dll.ailiaSpeechGetTextCount(self._instance, ctypes.byref(count)))
489
+ results = []
490
+ for i in range(count.value):
491
+ text = AILIASpeechText()
492
+ self._check(dll.ailiaSpeechGetText(self._instance, ctypes.byref(text), AILIA_SPEECH_TEXT_VERSION, i))
493
+ yield {"text" : text.text.decode(), "time_stamp_begin" : text.time_stamp_begin, "time_stamp_end" : text.time_stamp_end, "person_id" : text.person_id, "language" : text.language.decode(), "confidence" : text.confidence}
489
494
 
490
- return results
495
+ self._check(dll.ailiaSpeechResetTranscribeState(self._instance))
491
496
 
492
497
  def __del__(self):
493
498
  if self._instance:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ailia_speech
3
- Version: 1.3.0.4
3
+ Version: 1.3.0.5
4
4
  Summary: ailia AI Speech
5
5
  Home-page: https://ailia.jp/
6
6
  Author: ax Inc.
@@ -49,19 +49,20 @@ import os
49
49
  import urllib.request
50
50
 
51
51
  # Load target audio
52
- ref_file_path = "demo.wav"
53
- if not os.path.exists(ref_file_path):
52
+ input_file_path = "demo.wav"
53
+ if not os.path.exists(input_file_path):
54
54
  urllib.request.urlretrieve(
55
55
  "https://github.com/axinc-ai/ailia-models/raw/refs/heads/master/audio_processing/whisper/demo.wav",
56
56
  "demo.wav"
57
57
  )
58
- audio_waveform, sampling_rate = librosa.load(ref_file_path, mono=True)
58
+ audio_waveform, sampling_rate = librosa.load(input_file_path, mono=True)
59
59
 
60
60
  # Infer
61
61
  speech = ailia_speech.Whisper()
62
62
  speech.initialize_model(model_path = "./models/", model_type = ailia_speech.AILIA_SPEECH_MODEL_TYPE_WHISPER_MULTILINGUAL_SMALL)
63
63
  recognized_text = speech.transcribe(audio_waveform, sampling_rate)
64
- print(recognized_text)
64
+ for text in recognized_text:
65
+ print(text)
65
66
  ```
66
67
 
67
68
  ## API specification
@@ -54,7 +54,7 @@ if __name__ == "__main__":
54
54
  setup(
55
55
  name="ailia_speech",
56
56
  scripts=scripts,
57
- version="1.3.0.4",
57
+ version="1.3.0.5",
58
58
  install_requires=[
59
59
  "ailia",
60
60
  "ailia_tokenizer",
File without changes