yta-audio-narration 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,278 @@
1
+ """
2
+ You can see anything you need here:
3
+ - https://docs.coqui.ai/en/latest/
4
+
5
+ As this is the first voice generator engine,
6
+ I will explain some things here that are
7
+ important for all the voice narrator engines
8
+ that we are creating.
9
+
10
+ We have options, and we will have all the
11
+ array options fulfilled with, at least, a
12
+ NORMAL and a DEFAULT options. This, even if
13
+ the voice narrator engine doesn't use those
14
+ options, will be handled. Then, when
15
+ generating the voice narration, it will be
16
+ ignored by our system.
17
+
18
+ -- Update 19/04/2025 --
19
+ I've found that they created a fork in
20
+ https://github.com/idiap/coqui-ai-TTS with
21
+ a new version that is maintained, and the
22
+ 'tts' was generating conflicts.
23
+ """
24
+ from yta_audio_narration.consts import DEFAULT_VOICE
25
+ from yta_audio_narration.enums import NarrationLanguage, VoiceEmotion, VoiceSpeed, VoicePitch
26
+ from yta_audio_narration.voice import NarrationVoice
27
+ from yta_constants.enum import YTAEnum as Enum
28
+ from yta_constants.file import FileType
29
+ from yta_programming.output import Output
30
+ from typing import Union
31
+ from TTS.api import TTS
32
+
33
+
34
+ """
35
+ The options below are specified even if we
36
+ don't use them later when processing the
37
+ voice narration. This is to keep the same
38
+ structure for any voice narration and to
39
+ simplify the way we offer the options in
40
+ an API that is able to make requests.
41
+ """
42
+
43
# 1. The voices we accept, as Enums
class CoquiVoiceName(Enum):
    """
    Available voices. The value is what is used
    for the audio creation: it is the Coqui TTS
    model name passed to 'TTS(model_name = ...)'.
    """

    # tts_es_fastpitch_multispeaker.nemo
    # These below are the 2 Spanish models that exist
    # DEFAULT is resolved to SPANISH_MODEL_A in
    # CoquiNarrationVoice.processed_name
    DEFAULT = DEFAULT_VOICE
    SPANISH_MODEL_A = 'tts_models/es/mai/tacotron2-DDC'
    SPANISH_MODEL_B = 'tts_models/es/css10/vits'
    # TODO: There are more voices
56
+
57
# 2. The languages we accept. These lists are the
# values accepted by CoquiNarrationVoice.validate.
LANGUAGE_OPTIONS = [
    NarrationLanguage.DEFAULT,
    NarrationLanguage.SPANISH
]

# 3. The emotions we accept
EMOTION_OPTIONS = [
    VoiceEmotion.DEFAULT,
    VoiceEmotion.NORMAL,
]

# 4. The speeds we accept
SPEED_OPTIONS = [
    VoiceSpeed.DEFAULT,
    VoiceSpeed.NORMAL,
]

# 5. The pitches we accept
PITCH_OPTIONS = [
    VoicePitch.DEFAULT,
    VoicePitch.NORMAL,
]
80
+
81
class CoquiNarrationVoice(NarrationVoice):
    """
    Voice instance to be used when narrating with
    Coqui engine.
    """

    @property
    def processed_name(
        self
    ) -> str:
        """
        Get the usable name value from the one that has
        been set when instantiating the instance.

        The DEFAULT voice is resolved to the
        SPANISH_MODEL_A Coqui model name.
        """
        return (
            CoquiVoiceName.SPANISH_MODEL_A.value
            if CoquiVoiceName.to_enum(self.name) == CoquiVoiceName.DEFAULT else
            CoquiVoiceName.to_enum(self.name).value
        )

    @property
    def processed_emotion(
        self
    ) -> Union[str, None]:
        """
        Get the usable emotion value from the one that
        has been set when instantiating the instance.
        """
        # This narration is not able to handle any
        # emotion (at least by now)
        return None

    @property
    def processed_speed(
        self
    ) -> float:
        """
        Get the usable speed value from the one that
        has been set when instantiating the instance.
        """
        # By now we are not handling the speed with
        # this voice
        return 1.0

    @property
    def processed_pitch(
        self
    ) -> Union[float, None]:
        """
        Get the usable pitch value from the one that
        has been set when instantiating the instance.
        """
        # By now we are not handling the pitch with
        # this voice
        return None

    @property
    def processed_language(
        self
    ) -> str:
        """
        Get the usable language value from the one that
        has been set when instantiating the instance.
        """
        return self.language.value

    def validate(
        self,
        name: str,
        emotion: VoiceEmotion,
        speed: VoiceSpeed,
        pitch: VoicePitch,
        language: NarrationLanguage
    ):
        """
        Validate the given parameters against the options
        this engine accepts, raising an Exception if any
        of them is not valid.
        """
        # This raises if 'name' is not a valid voice name
        CoquiVoiceName.to_enum(name)
        if VoiceEmotion.to_enum(emotion) not in EMOTION_OPTIONS:
            raise Exception(f'The provided {emotion} is not valid for this narration voice.')
        if VoiceSpeed.to_enum(speed) not in SPEED_OPTIONS:
            raise Exception(f'The provided {speed} is not valid for this narration voice.')
        if VoicePitch.to_enum(pitch) not in PITCH_OPTIONS:
            raise Exception(f'The provided {pitch} is not valid for this narration voice.')
        if NarrationLanguage.to_enum(language) not in LANGUAGE_OPTIONS:
            raise Exception(f'The provided {language} is not valid for this narration voice.')

    @staticmethod
    def default():
        """
        Get a CoquiNarrationVoice instance built with
        all the DEFAULT option values.
        """
        return CoquiNarrationVoice(
            name = CoquiVoiceName.DEFAULT.value,
            emotion = VoiceEmotion.DEFAULT,
            speed = VoiceSpeed.DEFAULT,
            pitch = VoicePitch.DEFAULT,
            language = NarrationLanguage.DEFAULT
        )
174
+
175
# The voices but for a specific language, to be able to
# choose one when this is requested from the outside
def get_narrator_names_by_language(
    language: NarrationLanguage
) -> list[str]:
    """
    Get the voices that are available for the
    given 'language'.
    """
    language = NarrationLanguage.to_enum(language)
    # DEFAULT falls back to Spanish, the only language
    # we currently have Coqui models for
    if language is NarrationLanguage.DEFAULT:
        language = NarrationLanguage.SPANISH

    names_per_language = {
        NarrationLanguage.SPANISH: [
            CoquiVoiceName.DEFAULT.value,
            CoquiVoiceName.SPANISH_MODEL_A.value,
            CoquiVoiceName.SPANISH_MODEL_B.value
        ]
    }

    return names_per_language[language]
198
+
199
+
200
# All the remaining functionality we need to make it
# work properly
def narrate(
    text: str,
    voice: Union[CoquiNarrationVoice, None] = None,
    output_filename: Union[str, None] = None
) -> str:
    """
    Generates a narration audio file with the provided 'text' that
    will be stored as 'output_filename' file.

    This method uses a Spanish model so 'text' must be in Spanish.

    This method will take some time to generate the narration.

    If no 'voice' is given, the default CoquiNarrationVoice is
    used. If no 'output_filename' is given, one is auto-generated.

    Returns the filename of the generated audio file.
    """
    # The previous signature used 'CoquiNarrationVoice.default()'
    # as the default value, which is evaluated only once at import
    # time and shared by every call. We build it per call instead.
    voice = (
        CoquiNarrationVoice.default()
        if voice is None else
        voice
    )
    output_filename = Output.get_filename(output_filename, FileType.AUDIO)

    TTS(
        model_name = voice.processed_name
    ).tts_to_file(
        # TODO: Implement 'emotion', 'speed', etc. when known
        # how they work, the accepted values, etc. By now I'm
        # using the properties but with the default values
        text = text,
        speaker = None,
        language = None,
        emotion = voice.processed_emotion,
        speed = voice.processed_speed,
        file_path = output_filename
    )

    return output_filename
238
+
239
def narrate_imitating_voice(
    text: str,
    input_filename: str,
    output_filename: Union[str, None] = None
):
    """
    Narrates the provided 'text' by imitating the provided 'input_filename'
    audio file (that must be a voice narrating something) and saves the
    narration as 'output_filename'.

    The 'input_filename' could be an array of audio filenames.

    Language is set 'es' in code by default.

    This method will take time as it will recreate the voice parameters with
    which the narration will be created after that.

    ANNOTATIONS: This method is only copying the way the narration voice
    talks, but not the own voice. This is not working as expected, as we are
    not cloning voices, we are just imitating the tone. We need another way
    to actually clone the voice as Elevenlabs do.
    """
    # TODO: This is not validating if audio file...
    # Only checks truthiness, not that the file exists or is audio
    if not input_filename:
        raise Exception('No "input_filename" provided.')

    output_filename = Output.get_filename(output_filename, FileType.AUDIO)

    # XTTS v2 multilingual model, downloaded on first use
    tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
    # This below will use the latest XTTS_v2 (needs to download the model)
    #tts = TTS('xtts')

    # TODO: Implement a way of identifying and storing the voices we create to
    # be able to use again them without recreating them twice.

    # input_filename can be an array of wav files
    # generate speech by cloning a voice using default settings
    tts.tts_to_file(text = text, file_path = output_filename, speaker_wav = input_filename, language = 'es')

    return output_filename
@@ -0,0 +1,266 @@
1
+ """
2
+ This link could be interesting:
3
+ - https://github.com/DarinRowe/googletrans
4
+
5
+ You have a lot of information here:
6
+ - https://en.wikipedia.org/wiki/IETF_language_tag
7
+ - https://pypi.org/project/langcodes/
8
+ - https://gtts.readthedocs.io/en/latest/module.html#languages-gtts-lang
9
+ """
10
+ from yta_audio_narration.consts import DEFAULT_VOICE
11
+ from yta_audio_narration.enums import NarrationLanguage, VoiceEmotion, VoiceSpeed, VoicePitch
12
+ from yta_audio_narration.voice import NarrationVoice
13
+ from yta_constants.enum import YTAEnum as Enum
14
+ from yta_constants.file import FileType
15
+ from yta_programming.output import Output
16
+ from typing import Union
17
+ from gtts import gTTS
18
+
19
+
20
+ """
21
+ This specific voice narration engine needs
22
+ specific values and a different parameter
23
+ handling.
24
+ """
25
+
26
class GoogleNarrationLanguage(Enum):
    """
    The google narration languages accepted by their
    API. The value is the gTTS 'lang' code.
    """

    SPANISH = 'es'
    ENGLISH = 'en'

    @staticmethod
    def from_general_language(
        language: NarrationLanguage
    ) -> 'GoogleNarrationLanguage':
        """
        Turn a general 'language' instance into a Google
        narration language instance.

        The DEFAULT general language maps to Spanish.
        """
        return {
            NarrationLanguage.DEFAULT: GoogleNarrationLanguage.SPANISH,
            NarrationLanguage.SPANISH: GoogleNarrationLanguage.SPANISH,
            NarrationLanguage.ENGLISH: GoogleNarrationLanguage.ENGLISH,
        }[NarrationLanguage.to_enum(language)]
48
+
49
+ """
50
+ The options below are specified even if we
51
+ don't use them later when processing the
52
+ voice narration. This is to keep the same
53
+ structure for any voice narration and to
54
+ simplify the way we offer the options in
55
+ an API that is able to make requests.
56
+
57
+ If the engine doesn't have one specific
58
+ option (I mean, you cannot handle the
59
+ narration speed, for example) we will allow
60
+ the user choose 'normal' value and it will
61
+ be handled just by ignoring it, but the user
62
+ will be able to choose it.
63
+ """
64
+
65
# 1. The voices we accept, as Enums
class GoogleTld(Enum):
    """
    Google top-level domains accepted as 'voices': the
    TLD determines the regional accent used by gTTS.
    """

    DEFAULT = DEFAULT_VOICE
    SPANISH_SPAIN = 'es'
    SPANISH_MEXICO = 'com.mx'
    SPANISH_US = 'us'
    # TODO: How can I get the list of TLDs? I need it

    @staticmethod
    def from_google_language(
        language: GoogleNarrationLanguage
    ) -> 'GoogleTld':
        """
        Turn the Google narration 'language' into the
        corresponding Google TLD.
        """
        return {
            GoogleNarrationLanguage.SPANISH: GoogleTld.SPANISH_SPAIN,
            # TODO: Change this
            GoogleNarrationLanguage.ENGLISH: GoogleTld.SPANISH_US,
        }[GoogleNarrationLanguage.to_enum(language)]
87
+
88
# 2. The languages we accept. These lists are the values
# accepted by GoogleNarrationVoice.validate_and_process.
LANGUAGE_OPTIONS = [
    NarrationLanguage.SPANISH,
    # TODO: Unavailable until I detect some valid TLDs
    #NarrationLanguage.ENGLISH,
    NarrationLanguage.DEFAULT
]

# 3. The emotions we accept
EMOTION_OPTIONS = [
    VoiceEmotion.DEFAULT,
    VoiceEmotion.NORMAL,
]

# 4. The speeds we accept
# SLOW is the only speed gTTS can actually apply
SPEED_OPTIONS = [
    VoiceSpeed.DEFAULT,
    VoiceSpeed.NORMAL,
    VoiceSpeed.SLOW,
]

# 5. The pitches we accept
PITCH_OPTIONS = [
    VoicePitch.DEFAULT,
    VoicePitch.NORMAL,
]
115
+
116
class GoogleNarrationVoice(NarrationVoice):
    """
    Voice instance to be used when narrating with
    Google engine.
    """

    @property
    def processed_name(
        self
    ) -> str:
        """
        Get the usable name value from the one that has
        been set when instantiating the instance.

        The value returned is a Google TLD ('es',
        'com.mx', ...); DEFAULT resolves to the
        SPANISH_SPAIN TLD.
        """
        # TODO: Maybe this DEFAULT value has to exist
        # for each language so it chooses one voice name
        # for that language
        return (
            GoogleTld.SPANISH_SPAIN.value
            if GoogleTld.to_enum(self.name) == GoogleTld.DEFAULT else
            GoogleTld.to_enum(self.name).value
        )

    @property
    def processed_emotion(
        self
    ) -> Union[str, None]:
        """
        Get the usable emotion value from the one that
        has been set when instantiating the instance.
        """
        # This narration is not able to handle any
        # emotion (at least by now)
        return None

    @property
    def processed_speed(
        self
    ) -> bool:
        """
        Get the usable speed value from the one that
        has been set when instantiating the instance.
        """
        # This value is actually saying if we are using
        # the slow mode or not
        # NOTE(review): raises KeyError if self.speed is
        # not one of these three enum members — confirm
        # upstream validation guarantees that
        return {
            VoiceSpeed.SLOW: True,
            VoiceSpeed.DEFAULT: False,
            VoiceSpeed.NORMAL: False
        }[self.speed]

    @property
    def processed_pitch(
        self
    ) -> Union[float, None]:
        """
        Get the usable pitch value from the one that
        has been set when instantiating the instance.
        """
        # By now we are not handling the pitch with
        # this voice
        return None

    @property
    def processed_language(
        self
    ) -> str:
        """
        Get the usable language value from the one that
        has been set when instantiating the instance.
        """
        return GoogleNarrationLanguage.from_general_language(self.language).value

    def validate_and_process(
        self,
        name: str,
        emotion: VoiceEmotion,
        speed: VoiceSpeed,
        pitch: VoicePitch,
        language: NarrationLanguage
    ):
        """
        Validate the given parameters against the options
        this engine accepts, raising an Exception if any
        of them is not valid.

        NOTE(review): the Coqui engine names this hook
        'validate' — confirm which name the NarrationVoice
        base class expects.
        """
        # This raises if 'name' is not a valid TLD
        GoogleTld.to_enum(name)
        if VoiceEmotion.to_enum(emotion) not in EMOTION_OPTIONS:
            raise Exception(f'The provided {emotion} is not valid for this narration voice.')
        if VoiceSpeed.to_enum(speed) not in SPEED_OPTIONS:
            raise Exception(f'The provided {speed} is not valid for this narration voice.')
        if VoicePitch.to_enum(pitch) not in PITCH_OPTIONS:
            raise Exception(f'The provided {pitch} is not valid for this narration voice.')
        if NarrationLanguage.to_enum(language) not in LANGUAGE_OPTIONS:
            raise Exception(f'The provided {language} is not valid for this narration voice.')

    @staticmethod
    def default():
        """
        Get a GoogleNarrationVoice instance built with
        all the DEFAULT option values.
        """
        return GoogleNarrationVoice(
            name = GoogleTld.DEFAULT.value,
            emotion = VoiceEmotion.DEFAULT,
            speed = VoiceSpeed.DEFAULT,
            pitch = VoicePitch.DEFAULT,
            language = NarrationLanguage.DEFAULT
        )
        # TODO: This was in the previous version, remove when
        # confirmed that the above is working
        # return GoogleNarrationVoice('', '', 130, 1.0, NarrationLanguage.DEFAULT)
219
+
220
# The voices but for a specific language, to be able to
# choose one when this is requested from the outside
def get_narrator_names_by_language(
    language: NarrationLanguage
) -> list[str]:
    """
    Get the voices that are available for the
    given 'language'.
    """
    language = NarrationLanguage.to_enum(language)
    # DEFAULT falls back to Spanish, the only language
    # we currently expose TLDs for
    if language is NarrationLanguage.DEFAULT:
        language = NarrationLanguage.SPANISH

    names_per_language = {
        NarrationLanguage.SPANISH: [
            GoogleTld.DEFAULT.value,
            GoogleTld.SPANISH_SPAIN.value,
            GoogleTld.SPANISH_MEXICO.value,
            GoogleTld.SPANISH_US.value
        ]
    }

    return names_per_language[language]
240
+
241
+
242
# All the remaining functionality we need to make it
# work properly
def narrate(
    text: str,
    voice: Union[GoogleNarrationVoice, None] = None,
    output_filename: Union[str, None] = None
):
    """
    Creates an audio narration of the provided 'text' with the Google voice and stores it
    as 'output_filename'. This will use the provided 'language' language for the narration.

    If no 'voice' is given, the default GoogleNarrationVoice is
    used. If no 'output_filename' is given, one is auto-generated.

    Returns the filename of the generated audio file.
    """
    # The previous signature used 'GoogleNarrationVoice.default()'
    # as the default value, which is evaluated only once at import
    # time and shared by every call. We build it per call instead.
    voice = (
        GoogleNarrationVoice.default()
        if voice is None else
        voice
    )
    output_filename = Output.get_filename(output_filename, FileType.AUDIO)

    gTTS(
        text = text,
        lang = voice.processed_language,
        # Use the TLD that comes from the chosen voice name so that
        # picking e.g. SPANISH_MEXICO is actually honored. The
        # previous code ignored the name and always re-derived the
        # TLD from the language.
        tld = voice.processed_name,
        slow = voice.processed_speed
    ).save(
        output_filename
    )

    return output_filename