yta-audio-narration 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,278 @@
1
+ """
2
+ You can see anything you need here:
3
+ - https://docs.coqui.ai/en/latest/
4
+
5
+ As this is the first voice generator engine,
6
+ I will explain some things here that are
7
+ important for all the voice narrator engines
8
+ that we are creating.
9
+
10
+ We have options, and we will have all the
11
+ array options fulfilled with, at least, a
12
+ NORMAL and a DEFAULT options. This, even if
13
+ the voice narrator engine doesn't use those
14
+ options, will be handled. Then, when
15
+ generating the voice narration, it will be
16
+ ignored by our system.
17
+
18
+ -- Update 19/04/2025 --
19
+ I've found that they created a fork in
20
+ https://github.com/idiap/coqui-ai-TTS with
21
+ a new version that is maintained, and the
22
+ 'tts' was generating conflicts.
23
+ """
24
+ from yta_audio_narration.consts import DEFAULT_VOICE
25
+ from yta_audio_narration.enums import NarrationLanguage, VoiceEmotion, VoiceSpeed, VoicePitch
26
+ from yta_audio_narration.voice import NarrationVoice
27
+ from yta_constants.enum import YTAEnum as Enum
28
+ from yta_constants.file import FileType
29
+ from yta_programming.output import Output
30
+ from typing import Union
31
+ from TTS.api import TTS
32
+
33
+
34
+ """
35
+ The options below are specified even if we
36
+ don't use them later when processing the
37
+ voice narration. This is to keep the same
38
+ structure for any voice narration and to
39
+ simplify the way we offer the options in
40
+ an API that is able to make requests.
41
+ """
42
+
43
# 1. The voices we accept, as Enums
class CoquiVoiceName(Enum):
    """
    Available voices. The value is what is used
    for the audio creation: it is the Coqui TTS
    model name passed to 'TTS(model_name = ...)'.
    """

    # tts_es_fastpitch_multispeaker.nemo
    # These below are the 2 Spanish models that exist
    # DEFAULT is resolved to SPANISH_MODEL_A in
    # CoquiNarrationVoice.processed_name
    DEFAULT = DEFAULT_VOICE
    SPANISH_MODEL_A = 'tts_models/es/mai/tacotron2-DDC'
    SPANISH_MODEL_B = 'tts_models/es/css10/vits'
    # TODO: There are more voices
56
+
57
# 2. The languages we accept. These lists are the
# values accepted by CoquiNarrationVoice.validate.
LANGUAGE_OPTIONS = [
    NarrationLanguage.DEFAULT,
    NarrationLanguage.SPANISH
]

# 3. The emotions we accept
EMOTION_OPTIONS = [
    VoiceEmotion.DEFAULT,
    VoiceEmotion.NORMAL,
]

# 4. The speeds we accept
SPEED_OPTIONS = [
    VoiceSpeed.DEFAULT,
    VoiceSpeed.NORMAL,
]

# 5. The pitches we accept
PITCH_OPTIONS = [
    VoicePitch.DEFAULT,
    VoicePitch.NORMAL,
]
80
+
81
class CoquiNarrationVoice(NarrationVoice):
    """
    Voice instance to be used when narrating with
    Coqui engine.
    """

    @property
    def processed_name(
        self
    ) -> str:
        """
        Get the usable name value from the one that has
        been set when instantiating the instance.

        The DEFAULT voice is resolved to the
        SPANISH_MODEL_A Coqui model name.
        """
        return (
            CoquiVoiceName.SPANISH_MODEL_A.value
            if CoquiVoiceName.to_enum(self.name) == CoquiVoiceName.DEFAULT else
            CoquiVoiceName.to_enum(self.name).value
        )

    @property
    def processed_emotion(
        self
    ) -> Union[str, None]:
        """
        Get the usable emotion value from the one that
        has been set when instantiating the instance.
        """
        # This narration is not able to handle any
        # emotion (at least by now)
        return None

    @property
    def processed_speed(
        self
    ) -> float:
        """
        Get the usable speed value from the one that
        has been set when instantiating the instance.
        """
        # By now we are not handling the speed with
        # this voice
        return 1.0

    @property
    def processed_pitch(
        self
    ) -> Union[float, None]:
        """
        Get the usable pitch value from the one that
        has been set when instantiating the instance.
        """
        # By now we are not handling the pitch with
        # this voice
        return None

    @property
    def processed_language(
        self
    ) -> str:
        """
        Get the usable language value from the one that
        has been set when instantiating the instance.
        """
        return self.language.value

    def validate(
        self,
        name: str,
        emotion: VoiceEmotion,
        speed: VoiceSpeed,
        pitch: VoicePitch,
        language: NarrationLanguage
    ):
        """
        Validate the given parameters against the options
        this engine accepts, raising an Exception if any
        of them is not valid.
        """
        # This raises if 'name' is not a valid voice name
        CoquiVoiceName.to_enum(name)
        if VoiceEmotion.to_enum(emotion) not in EMOTION_OPTIONS:
            raise Exception(f'The provided {emotion} is not valid for this narration voice.')
        if VoiceSpeed.to_enum(speed) not in SPEED_OPTIONS:
            raise Exception(f'The provided {speed} is not valid for this narration voice.')
        if VoicePitch.to_enum(pitch) not in PITCH_OPTIONS:
            raise Exception(f'The provided {pitch} is not valid for this narration voice.')
        if NarrationLanguage.to_enum(language) not in LANGUAGE_OPTIONS:
            raise Exception(f'The provided {language} is not valid for this narration voice.')

    @staticmethod
    def default():
        """
        Get a CoquiNarrationVoice instance built with
        all the DEFAULT option values.
        """
        return CoquiNarrationVoice(
            name = CoquiVoiceName.DEFAULT.value,
            emotion = VoiceEmotion.DEFAULT,
            speed = VoiceSpeed.DEFAULT,
            pitch = VoicePitch.DEFAULT,
            language = NarrationLanguage.DEFAULT
        )
174
+
175
# The voices but for a specific language, to be able to
# choose one when this is requested from the outside
def get_narrator_names_by_language(
    language: NarrationLanguage
) -> list[str]:
    """
    Get the voices that are available for the
    given 'language'.
    """
    language = NarrationLanguage.to_enum(language)
    # DEFAULT falls back to Spanish, the only language
    # we currently have Coqui models for
    if language is NarrationLanguage.DEFAULT:
        language = NarrationLanguage.SPANISH

    names_per_language = {
        NarrationLanguage.SPANISH: [
            CoquiVoiceName.DEFAULT.value,
            CoquiVoiceName.SPANISH_MODEL_A.value,
            CoquiVoiceName.SPANISH_MODEL_B.value
        ]
    }

    return names_per_language[language]
198
+
199
+
200
# All the remaining functionality we need to make it
# work properly
def narrate(
    text: str,
    voice: Union[CoquiNarrationVoice, None] = None,
    output_filename: Union[str, None] = None
) -> str:
    """
    Generates a narration audio file with the provided 'text' that
    will be stored as 'output_filename' file.

    This method uses a Spanish model so 'text' must be in Spanish.

    This method will take some time to generate the narration.

    If no 'voice' is given, the default CoquiNarrationVoice is
    used. If no 'output_filename' is given, one is auto-generated.

    Returns the filename of the generated audio file.
    """
    # The previous signature used 'CoquiNarrationVoice.default()'
    # as the default value, which is evaluated only once at import
    # time and shared by every call. We build it per call instead.
    voice = (
        CoquiNarrationVoice.default()
        if voice is None else
        voice
    )
    output_filename = Output.get_filename(output_filename, FileType.AUDIO)

    TTS(
        model_name = voice.processed_name
    ).tts_to_file(
        # TODO: Implement 'emotion', 'speed', etc. when known
        # how they work, the accepted values, etc. By now I'm
        # using the properties but with the default values
        text = text,
        speaker = None,
        language = None,
        emotion = voice.processed_emotion,
        speed = voice.processed_speed,
        file_path = output_filename
    )

    return output_filename
238
+
239
def narrate_imitating_voice(
    text: str,
    input_filename: str,
    output_filename: Union[str, None] = None
):
    """
    Narrates the provided 'text' by imitating the provided 'input_filename'
    audio file (that must be a voice narrating something) and saves the
    narration as 'output_filename'.

    The 'input_filename' could be an array of audio filenames.

    Language is set 'es' in code by default.

    This method will take time as it will recreate the voice parameters with
    which the narration will be created after that.

    ANNOTATIONS: This method is only copying the way the narration voice
    talks, but not the own voice. This is not working as expected, as we are
    not cloning voices, we are just imitating the tone. We need another way
    to actually clone the voice as Elevenlabs do.
    """
    # TODO: This is not validating if audio file...
    # Only checks truthiness, not that the file exists or is audio
    if not input_filename:
        raise Exception('No "input_filename" provided.')

    output_filename = Output.get_filename(output_filename, FileType.AUDIO)

    # XTTS v2 multilingual model, downloaded on first use
    tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
    # This below will use the latest XTTS_v2 (needs to download the model)
    #tts = TTS('xtts')

    # TODO: Implement a way of identifying and storing the voices we create to
    # be able to use again them without recreating them twice.

    # input_filename can be an array of wav files
    # generate speech by cloning a voice using default settings
    tts.tts_to_file(text = text, file_path = output_filename, speaker_wav = input_filename, language = 'es')

    return output_filename
@@ -0,0 +1,266 @@
1
+ """
2
+ This link could be interesting:
3
+ - https://github.com/DarinRowe/googletrans
4
+
5
+ You have a lot of information here:
6
+ - https://en.wikipedia.org/wiki/IETF_language_tag
7
+ - https://pypi.org/project/langcodes/
8
+ - https://gtts.readthedocs.io/en/latest/module.html#languages-gtts-lang
9
+ """
10
+ from yta_audio_narration.consts import DEFAULT_VOICE
11
+ from yta_audio_narration.enums import NarrationLanguage, VoiceEmotion, VoiceSpeed, VoicePitch
12
+ from yta_audio_narration.voice import NarrationVoice
13
+ from yta_constants.enum import YTAEnum as Enum
14
+ from yta_constants.file import FileType
15
+ from yta_programming.output import Output
16
+ from typing import Union
17
+ from gtts import gTTS
18
+
19
+
20
+ """
21
+ This specific voice narration engine needs
22
+ specific values and a different parameter
23
+ handling.
24
+ """
25
+
26
class GoogleNarrationLanguage(Enum):
    """
    The google narration languages accepted by their
    API. The value is the gTTS 'lang' code.
    """

    SPANISH = 'es'
    ENGLISH = 'en'

    @staticmethod
    def from_general_language(
        language: NarrationLanguage
    ) -> 'GoogleNarrationLanguage':
        """
        Turn a general 'language' instance into a Google
        narration language instance.

        The DEFAULT general language maps to Spanish.
        """
        return {
            NarrationLanguage.DEFAULT: GoogleNarrationLanguage.SPANISH,
            NarrationLanguage.SPANISH: GoogleNarrationLanguage.SPANISH,
            NarrationLanguage.ENGLISH: GoogleNarrationLanguage.ENGLISH,
        }[NarrationLanguage.to_enum(language)]
48
+
49
+ """
50
+ The options below are specified even if we
51
+ don't use them later when processing the
52
+ voice narration. This is to keep the same
53
+ structure for any voice narration and to
54
+ simplify the way we offer the options in
55
+ an API that is able to make requests.
56
+
57
+ If the engine doesn't have one specific
58
+ option (I mean, you cannot handle the
59
+ narration speed, for example) we will allow
60
+ the user choose 'normal' value and it will
61
+ be handled just by ignoring it, but the user
62
+ will be able to choose it.
63
+ """
64
+
65
# 1. The voices we accept, as Enums
class GoogleTld(Enum):
    """
    Google top-level domains accepted as 'voices': the
    TLD determines the regional accent used by gTTS.
    """

    DEFAULT = DEFAULT_VOICE
    SPANISH_SPAIN = 'es'
    SPANISH_MEXICO = 'com.mx'
    SPANISH_US = 'us'
    # TODO: How can I get the list of TLDs? I need it

    @staticmethod
    def from_google_language(
        language: GoogleNarrationLanguage
    ) -> 'GoogleTld':
        """
        Turn the Google narration 'language' into the
        corresponding Google TLD.
        """
        return {
            GoogleNarrationLanguage.SPANISH: GoogleTld.SPANISH_SPAIN,
            # TODO: Change this
            GoogleNarrationLanguage.ENGLISH: GoogleTld.SPANISH_US,
        }[GoogleNarrationLanguage.to_enum(language)]
87
+
88
# 2. The languages we accept. These lists are the values
# accepted by GoogleNarrationVoice.validate_and_process.
LANGUAGE_OPTIONS = [
    NarrationLanguage.SPANISH,
    # TODO: Unavailable until I detect some valid TLDs
    #NarrationLanguage.ENGLISH,
    NarrationLanguage.DEFAULT
]

# 3. The emotions we accept
EMOTION_OPTIONS = [
    VoiceEmotion.DEFAULT,
    VoiceEmotion.NORMAL,
]

# 4. The speeds we accept
# SLOW is the only speed gTTS can actually apply
SPEED_OPTIONS = [
    VoiceSpeed.DEFAULT,
    VoiceSpeed.NORMAL,
    VoiceSpeed.SLOW,
]

# 5. The pitches we accept
PITCH_OPTIONS = [
    VoicePitch.DEFAULT,
    VoicePitch.NORMAL,
]
115
+
116
class GoogleNarrationVoice(NarrationVoice):
    """
    Voice instance to be used when narrating with
    Google engine.
    """

    @property
    def processed_name(
        self
    ) -> str:
        """
        Get the usable name value from the one that has
        been set when instantiating the instance.

        The value returned is a Google TLD ('es',
        'com.mx', ...); DEFAULT resolves to the
        SPANISH_SPAIN TLD.
        """
        # TODO: Maybe this DEFAULT value has to exist
        # for each language so it chooses one voice name
        # for that language
        return (
            GoogleTld.SPANISH_SPAIN.value
            if GoogleTld.to_enum(self.name) == GoogleTld.DEFAULT else
            GoogleTld.to_enum(self.name).value
        )

    @property
    def processed_emotion(
        self
    ) -> Union[str, None]:
        """
        Get the usable emotion value from the one that
        has been set when instantiating the instance.
        """
        # This narration is not able to handle any
        # emotion (at least by now)
        return None

    @property
    def processed_speed(
        self
    ) -> bool:
        """
        Get the usable speed value from the one that
        has been set when instantiating the instance.
        """
        # This value is actually saying if we are using
        # the slow mode or not
        # NOTE(review): raises KeyError if self.speed is
        # not one of these three enum members — confirm
        # upstream validation guarantees that
        return {
            VoiceSpeed.SLOW: True,
            VoiceSpeed.DEFAULT: False,
            VoiceSpeed.NORMAL: False
        }[self.speed]

    @property
    def processed_pitch(
        self
    ) -> Union[float, None]:
        """
        Get the usable pitch value from the one that
        has been set when instantiating the instance.
        """
        # By now we are not handling the pitch with
        # this voice
        return None

    @property
    def processed_language(
        self
    ) -> str:
        """
        Get the usable language value from the one that
        has been set when instantiating the instance.
        """
        return GoogleNarrationLanguage.from_general_language(self.language).value

    def validate_and_process(
        self,
        name: str,
        emotion: VoiceEmotion,
        speed: VoiceSpeed,
        pitch: VoicePitch,
        language: NarrationLanguage
    ):
        """
        Validate the given parameters against the options
        this engine accepts, raising an Exception if any
        of them is not valid.

        NOTE(review): the Coqui engine names this hook
        'validate' — confirm which name the NarrationVoice
        base class expects.
        """
        # This raises if 'name' is not a valid TLD
        GoogleTld.to_enum(name)
        if VoiceEmotion.to_enum(emotion) not in EMOTION_OPTIONS:
            raise Exception(f'The provided {emotion} is not valid for this narration voice.')
        if VoiceSpeed.to_enum(speed) not in SPEED_OPTIONS:
            raise Exception(f'The provided {speed} is not valid for this narration voice.')
        if VoicePitch.to_enum(pitch) not in PITCH_OPTIONS:
            raise Exception(f'The provided {pitch} is not valid for this narration voice.')
        if NarrationLanguage.to_enum(language) not in LANGUAGE_OPTIONS:
            raise Exception(f'The provided {language} is not valid for this narration voice.')

    @staticmethod
    def default():
        """
        Get a GoogleNarrationVoice instance built with
        all the DEFAULT option values.
        """
        return GoogleNarrationVoice(
            name = GoogleTld.DEFAULT.value,
            emotion = VoiceEmotion.DEFAULT,
            speed = VoiceSpeed.DEFAULT,
            pitch = VoicePitch.DEFAULT,
            language = NarrationLanguage.DEFAULT
        )
        # TODO: This was in the previous version, remove when
        # confirmed that the above is working
        # return GoogleNarrationVoice('', '', 130, 1.0, NarrationLanguage.DEFAULT)
219
+
220
# The voices but for a specific language, to be able to
# choose one when this is requested from the outside
def get_narrator_names_by_language(
    language: NarrationLanguage
) -> list[str]:
    """
    Get the voices that are available for the
    given 'language'.
    """
    language = NarrationLanguage.to_enum(language)
    # DEFAULT falls back to Spanish, the only language
    # we currently expose TLDs for
    if language is NarrationLanguage.DEFAULT:
        language = NarrationLanguage.SPANISH

    names_per_language = {
        NarrationLanguage.SPANISH: [
            GoogleTld.DEFAULT.value,
            GoogleTld.SPANISH_SPAIN.value,
            GoogleTld.SPANISH_MEXICO.value,
            GoogleTld.SPANISH_US.value
        ]
    }

    return names_per_language[language]
240
+
241
+
242
# All the remaining functionality we need to make it
# work properly
def narrate(
    text: str,
    voice: Union[GoogleNarrationVoice, None] = None,
    output_filename: Union[str, None] = None
):
    """
    Creates an audio narration of the provided 'text' with the Google voice and stores it
    as 'output_filename'. This will use the provided 'language' language for the narration.

    If no 'voice' is given, the default GoogleNarrationVoice is
    used. If no 'output_filename' is given, one is auto-generated.

    Returns the filename of the generated audio file.
    """
    # The previous signature used 'GoogleNarrationVoice.default()'
    # as the default value, which is evaluated only once at import
    # time and shared by every call. We build it per call instead.
    voice = (
        GoogleNarrationVoice.default()
        if voice is None else
        voice
    )
    output_filename = Output.get_filename(output_filename, FileType.AUDIO)

    gTTS(
        text = text,
        lang = voice.processed_language,
        # Use the TLD that comes from the chosen voice name so that
        # picking e.g. SPANISH_MEXICO is actually honored. The
        # previous code ignored the name and always re-derived the
        # TLD from the language.
        tld = voice.processed_name,
        slow = voice.processed_speed
    ).save(
        output_filename
    )

    return output_filename